# Keras Pipeline 3-Folds
- Uses input features that include the `fc` meta-feature produced by the tabular models.

In [1]:
import logging
import sys
def get_logger():
    """
    Configure logging to stdout and return the notebook's "main" logger.

    credits to: https://www.kaggle.com/ogrellier/user-level-lightgbm-lb-1-4480

    Returns
    -------
    logging.Logger
        The logger named "main", with its own level set to DEBUG.
    """
    # Root logger: INFO level, timestamped records written to stdout so they
    # show up in the cell output.
    logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                        level=logging.INFO, stream=sys.stdout)
    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)
    return logger


logger = get_logger()

In [2]:
import pandas as pd
import os
import time
import logging
import sys
from tqdm import tqdm
#os.environ["CUDA_VISIBLE_DEVICES"]="0"
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tensorflow as tf
from keras.layers import Dense, Input, Activation
from keras.layers import BatchNormalization,Add,Dropout
from keras.optimizers import Adam
from keras.models import Model, load_model
from keras import callbacks
from keras import backend as K
from keras.layers.advanced_activations import LeakyReLU
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)
import os
import gc

from datetime import datetime
from sklearn.metrics import mean_absolute_error

Using TensorFlow backend.


In [3]:
# Raw input tables, read once and used as module-level globals by later cells.
# The three df_* frames are the per-atom lookup tables that create_datasets
# joins onto each atom pair via map_atom_info (keyed on molecule_name / atom_index).
df_struct=pd.read_csv('../input/structures.csv')
df_train_sub_charge=pd.read_csv('../input/mulliken_charges.csv')
df_train_sub_tensor=pd.read_csv('../input/magnetic_shielding_tensors.csv')
# Training rows; supplies the scalar_coupling_constant target and the atom
# identifiers in create_datasets, and the base of the out-of-fold frame.
train = pd.read_csv('../input/train.csv')

In [4]:
### CONFIGURABLES #######
# Coupling type handled by this notebook; used to build the ../type_results/
# paths and to filter train/test rows by their `type` column.
bond_type = '3JHH'
# Identifier of this model run; prefixes the checkpoint file name and the
# saved oof/sub result file names.
MODEL_NUMBER = 'K004'
#########################

In [5]:
def plot_history(history, label):
    """
    Draw the training and validation loss curves for one fitted model.

    Only the last 100 recorded epochs (or fewer, if training was shorter) are
    plotted.

    Parameters
    ----------
    history : object with a ``history`` dict containing 'loss' and 'val_loss'
        The object returned by ``model.fit``.
    label : str
        Text appended to the figure title.
    """
    last_epochs = slice(-100, None)
    plt.figure(figsize=(15,5))
    for curve_key in ('loss', 'val_loss'):
        plt.plot(history.history[curve_key][last_epochs])
    plt.title('Loss for %s' % label)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train','Validation'], loc='upper left')
    plt.show()

In [6]:
# Set up GPU preferences
# NOTE: tf.ConfigProto / tf.Session are the TensorFlow 1.x session API.
# device_count caps the number of GPU devices the session may use at 2.
config = tf.ConfigProto( device_count = {'GPU': 2} ) 
# Grow GPU memory allocation on demand rather than reserving it all up front ...
config.gpu_options.allow_growth = True
# ... and limit this process to a 0.8 fraction of each GPU's memory.
config.gpu_options.per_process_gpu_memory_fraction = 0.8
sess = tf.Session(config=config) 
# Make Keras run on the session configured above.
K.set_session(sess)

In [7]:
def map_atom_info(df_1,df_2, atom_idx):
    """
    Left-join per-atom columns from ``df_2`` onto the pair table ``df_1``.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Must contain 'molecule_name' and ``f'atom_index_{atom_idx}'``.
    df_2 : pandas.DataFrame
        Per-atom table keyed by 'molecule_name' and 'atom_index'. If a key
        occurs more than once only its first row is used.
    atom_idx : int
        Which atom of the pair (0 or 1) to look up.

    Returns
    -------
    pandas.DataFrame
        ``df_1`` rows (same order, same count) with the extra ``df_2`` columns
        appended; the helper 'atom_index' join column is removed. Rows without
        a match get NaN in the appended columns.
    """
    per_atom = df_2.drop_duplicates(subset=['molecule_name', 'atom_index'])
    merged = df_1.merge(
        per_atom,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )
    return merged.drop(columns='atom_index')

def create_datasets(bond_type, X_train_file, fold):
    """
    Build scaled model inputs and multi-task targets for one fold.

    Reads the fold's train / valid / test meta-feature parquet files from
    ``../type_results/{bond_type}/meta/``, attaches the coupling constant and
    atom identifiers from the global ``train`` frame, joins per-atom structure,
    Mulliken charge and shielding-tensor values (globals ``df_struct``,
    ``df_train_sub_charge``, ``df_train_sub_tensor``), mean-imputes and
    standard-scales everything, then splits the rows back apart.

    Parameters
    ----------
    bond_type : str
        Coupling type; selects both the meta directory and the rows of
        ``train`` that supply the targets.
    X_train_file : str
        File name of the fold's X_train parquet. The X_valid / X_test file
        names are derived by replacing 'X_train' in this name.
    fold : int
        Not used inside this function; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(train_input, cv_input, train_target, cv_target,
        train_target_1, cv_target_1, train_target_2, cv_target_2,
        train_target_3, cv_target_3, test_input, valid_index)`` where

        * ``*_input`` are scaled feature arrays,
        * ``train_target`` / ``cv_target`` are the unscaled coupling constants,
        * ``*_target_1`` are the scaled charges of both atoms (times 2),
        * ``*_target_2`` are the scaled XX/YY/ZZ tensor terms (times 4),
        * ``*_target_3`` are the scaled remaining tensor terms (times 1),
        * ``valid_index`` is the index of the validation rows in the combined
          frame, which the caller uses to place out-of-fold predictions.
    """
    logger.info('Creating Datasets')
    # Read input for this fold
    meta_dir = f'../type_results/{bond_type}/meta'
    X_test_file = X_train_file.replace('X_train','X_test')
    X_valid_file = X_train_file.replace('X_train','X_valid')
    X_train = pd.read_parquet(f'{meta_dir}/{X_train_file}')
    X_test = pd.read_parquet(f'{meta_dir}/{X_test_file}')
    X_valid = pd.read_parquet(f'{meta_dir}/{X_valid_file}')
    X_train['split'] = 'TRAIN'
    X_test['split'] = 'TEST'
    X_valid['split'] = 'VALID'

    logger.info('Adding target to dataset')
    # Add target to train and val.
    # NOTE(review): values are attached positionally, so this assumes the
    # index-sorted train+valid rows line up one-to-one with the rows of
    # `train` for this type - confirm against how the meta files were written.
    X_tr_val = pd.concat([X_train, X_valid]).sort_index()
    # Filter by the requested type once (previously hard-coded to '3JHH',
    # which silently ignored the `bond_type` argument).
    train_of_type = train.loc[train['type'] == bond_type]
    for col in ['scalar_coupling_constant', 'molecule_name',
                'atom_index_0', 'atom_index_1']:
        X_tr_val[col] = train_of_type[col].tolist()

    # Combine all
    X_all = pd.concat([X_tr_val, X_test])
    logger.info('Adding custom target features')
    tensor_components = ['XX', 'YX', 'ZX', 'XY', 'YY', 'ZY', 'XZ', 'YZ', 'ZZ']
    for atom_idx in [0,1]:
        X_all = map_atom_info(X_all,df_struct, atom_idx)
        X_all = map_atom_info(X_all,df_train_sub_charge, atom_idx)
        X_all = map_atom_info(X_all,df_train_sub_tensor, atom_idx)
        # Suffix the freshly joined columns with the atom number so the
        # second pass does not collide with the first.
        rename_map = {name: f'{name}_{atom_idx}'
                      for name in ['atom', 'x', 'y', 'z'] + tensor_components}
        rename_map['mulliken_charge'] = f'charge_{atom_idx}'
        X_all = X_all.rename(columns=rename_map)

    # Auxiliary target groups, in the column order the model heads expect.
    charge_cols = ["charge_0","charge_1"]
    diag_cols = ["XX_0","YY_0","ZZ_0","XX_1","YY_1","ZZ_1"]
    offdiag_cols = ["YX_0","ZX_0","XY_0","ZY_0","XZ_0","YZ_0",
                    "YX_1","ZX_1","XY_1","ZY_1","XZ_1","YZ_1"]
    y_cols = ['scalar_coupling_constant'] + charge_cols + diag_cols + offdiag_cols

    splits = X_all['split']
    ys_all = X_all[y_cols].copy()
    # Unscaled copy of the main target, taken before any scaling happens.
    target_all = X_all['scalar_coupling_constant'].copy()

    # Keep only features: drop targets and the non numeric columns
    X_all = X_all.drop(y_cols + ['molecule_name', 'split', 'atom_0', 'atom_1'], axis=1)

    #Impute NA with mean
    # THIS PART TAKES A LONG TIME
    logger.info('Filling in NA vaules with the mean value')
    X_all = X_all.fillna(value=X_all.mean())

    # STANDARD SCALAR STUFF
    # NOTE(review): both scalers are fitted on train, valid and test rows
    # together, not on the training rows alone.
    logger.info('Applying Standard scalar to data')
    X_all[X_all.columns] = StandardScaler().fit_transform(X_all[X_all.columns])
    ys_all[ys_all.columns] = StandardScaler().fit_transform(ys_all[ys_all.columns])

    is_train = splits == 'TRAIN'
    is_valid = splits == 'VALID'
    is_test = splits == 'TEST'

    X_train = X_all.loc[is_train]
    X_valid = X_all.loc[is_valid]
    X_test = X_all.loc[is_test]

    y_train = ys_all.loc[is_train]
    y_valid = ys_all.loc[is_valid]

    # Multipliers applied to the scaled auxiliary targets (charges, diagonal
    # tensor terms, off-diagonal tensor terms); presumably these weight the
    # auxiliary losses relative to the main one - TODO confirm.
    m1=2
    m2=4
    m3=1

    train_input=X_train.values
    cv_input=X_valid.values
    train_target=target_all.loc[is_train].values
    cv_target=target_all.loc[is_valid].values
    train_target_1=m1 * y_train[charge_cols].values
    cv_target_1=m1 * y_valid[charge_cols].values
    train_target_2=m2 * y_train[diag_cols].values
    cv_target_2=m2 * y_valid[diag_cols].values
    train_target_3=m3 * y_train[offdiag_cols].values
    cv_target_3=m3 * y_valid[offdiag_cols].values
    test_input=X_test.values
    logger.info('Done creating data for model')
    return (train_input, cv_input, train_target, cv_target,
            train_target_1, cv_target_1, train_target_2, cv_target_2,
            train_target_3, cv_target_3, test_input, X_valid.index)

In [8]:
def create_nn_model(input_shape):
    """
    Build the multi-output dense network (uncompiled).

    Architecture: four 1024-unit Dense -> BatchNorm -> LeakyReLU(0.10) stages
    (the first three followed by Dropout(0.2)) form a shared trunk. Three
    linear auxiliary heads branch off the trunk, and a further 256 -> 128
    stack feeds the main single-value head.

    Parameters
    ----------
    input_shape : int
        Number of input features per sample.

    Returns
    -------
    keras.models.Model
        Model with outputs ordered as ``[out, outM2, outT6, outT12]``:
        main target (1), charges (2), XX/YY/ZZ tensor terms (6) and the
        remaining tensor terms (12).
    """
    def dense_stage(tensor, units, dropout_rate=None):
        # One Dense -> BatchNorm -> LeakyReLU stage, with optional dropout.
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = LeakyReLU(alpha=0.10)(tensor)
        if dropout_rate is not None:
            tensor = Dropout(dropout_rate)(tensor)
        return tensor

    inp = Input(shape=(input_shape,))

    # Shared trunk; the last 1024-unit stage has no dropout.
    trunk = inp
    for dropout_rate in (0.2, 0.2, 0.2, None):
        trunk = dense_stage(trunk, 1024, dropout_rate)

    # Auxiliary heads read directly from the trunk.
    out1 = Dense(2, activation="linear",name='outM2')(trunk)#mulliken charge 2
    out2 = Dense(6, activation="linear",name='outT6')(trunk)#tensor 6(xx,yy,zz)
    out3 = Dense(12, activation="linear",name='outT12')(trunk)#tensor 12(others)

    # Main head gets two extra, narrower stages.
    head = trunk
    for units in (256, 128):
        head = dense_stage(head, units)
    out = Dense(1, activation="linear",name='out')(head)#scalar_coupling_constant

    return Model(inputs=inp, outputs=[out,out1,out2,out3])

In [9]:
def save_type_data(type_, oof, sub, fi, MODEL_NUMBER, run_id, MODEL_TYPE,
                   N_FOLDS, N_ESTIMATORS, LEARNING_RATE):
    """
    Score one coupling type and write its oof / sub (and optional fi) parquet
    files into ``../type_results/{type_}/`` using the shared naming template.

    Relies on the module-level ``test`` frame (columns 'id', 'molecule_name',
    'type') and on the module-level ``logger``.

    Parameters
    ----------
    type_ : str
        Coupling type to save.
    oof : pandas.DataFrame
        Needs 'type', 'scalar_coupling_constant' and 'oof_preds'.
    sub : pandas.DataFrame
        Needs 'id' and 'scalar_coupling_constant'; merged onto ``test`` by 'id'.
    fi : pandas.DataFrame or None
        Feature importances with a 'type' column; skipped when None.
    MODEL_NUMBER, run_id, MODEL_TYPE, N_FOLDS, N_ESTIMATORS, LEARNING_RATE
        Values interpolated into the output file name.

    Notes
    -----
    Sanity-check failures are logged as errors but do not stop the save.
    The MAE is formatted with 4 decimals ('0.4f') whereas the log-MAE uses
    '0.4' (4 significant digits); this is kept as-is so that file names stay
    the same.
    """
    type_rows = oof["type"] == type_
    oof_type = oof.loc[type_rows]

    score = mean_absolute_error(
        oof_type["scalar_coupling_constant"], oof_type["oof_preds"]
    )
    logscore = np.log(score)
    if score > 1:
        logger.error(f"No predictions for {type_}")
    print(
        f"===== Saving results for for type {type_} - mae {score} - log mae {logscore}"
    )

    # Attach molecule / type information to the predictions, then keep this type only.
    test_keys = test[["id", "molecule_name", "type"]]
    sub_type = test_keys.merge(sub, on="id")
    sub_type = sub_type.loc[sub_type["type"] == type_]

    zero_count = np.sum(sub_type["scalar_coupling_constant"] == 0)
    if zero_count > 10:
        logger.error("ERROR! Sub has too many zero predictions")
    expected_len = len(test.loc[test["type"] == type_])
    if len(sub_type) != expected_len:
        logger.error("ERRROR LENGTHS NOT THE SAME")

    # Name Files and save; "XXXXXXX" is the placeholder for the file kind.
    fn_template = "../type_results/{}/{}_{}_{}_XXXXXXX_{:0.4f}MAE_{:0.4}LMAE_{}_{}folds_{}iter_{}lr.parquet".format(
        type_, MODEL_NUMBER, run_id, type_, score, logscore,
        MODEL_TYPE, N_FOLDS, N_ESTIMATORS, LEARNING_RATE,
    )
    sub_name = fn_template.replace("XXXXXXX", "sub")
    oof_name = fn_template.replace("XXXXXXX", "oof")
    sub_type.to_parquet(sub_name)
    oof_type.to_parquet(oof_name)

    logger.info(f'{type_}: Saving sub to {sub_name}')
    logger.info(f'{type_}: Saving oof to {oof_name}')

    if fi is None:
        return
    fi_type = fi.loc[fi["type"] == type_]
    fi_name = fn_template.replace("XXXXXXX", "fi")
    print(fi_name)
    fi_type.to_parquet(fi_name)

In [10]:
def train_keras_model(fold, MODEL_NUMBER, bond_type, X_train_file, EPOCH_N=500, BATCH_SIZE=4096, verbose=None):
    """
    Train the multi-output network on one fold and predict valid + test rows.

    Parameters
    ----------
    fold : int
        Fold number; used in log messages and the checkpoint file name.
    MODEL_NUMBER : str
        Model identifier; prefixes the checkpoint file name.
    bond_type : str
        Coupling type; forwarded to ``create_datasets`` and used as plot label.
    X_train_file : str
        File name of this fold's X_train parquet (see ``create_datasets``).
    EPOCH_N : int, default 500
        Maximum number of epochs.
    BATCH_SIZE : int, default 4096
        Mini-batch size.
    verbose : int or None, default None
        Keras ``fit`` verbosity. When None, the module-level ``verbose``
        variable is used if it exists (the behaviour this function used to
        rely on implicitly), otherwise 2.

    Returns
    -------
    tuple
        ``(cv_predict, test_predict, val_idx)``: the model's list of output
        arrays for the validation rows and for the test rows (main target
        first), and the validation row index from ``create_datasets``.
    """
    if verbose is None:
        # Previously this was read straight from a global that is only
        # defined in a later cell; fall back to it so existing callers that
        # set the global keep their behaviour, without a NameError if unset.
        verbose = globals().get('verbose', 2)

    logger.info(f'Training model for fold {fold}')
    model_name_wrt = (f'{MODEL_NUMBER}molecule_model_{bond_type}-{fold}.hdf5')
    train_input, cv_input, train_target, cv_target, train_target_1, cv_target_1, train_target_2, cv_target_2, train_target_3, cv_target_3, test_input, val_idx = create_datasets(bond_type, X_train_file, fold)

    # Build the Neural Net; "mae" is applied to every one of the four outputs.
    nn_model=create_nn_model(train_input.shape[1])
    nn_model.compile(loss="mae", optimizer=Adam())

    # All callbacks watch the validation loss of the main head ('out') only,
    # not the combined multi-task loss.
    es = callbacks.EarlyStopping(
        monitor="val_out_loss",
        min_delta=0.0005,
        patience=30,
        verbose=1,
        mode="auto",
        restore_best_weights=True,
    )

    rlr = callbacks.ReduceLROnPlateau(
        monitor="val_out_loss",
        factor=0.3333,
        patience=15,
        min_lr=1e-6,
        mode="auto",
        verbose=1,
    )

    sv_mod = callbacks.ModelCheckpoint(
        model_name_wrt, monitor="val_out_loss", save_best_only=True, period=1
    )

    history = nn_model.fit(train_input,[train_target,train_target_1,train_target_2,train_target_3],
                       validation_data=(cv_input,[cv_target,cv_target_1,cv_target_2,cv_target_3]),
                       callbacks=[es, rlr, sv_mod],
                       epochs=EPOCH_N,
                       batch_size=BATCH_SIZE,
                       verbose=verbose)

    cv_predict=nn_model.predict(cv_input)
    test_predict=nn_model.predict(test_input)
    plot_history(history, bond_type)

    # CREATE OOF SUB AND STUFF
    return cv_predict, test_predict, val_idx

In [None]:
EPOCH_N = 500
# Prefix of the upstream model whose meta-feature files are used as input.
# NOTE: this contains the coupling type, so it must match `bond_type`.
INPUT_MODEL = 'M053_0725_0821_3JHH'

cv_score=[]
cv_score_total=0 
# Keras fit verbosity, read by train_keras_model.
verbose = 2

test = pd.read_csv('../input/test.csv')

# Out-of-fold frame: one row per training pair of this type, re-indexed from 0
# so that it lines up with the validation index returned for each fold.
oof = train.loc[train['type'] == bond_type].reset_index(drop=True).drop(['molecule_name','atom_index_0','atom_index_1'], axis=1)
# Copy so the per-fold prediction columns are added to an independent frame
# rather than to a slice of `test`.
sub = test.loc[test['type'] == bond_type].copy()

meta_dir = f'../type_results/{bond_type}/meta'
for fold in [1, 2, 3]:
    # Find the correct input filename for this fold. Reset first so a missing
    # file cannot silently reuse the previous fold's file name.
    X_train_file = None
    for f in os.listdir(meta_dir):
        if ('X_train' in f) and (INPUT_MODEL in f) and (f'_f{fold}_' in f):
            X_train_file = f
    if X_train_file is None:
        raise FileNotFoundError(
            f'No X_train file for model {INPUT_MODEL} fold {fold} in {meta_dir}'
        )
    logger.info(f'Using X_train file {X_train_file}')
        
    cv_predict, test_predict, val_idx = train_keras_model(fold=fold,
                                                 MODEL_NUMBER=MODEL_NUMBER,
                                                 bond_type=bond_type,
                                                 X_train_file=X_train_file,
                                                 EPOCH_N=EPOCH_N)
    # Output 0 is the main head; its single column is the prediction.
    # The boolean mask assigns in ascending index order, so this relies on
    # val_idx being in ascending order as well.
    oof.loc[oof.index.isin(val_idx), 'oof_preds'] = cv_predict[0][:,0]
    sub[f'scalar_coupling_constant_f{fold}'] = test_predict[0][:,0]

2019-07-31 21:55:41,639 | INFO : Using X_train file M053_0725_0821_3JHH_X_train_meta_fc_f1_0.1753MAE_-1.7413LMAE.parquet
2019-07-31 21:55:41,640 | INFO : Training model for fold 1
2019-07-31 21:55:41,641 | INFO : Creating Datasets
2019-07-31 21:55:42,611 | INFO : Adding target to dataset
2019-07-31 21:55:48,572 | INFO : Adding custom target features
2019-07-31 21:57:03,175 | INFO : Filling in NA vaules with the mean value
2019-07-31 21:57:06,852 | INFO : Applying Standard scalar to data
2019-07-31 21:58:13,942 | INFO : Done creating data for model




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Train on 393740 samples, validate on 196871 samples
Epoch 1/500
 - 9s - loss: 4.6447 - out_loss: 1.1379 - outM2_loss: 0.3927 - outT6_loss: 2.3148 - outT12_loss: 0.7993 - val_loss: 4.0876 - val_out_loss: 0.9249 - val_outM2_loss: 0.3730 - val_outT6_loss: 2.0265 - val_outT12_loss: 0.7631
Epoch 2/500
 - 7s - loss: 3.3520 - out_

In [None]:
# Timestamp (month/day_hour/minute) that tags the saved result files.
run_id = "{:%m%d_%H%M}".format(datetime.now())

logger.info('Plotting Results')
oof.plot(x='scalar_coupling_constant', y='oof_preds', kind='scatter', figsize=(5, 5), title='OOF Preds vs Actual')
plt.show()

# Compare each fold's test predictions against an earlier submission.
# The reference values are attached by index alignment, not by 'id'.
df = pd.read_csv('../submissions/BLEND035_sub_-2.00491CV.csv') # Pull a good submission
sub['scc'] = df['scalar_coupling_constant']

fold_pred_cols = ['scalar_coupling_constant_f1',
                  'scalar_coupling_constant_f2',
                  'scalar_coupling_constant_f3']
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5), sharex=True, sharey=True)
for fold_ax, fold_col in zip((ax1, ax2, ax3), fold_pred_cols):
    sub.plot(x='scc', y=fold_col, kind='scatter', ax=fold_ax)
plt.show()

# Final test prediction is the plain average over the three folds.
sub['scalar_coupling_constant'] = sub[fold_pred_cols].mean(axis=1)

logger.info('Saving Results')
save_type_data(
    type_=bond_type,
    oof=oof,
    sub=sub.drop('type', axis=1),
    fi=None,
    MODEL_NUMBER=MODEL_NUMBER,
    run_id=run_id,
    MODEL_TYPE='keras',
    N_FOLDS=3,
    N_ESTIMATORS='',
    LEARNING_RATE='',
)