In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import random
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

import pickle

In [2]:
# Base data directory, given relative to the notebook's working directory.
FOLDER = '../../data_kaggle/champs/'
# Sub-folder of FOLDER: the feature CSV is read from here and the trained
# models are pickled here at the end of the notebook.
OUTPUT = FOLDER + 'out/'

In [3]:
# df_mulliken_charges = pd.read_csv(FOLDER + 'mulliken_charges.csv')
# df_sample =  pd.read_csv(FOLDER + 'sample_submission.csv')
# df_magnetic_shielding_tensors = pd.read_csv(FOLDER + 'magnetic_shielding_tensors.csv')
# df_train = pd.read_csv(FOLDER + 'train.csv')
# df_test = pd.read_csv(FOLDER + 'test.csv')
# df_dipole_moments = pd.read_csv(FOLDER + 'dipole_moments.csv')
# df_potential_energy = pd.read_csv(FOLDER + 'potential_energy.csv')
# df_structures = pd.read_csv(FOLDER + 'structures.csv')
# df_scalar_coupling_contributions = pd.read_csv(FOLDER + 'scalar_coupling_contributions.csv')

In [4]:
# Load the training table used by every later cell. It must provide the
# 'type', 'molecule_name' and 'scalar_coupling_constant' columns that the
# cells below read; the remaining columns are presumably precomputed
# distance/cosine features (judging by the file name) — TODO confirm.
df_train_dist = pd.read_csv(OUTPUT + '20190611_train_dist_cos.csv')

In [5]:
def type_score(y_val, y_pred):
    """Return the natural log of the mean absolute error between two arrays.

    Parameters
    ----------
    y_val : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, broadcast-compatible with ``y_val``.

    Returns
    -------
    numpy.float64 (or an array for 2-D input)
        ``log(mean(|y_val - y_pred|))`` with the mean taken along axis 0.
        A perfect prediction yields ``-inf``.

    Raises
    ------
    ValueError
        If the inputs are empty (the mean would be undefined).

    Notes
    -----
    Inputs are converted with ``np.asarray``, so plain lists are accepted and
    pandas objects are compared positionally rather than by index label.
    """
    abs_err = np.abs(np.asarray(y_val, dtype=float) - np.asarray(y_pred, dtype=float))
    if abs_err.size == 0:
        raise ValueError('type_score requires at least one sample')
    # Vectorised mean instead of the Python builtin sum(), which iterates the
    # array element by element; axis=0 keeps the builtin's row-wise reduction.
    return np.log(np.mean(abs_err, axis=0))

In [6]:
# Distinct values of the 'type' column, in order of first appearance.
# The training cell fits one model per value (per seed).
bond_types = df_train_dist['type'].unique()

In [7]:
# Split by molecule (not by row) so that every row of a given molecule lands
# on the same side of the train/validation boundary: 80 % train, 20 % validation.
SPLIT_SEED = 0  # fixed so the split (and the reported validation scores) is reproducible

mols = df_train_dist['molecule_name'].unique()
num = len(mols)
num_train = int(num * 0.8)

# A dedicated Random instance yields a deterministic permutation of the
# molecule indices without touching the global `random` state.
pickup = random.Random(SPLIT_SEED).sample(range(num), num)
pick_train = pickup[:num_train]
pick_val = pickup[num_train:]

In [8]:
# Upper bound on boosting iterations per model (passed as num_boost_round);
# early stopping in the training cell normally ends training sooner.
rounds = 20000
# Number of LightGBM seeds to train with; seeds 0 .. num_seed-1 are used.
num_seed = 5

In [9]:
# Train one LightGBM regressor per (seed, coupling type) pair.
#   models[s][i] -> booster trained with seed `s` on bond_types[i]
#   scores[s][i] -> log-MAE of that booster on the validation molecules
models = []
scores = []

# The molecule split depends on neither the seed nor the bond type,
# so resolve the molecule names once instead of once per model.
train_mols = mols[pick_train]
val_mols = mols[pick_val]

for s in range(num_seed):
    print('seed No.{}'.format(s))
    models_s = []
    scores_s = []
    for b in bond_types:
        # The original format string had a single placeholder, so the bond
        # type was silently dropped from the progress message.
        print('seed No.{}, bond type {}'.format(s, b))
        df_bond = df_train_dist.query('type == "{}"'.format(b))

        # Columns from index 6 onward are used as features — presumably the
        # first six hold ids / type / target; TODO confirm against the CSV.
        df_train = df_bond[df_bond['molecule_name'].isin(train_mols)]
        y_train = df_train['scalar_coupling_constant'].values
        X_train = df_train[df_train.columns[6:]].values

        df_val = df_bond[df_bond['molecule_name'].isin(val_mols)]
        y_val = df_val['scalar_coupling_constant'].values
        X_val = df_val[df_val.columns[6:]].values

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

        # Only 'seed' differs between the outer-loop iterations.
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'l2'},
            'num_leaves': 31,
            'learning_rate': 0.1,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'seed': s,
        }

        # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed
        # from lgb.train() in LightGBM 4.0; on newer versions pass
        # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)] instead.
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=rounds,
                        verbose_eval=0,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=100)

        # Predict with the iteration that scored best on the validation set.
        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

        scores_s.append(type_score(y_val, y_pred))
        models_s.append(gbm)
    models.append(models_s)
    # Stored as a float array so that `.mean()` works in the reporting cell.
    scores.append(np.array(scores_s, dtype=float))

Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l2: 5.0577
[200]	valid_0's l2: 3.75177
[300]	valid_0's l2: 3.19995
[400]	valid_0's l2: 2.86778
[500]	valid_0's l2: 2.64098
[600]	valid_0's l2: 2.4643
[700]	valid_0's l2: 2.3251
[800]	valid_0's l2: 2.21202
[900]	valid_0's l2: 2.11416
[1000]	valid_0's l2: 2.02953
[1100]	valid_0's l2: 1.95413
[1200]	valid_0's l2: 1.89666
[1300]	valid_0's l2: 1.83333
[1400]	valid_0's l2: 1.78246
[1500]	valid_0's l2: 1.73638
[1600]	valid_0's l2: 1.69371
[1700]	valid_0's l2: 1.65741
[1800]	valid_0's l2: 1.62383
[1900]	valid_0's l2: 1.59371
[2000]	valid_0's l2: 1.56345
[2100]	valid_0's l2: 1.53613
[2200]	valid_0's l2: 1.5103
[2300]	valid_0's l2: 1.48581
[2400]	valid_0's l2: 1.46377
[2500]	valid_0's l2: 1.44155
[2600]	valid_0's l2: 1.42242
[2700]	valid_0's l2: 1.40433
[2800]	valid_0's l2: 1.38659
[2900]	valid_0's l2: 1.37021
[3000]	valid_0's l2: 1.35441
[3100]	valid_0's l2: 1.33876
[3200]	valid_0's l2: 1.32481
[3300]	valid_0's l2: 1.

[1800]	valid_0's l2: 0.505063
[1900]	valid_0's l2: 0.494578
[2000]	valid_0's l2: 0.485011
[2100]	valid_0's l2: 0.476729
[2200]	valid_0's l2: 0.468491
[2300]	valid_0's l2: 0.460519
[2400]	valid_0's l2: 0.452793
[2500]	valid_0's l2: 0.447233
[2600]	valid_0's l2: 0.441176
[2700]	valid_0's l2: 0.43584
[2800]	valid_0's l2: 0.429673
[2900]	valid_0's l2: 0.423832
[3000]	valid_0's l2: 0.4181
[3100]	valid_0's l2: 0.413595
[3200]	valid_0's l2: 0.409748
[3300]	valid_0's l2: 0.405502
[3400]	valid_0's l2: 0.401284
[3500]	valid_0's l2: 0.397368
[3600]	valid_0's l2: 0.393574
[3700]	valid_0's l2: 0.390162
[3800]	valid_0's l2: 0.386672
[3900]	valid_0's l2: 0.383474
[4000]	valid_0's l2: 0.380227
[4100]	valid_0's l2: 0.37713
[4200]	valid_0's l2: 0.374373
[4300]	valid_0's l2: 0.371311
[4400]	valid_0's l2: 0.368612
[4500]	valid_0's l2: 0.365615
[4600]	valid_0's l2: 0.36313
[4700]	valid_0's l2: 0.360704
[4800]	valid_0's l2: 0.358112
[4900]	valid_0's l2: 0.356093
[5000]	valid_0's l2: 0.353975
[5100]	valid_0'

In [10]:
# Per-seed report: the validation score of each bond-type model, followed by
# their unweighted mean.
separator = '-' * 10
for seed_idx in range(num_seed):
    seed_scores = scores[seed_idx]
    print(separator, seed_idx, separator)
    print('type score:', seed_scores)
    print('total score:', seed_scores.mean())

type score: [-0.42611669 -1.80932696 -1.04113371 -1.63113263 -1.14246793 -0.51528461
 -0.33062297 -1.75250977]
total score: -1.0810744097041554


In [19]:
# scores_en = np.zeros([0])

# for i, b in enumerate(bond_types):
#     df_bond = df_train_dist.query('type == "{}"'.format(b))
#     df_train = df_bond[df_bond['molecule_name'].isin(mols[pick_train])]
#     y_train = df_train['scalar_coupling_constant'].values
#     X_train = df_train[df_train.columns[6:]].values
    
#     df_val = df_bond[df_bond['molecule_name'].isin(mols[pick_val])]
#     y_val = df_val['scalar_coupling_constant'].values
#     X_val = df_val[df_val.columns[6:]].values
    
#     gbm_0 = models_0[i]
#     gbm_1 = models_1[i]
#     gbm_2 = models_2[i]
#     gbm_3 = models_3[i]
#     gbm_4 = models_4[i]
    
#     y_pred_train_0 = gbm_0.predict(X_train, num_iteration=gbm_0.best_iteration)
#     y_pred_train_1 = gbm_1.predict(X_train, num_iteration=gbm_1.best_iteration)
#     y_pred_train_2 = gbm_2.predict(X_train, num_iteration=gbm_2.best_iteration)
#     y_pred_train_3 = gbm_3.predict(X_train, num_iteration=gbm_3.best_iteration)
#     y_pred_train_4 = gbm_4.predict(X_train, num_iteration=gbm_4.best_iteration)
#     y_pred_train = (y_pred_train_0 + y_pred_train_1 + y_pred_train_2 + y_pred_train_3 + y_pred_train_4) / 5.0
    
#     y_pred_0 = gbm_0.predict(X_val, num_iteration=gbm_0.best_iteration)
#     y_pred_1 = gbm_1.predict(X_val, num_iteration=gbm_1.best_iteration)
#     y_pred_2 = gbm_2.predict(X_val, num_iteration=gbm_2.best_iteration)
#     y_pred_3 = gbm_3.predict(X_val, num_iteration=gbm_3.best_iteration)
#     y_pred_4 = gbm_4.predict(X_val, num_iteration=gbm_4.best_iteration)
#     y_pred_en = (y_pred_0 + y_pred_1 + y_pred_2 + y_pred_3 + y_pred_4) / 5.0
    
#     score = type_score(y_val, y_pred_en)
#     scores_en = np.hstack([scores_en, score])
#     plt.scatter(y_train, y_pred_train)
#     plt.scatter(y_val, y_pred_en)
#     plt.title('{}'.format(b))
#     plt.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()])
#     plt.show()

In [20]:
# print('type score:', scores_en)
# print('total score:', scores_en.mean())

In [21]:
# Persist the nested list of trained boosters (outer index: seed, inner
# index: position in bond_types) so later notebooks can reuse them.
with open(OUTPUT + '20190622_champs_models_lgb_en.pickle', mode='wb') as model_file:
    pickle.dump(models, model_file)