In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import random
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

import pickle

In [2]:
# Root directory of the competition data, given relative to the working directory.
FOLDER = '../../data_kaggle/champs/'
# Sub-directory of FOLDER that this notebook reads its input table from and
# writes its pickled models to.
OUTPUT = ''.join([FOLDER, 'out/'])

In [3]:
# df_mulliken_charges = pd.read_csv(FOLDER + 'mulliken_charges.csv')
# df_sample =  pd.read_csv(FOLDER + 'sample_submission.csv')
# df_magnetic_shielding_tensors = pd.read_csv(FOLDER + 'magnetic_shielding_tensors.csv')
# df_train = pd.read_csv(FOLDER + 'train.csv')
# df_test = pd.read_csv(FOLDER + 'test.csv')
# df_dipole_moments = pd.read_csv(FOLDER + 'dipole_moments.csv')
# df_potential_energy = pd.read_csv(FOLDER + 'potential_energy.csv')
# df_structures = pd.read_csv(FOLDER + 'structures.csv')
# df_scalar_coupling_contributions = pd.read_csv(FOLDER + 'scalar_coupling_contributions.csv')

In [4]:
# Input table for every later cell. The cells below read its 'type',
# 'molecule_name' and 'scalar_coupling_constant' columns and use every column
# from position 6 onwards as model input.
df_train_dist = pd.read_csv(
    filepath_or_buffer=OUTPUT + '20190611_train_dist_cos.csv'
)

In [5]:
def type_score(y_val, y_pred):
    """Return the natural log of the mean absolute error.

    Parameters
    ----------
    y_val : array-like of numbers
        Reference values.
    y_pred : array-like of numbers
        Predicted values, same shape as ``y_val``.

    Returns
    -------
    numpy.float64
        ``log(mean(|y_val - y_pred|))``. The result is ``-inf`` (NumPy emits a
        warning) when every prediction equals its reference value.

    Raises
    ------
    ValueError
        If the inputs contain no elements, since the mean is undefined.
    """
    # asarray lets plain lists / tuples work, and keeps the reduction inside
    # NumPy instead of iterating element by element with the built-in sum().
    errors = np.abs(np.asarray(y_val, dtype=float) - np.asarray(y_pred, dtype=float))
    if errors.size == 0:
        raise ValueError('type_score needs at least one value')
    return np.log(errors.mean())

In [6]:
# Distinct values of the 'type' column; the training cells fit one model per value.
bond_types = df_train_dist.loc[:, 'type'].unique()

In [7]:
# Split by molecule rather than by row, so that all rows of one molecule end
# up on the same side of the train/validation boundary.
mols = df_train_dist['molecule_name'].unique()
num = len(mols)
num_train = int(num * 0.8)  # first 80% of the shuffled molecules train, the rest validate

# Shuffle the molecule indices with a dedicated, seeded generator. The split
# used to come from the unseeded global `random` state, so it changed on every
# kernel restart and validation scores were not comparable between runs.
SPLIT_SEED = 0
pickup = random.Random(SPLIT_SEED).sample(range(num), num)
pick_train = pickup[:num_train]
pick_val = pickup[num_train:]

In [8]:
# Upper bound on boosting iterations, passed to lgb.train as num_boost_round.
# The training cells also pass early_stopping_rounds=10, so training can stop
# before this limit is reached.
NUM_BOOST_ROUNDS = 10000
# Backward-compatible alias: the training cells refer to this (misspelled) name.
rouds = NUM_BOOST_ROUNDS

In [9]:
# Ensemble member trained with seed 0: one LightGBM regressor per entry of
# bond_types, so models_0[k] and scores_0[k] belong to bond_types[k].
# NOTE(review): the validation rows drive early stopping and are also the rows
# that get scored, so the reported scores are likely optimistic.
models_0 = []
scores_0 = np.zeros([0])

# Loop-invariant values, built once instead of once per type.
train_mols = mols[pick_train]
val_mols = mols[pick_val]
# presumably the first six columns hold ids / type / target and the rest are
# features -- TODO confirm against the CSV header
feature_cols = df_train_dist.columns[6:]
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 0,  # the only setting that differs between the five ensemble cells
}

for b in bond_types:
    # Boolean mask rather than a string-built query: a label containing a
    # double quote can no longer break the expression.
    df_bond = df_train_dist[df_train_dist['type'] == b]

    df_train = df_bond[df_bond['molecule_name'].isin(train_mols)]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train[feature_cols].values

    df_val = df_bond[df_bond['molecule_name'].isin(val_mols)]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val[feature_cols].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=rouds,
                    valid_sets=lgb_eval,
                    verbose_eval=100,
                    early_stopping_rounds=10)

    # Predict with the iteration that early stopping selected.
    y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

    score = type_score(y_val, y_pred)
    scores_0 = np.hstack([scores_0, score])
    models_0.append(gbm)


Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l2: 5.0577
[200]	valid_0's l2: 3.75177
[300]	valid_0's l2: 3.19995
[400]	valid_0's l2: 2.86778
[500]	valid_0's l2: 2.64098
[600]	valid_0's l2: 2.4643
[700]	valid_0's l2: 2.3251
[800]	valid_0's l2: 2.21202
[900]	valid_0's l2: 2.11416
[1000]	valid_0's l2: 2.02953
[1100]	valid_0's l2: 1.95413
[1200]	valid_0's l2: 1.89666
[1300]	valid_0's l2: 1.83333
[1400]	valid_0's l2: 1.78246
[1500]	valid_0's l2: 1.73638
[1600]	valid_0's l2: 1.69371
[1700]	valid_0's l2: 1.65741
[1800]	valid_0's l2: 1.62383
[1900]	valid_0's l2: 1.59371
[2000]	valid_0's l2: 1.56345
[2100]	valid_0's l2: 1.53613
[2200]	valid_0's l2: 1.5103
[2300]	valid_0's l2: 1.48581
[2400]	valid_0's l2: 1.46377
[2500]	valid_0's l2: 1.44155
[2600]	valid_0's l2: 1.42242
[2700]	valid_0's l2: 1.40433
[2800]	valid_0's l2: 1.38659
[2900]	valid_0's l2: 1.37021
[3000]	valid_0's l2: 1.35441
[3100]	valid_0's l2: 1.33876
[3200]	valid_0's l2: 1.32481
[3300]	valid_0's l2: 1.

[1800]	valid_0's l2: 0.505063
[1900]	valid_0's l2: 0.494578
[2000]	valid_0's l2: 0.485011
[2100]	valid_0's l2: 0.476729
[2200]	valid_0's l2: 0.468491
[2300]	valid_0's l2: 0.460519
[2400]	valid_0's l2: 0.452793
[2500]	valid_0's l2: 0.447233
[2600]	valid_0's l2: 0.441176
[2700]	valid_0's l2: 0.43584
[2800]	valid_0's l2: 0.429673
[2900]	valid_0's l2: 0.423832
[3000]	valid_0's l2: 0.4181
[3100]	valid_0's l2: 0.413595
[3200]	valid_0's l2: 0.409748
[3300]	valid_0's l2: 0.405502
[3400]	valid_0's l2: 0.401284
[3500]	valid_0's l2: 0.397368
[3600]	valid_0's l2: 0.393574
[3700]	valid_0's l2: 0.390162
[3800]	valid_0's l2: 0.386672
[3900]	valid_0's l2: 0.383474
[4000]	valid_0's l2: 0.380227
[4100]	valid_0's l2: 0.37713
[4200]	valid_0's l2: 0.374373
[4300]	valid_0's l2: 0.371311
[4400]	valid_0's l2: 0.368612
[4500]	valid_0's l2: 0.365615
[4600]	valid_0's l2: 0.36313
[4700]	valid_0's l2: 0.360704
[4800]	valid_0's l2: 0.358112
[4900]	valid_0's l2: 0.356093
[5000]	valid_0's l2: 0.353975
[5100]	valid_0'

In [10]:
# Validation score of each per-type model from the seed-0 cell, then their unweighted mean.
print('type score:', scores_0)
print('total score:', np.mean(scores_0))

type score: [-0.42611669 -1.80932696 -1.04113371 -1.63113263 -1.14246793 -0.51528461
 -0.33062297 -1.75250977]
total score: -1.0810744097041554


In [11]:
# Ensemble member trained with seed 1: one LightGBM regressor per entry of
# bond_types, so models_1[k] and scores_1[k] belong to bond_types[k].
# NOTE(review): the validation rows drive early stopping and are also the rows
# that get scored, so the reported scores are likely optimistic.
models_1 = []
scores_1 = np.zeros([0])

# Loop-invariant values, built once instead of once per type.
train_mols = mols[pick_train]
val_mols = mols[pick_val]
# presumably the first six columns hold ids / type / target and the rest are
# features -- TODO confirm against the CSV header
feature_cols = df_train_dist.columns[6:]
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 1,  # the only setting that differs between the five ensemble cells
}

for b in bond_types:
    # Boolean mask rather than a string-built query: a label containing a
    # double quote can no longer break the expression.
    df_bond = df_train_dist[df_train_dist['type'] == b]

    df_train = df_bond[df_bond['molecule_name'].isin(train_mols)]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train[feature_cols].values

    df_val = df_bond[df_bond['molecule_name'].isin(val_mols)]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val[feature_cols].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=rouds,
                    valid_sets=lgb_eval,
                    verbose_eval=100,
                    early_stopping_rounds=10)

    # Predict with the iteration that early stopping selected.
    y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

    score = type_score(y_val, y_pred)
    scores_1 = np.hstack([scores_1, score])
    models_1.append(gbm)


Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l2: 5.08255
[200]	valid_0's l2: 3.76924
[300]	valid_0's l2: 3.21308
[400]	valid_0's l2: 2.87761
[500]	valid_0's l2: 2.65745
[600]	valid_0's l2: 2.47387
[700]	valid_0's l2: 2.33514
[800]	valid_0's l2: 2.21949
[900]	valid_0's l2: 2.12311
[1000]	valid_0's l2: 2.03328
[1100]	valid_0's l2: 1.96525
[1200]	valid_0's l2: 1.90329
[1300]	valid_0's l2: 1.84306
[1400]	valid_0's l2: 1.79385
[1500]	valid_0's l2: 1.74962
[1600]	valid_0's l2: 1.70635
[1700]	valid_0's l2: 1.66661
[1800]	valid_0's l2: 1.63083
[1900]	valid_0's l2: 1.59878
[2000]	valid_0's l2: 1.56698
[2100]	valid_0's l2: 1.53882
[2200]	valid_0's l2: 1.51335
[2300]	valid_0's l2: 1.48793
[2400]	valid_0's l2: 1.46532
[2500]	valid_0's l2: 1.44356
[2600]	valid_0's l2: 1.42348
[2700]	valid_0's l2: 1.40506
[2800]	valid_0's l2: 1.38875
[2900]	valid_0's l2: 1.3725
[3000]	valid_0's l2: 1.35606
[3100]	valid_0's l2: 1.34076
[3200]	valid_0's l2: 1.32638
[3300]	valid_0's l2:

[1700]	valid_0's l2: 0.513054
[1800]	valid_0's l2: 0.50206
[1900]	valid_0's l2: 0.491658
[2000]	valid_0's l2: 0.482588
[2100]	valid_0's l2: 0.473852
[2200]	valid_0's l2: 0.465335
[2300]	valid_0's l2: 0.458483
[2400]	valid_0's l2: 0.452499
[2500]	valid_0's l2: 0.445123
[2600]	valid_0's l2: 0.438492
[2700]	valid_0's l2: 0.432621
[2800]	valid_0's l2: 0.427098
[2900]	valid_0's l2: 0.42218
[3000]	valid_0's l2: 0.417169
[3100]	valid_0's l2: 0.41282
[3200]	valid_0's l2: 0.408459
[3300]	valid_0's l2: 0.404669
[3400]	valid_0's l2: 0.400984
[3500]	valid_0's l2: 0.396916
[3600]	valid_0's l2: 0.393695
[3700]	valid_0's l2: 0.389687
[3800]	valid_0's l2: 0.38608
[3900]	valid_0's l2: 0.383073
[4000]	valid_0's l2: 0.38014
[4100]	valid_0's l2: 0.376974
[4200]	valid_0's l2: 0.374178
[4300]	valid_0's l2: 0.371385
[4400]	valid_0's l2: 0.368768
[4500]	valid_0's l2: 0.366294
[4600]	valid_0's l2: 0.363703
[4700]	valid_0's l2: 0.36116
[4800]	valid_0's l2: 0.358582
[4900]	valid_0's l2: 0.356091
[5000]	valid_0's

In [12]:
# Validation score of each per-type model from the seed-1 cell, then their unweighted mean.
print('type score:', scores_1)
print('total score:', np.mean(scores_1))

type score: [-0.42501043 -1.82356912 -0.96479636 -1.62369179 -1.14394376 -0.49076289
 -0.33093402 -1.77501596]
total score: -1.0722155423818838


In [13]:
# Ensemble member trained with seed 2: one LightGBM regressor per entry of
# bond_types, so models_2[k] and scores_2[k] belong to bond_types[k].
# NOTE(review): the validation rows drive early stopping and are also the rows
# that get scored, so the reported scores are likely optimistic.
models_2 = []
scores_2 = np.zeros([0])

# Loop-invariant values, built once instead of once per type.
train_mols = mols[pick_train]
val_mols = mols[pick_val]
# presumably the first six columns hold ids / type / target and the rest are
# features -- TODO confirm against the CSV header
feature_cols = df_train_dist.columns[6:]
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 2,  # the only setting that differs between the five ensemble cells
}

for b in bond_types:
    # Boolean mask rather than a string-built query: a label containing a
    # double quote can no longer break the expression.
    df_bond = df_train_dist[df_train_dist['type'] == b]

    df_train = df_bond[df_bond['molecule_name'].isin(train_mols)]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train[feature_cols].values

    df_val = df_bond[df_bond['molecule_name'].isin(val_mols)]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val[feature_cols].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=rouds,
                    valid_sets=lgb_eval,
                    verbose_eval=100,
                    early_stopping_rounds=10)

    # Predict with the iteration that early stopping selected.
    y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

    score = type_score(y_val, y_pred)
    scores_2 = np.hstack([scores_2, score])
    models_2.append(gbm)


Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l2: 5.02265
[200]	valid_0's l2: 3.7405
[300]	valid_0's l2: 3.18633
[400]	valid_0's l2: 2.86285
[500]	valid_0's l2: 2.62749
[600]	valid_0's l2: 2.44945
[700]	valid_0's l2: 2.31263
[800]	valid_0's l2: 2.19663
[900]	valid_0's l2: 2.09814
[1000]	valid_0's l2: 2.013
[1100]	valid_0's l2: 1.93544
[1200]	valid_0's l2: 1.87407
[1300]	valid_0's l2: 1.82018
[1400]	valid_0's l2: 1.76904
[1500]	valid_0's l2: 1.72363
[1600]	valid_0's l2: 1.68418
[1700]	valid_0's l2: 1.64708
[1800]	valid_0's l2: 1.61463
[1900]	valid_0's l2: 1.58194
[2000]	valid_0's l2: 1.5521
[2100]	valid_0's l2: 1.52616
[2200]	valid_0's l2: 1.49834
[2300]	valid_0's l2: 1.47616
[2400]	valid_0's l2: 1.4535
[2500]	valid_0's l2: 1.43354
[2600]	valid_0's l2: 1.41289
[2700]	valid_0's l2: 1.39379
[2800]	valid_0's l2: 1.37827
[2900]	valid_0's l2: 1.36205
[3000]	valid_0's l2: 1.34624
[3100]	valid_0's l2: 1.33119
[3200]	valid_0's l2: 1.31819
[3300]	valid_0's l2: 1.3

[2400]	valid_0's l2: 0.455895
[2500]	valid_0's l2: 0.449123
[2600]	valid_0's l2: 0.442855
[2700]	valid_0's l2: 0.436451
[2800]	valid_0's l2: 0.431015
[2900]	valid_0's l2: 0.42587
[3000]	valid_0's l2: 0.421203
[3100]	valid_0's l2: 0.416963
[3200]	valid_0's l2: 0.411673
[3300]	valid_0's l2: 0.407585
[3400]	valid_0's l2: 0.402824
[3500]	valid_0's l2: 0.398545
[3600]	valid_0's l2: 0.394678
[3700]	valid_0's l2: 0.390867
[3800]	valid_0's l2: 0.387143
[3900]	valid_0's l2: 0.383516
[4000]	valid_0's l2: 0.380754
[4100]	valid_0's l2: 0.377805
[4200]	valid_0's l2: 0.374541
[4300]	valid_0's l2: 0.371686
[4400]	valid_0's l2: 0.368665
[4500]	valid_0's l2: 0.365857
[4600]	valid_0's l2: 0.363298
[4700]	valid_0's l2: 0.360859
[4800]	valid_0's l2: 0.358266
[4900]	valid_0's l2: 0.355813
[5000]	valid_0's l2: 0.353517
[5100]	valid_0's l2: 0.351389
[5200]	valid_0's l2: 0.349348
[5300]	valid_0's l2: 0.347431
[5400]	valid_0's l2: 0.345439
[5500]	valid_0's l2: 0.343672
[5600]	valid_0's l2: 0.34168
[5700]	valid

In [14]:
# Validation score of each per-type model from the seed-2 cell, then their unweighted mean.
print('type score:', scores_2)
print('total score:', np.mean(scores_2))

type score: [-0.42790368 -1.79459683 -0.99935047 -1.65770854 -1.14570606 -0.53483852
 -0.33186052 -1.78909998]
total score: -1.0851330757750979


In [15]:
# Ensemble member trained with seed 3: one LightGBM regressor per entry of
# bond_types, so models_3[k] and scores_3[k] belong to bond_types[k].
# NOTE(review): the validation rows drive early stopping and are also the rows
# that get scored, so the reported scores are likely optimistic.
models_3 = []
scores_3 = np.zeros([0])

# Loop-invariant values, built once instead of once per type.
train_mols = mols[pick_train]
val_mols = mols[pick_val]
# presumably the first six columns hold ids / type / target and the rest are
# features -- TODO confirm against the CSV header
feature_cols = df_train_dist.columns[6:]
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 3,  # the only setting that differs between the five ensemble cells
}

for b in bond_types:
    # Boolean mask rather than a string-built query: a label containing a
    # double quote can no longer break the expression.
    df_bond = df_train_dist[df_train_dist['type'] == b]

    df_train = df_bond[df_bond['molecule_name'].isin(train_mols)]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train[feature_cols].values

    df_val = df_bond[df_bond['molecule_name'].isin(val_mols)]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val[feature_cols].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=rouds,
                    valid_sets=lgb_eval,
                    verbose_eval=100,
                    early_stopping_rounds=10)

    # Predict with the iteration that early stopping selected.
    y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

    score = type_score(y_val, y_pred)
    scores_3 = np.hstack([scores_3, score])
    models_3.append(gbm)


Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l2: 4.98613
[200]	valid_0's l2: 3.70223
[300]	valid_0's l2: 3.1574
[400]	valid_0's l2: 2.83064
[500]	valid_0's l2: 2.60369
[600]	valid_0's l2: 2.42866
[700]	valid_0's l2: 2.29236
[800]	valid_0's l2: 2.1729
[900]	valid_0's l2: 2.07386
[1000]	valid_0's l2: 1.99251
[1100]	valid_0's l2: 1.92407
[1200]	valid_0's l2: 1.86028
[1300]	valid_0's l2: 1.80549
[1400]	valid_0's l2: 1.75828
[1500]	valid_0's l2: 1.71653
[1600]	valid_0's l2: 1.67806
[1700]	valid_0's l2: 1.63857
[1800]	valid_0's l2: 1.60362
[1900]	valid_0's l2: 1.57218
[2000]	valid_0's l2: 1.54031
[2100]	valid_0's l2: 1.51517
[2200]	valid_0's l2: 1.48888
[2300]	valid_0's l2: 1.46532
[2400]	valid_0's l2: 1.44429
[2500]	valid_0's l2: 1.42401
[2600]	valid_0's l2: 1.40261
[2700]	valid_0's l2: 1.3847
[2800]	valid_0's l2: 1.36734
[2900]	valid_0's l2: 1.35203
[3000]	valid_0's l2: 1.33741
[3100]	valid_0's l2: 1.32247
[3200]	valid_0's l2: 1.30755
[3300]	valid_0's l2: 1

[900]	valid_0's l2: 0.662245
[1000]	valid_0's l2: 0.634716
[1100]	valid_0's l2: 0.61275
[1200]	valid_0's l2: 0.592367
[1300]	valid_0's l2: 0.573479
[1400]	valid_0's l2: 0.555263
[1500]	valid_0's l2: 0.540982
[1600]	valid_0's l2: 0.529114
[1700]	valid_0's l2: 0.516389
[1800]	valid_0's l2: 0.505553
[1900]	valid_0's l2: 0.494971
[2000]	valid_0's l2: 0.485183
[2100]	valid_0's l2: 0.476384
[2200]	valid_0's l2: 0.468457
[2300]	valid_0's l2: 0.460624
[2400]	valid_0's l2: 0.452771
[2500]	valid_0's l2: 0.446892
[2600]	valid_0's l2: 0.439734
[2700]	valid_0's l2: 0.432822
[2800]	valid_0's l2: 0.426722
[2900]	valid_0's l2: 0.42115
[3000]	valid_0's l2: 0.415343
[3100]	valid_0's l2: 0.41043
[3200]	valid_0's l2: 0.405693
[3300]	valid_0's l2: 0.401099
[3400]	valid_0's l2: 0.396989
[3500]	valid_0's l2: 0.393122
[3600]	valid_0's l2: 0.389651
[3700]	valid_0's l2: 0.385735
[3800]	valid_0's l2: 0.382411
[3900]	valid_0's l2: 0.378805
[4000]	valid_0's l2: 0.375806
[4100]	valid_0's l2: 0.37277
[4200]	valid_0'

In [16]:
# Validation score of each per-type model from the seed-3 cell, then their unweighted mean.
print('type score:', scores_3)
print('total score:', np.mean(scores_3))

type score: [-0.43294588 -1.8207781  -1.01344189 -1.62550069 -1.1479221  -0.51572132
 -0.33105836 -1.7551164 ]
total score: -1.0803105928742167


In [17]:
# Ensemble member trained with seed 4: one LightGBM regressor per entry of
# bond_types, so models_4[k] and scores_4[k] belong to bond_types[k].
# NOTE(review): the validation rows drive early stopping and are also the rows
# that get scored, so the reported scores are likely optimistic.
models_4 = []
scores_4 = np.zeros([0])

# Loop-invariant values, built once instead of once per type.
train_mols = mols[pick_train]
val_mols = mols[pick_val]
# presumably the first six columns hold ids / type / target and the rest are
# features -- TODO confirm against the CSV header
feature_cols = df_train_dist.columns[6:]
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 4,  # the only setting that differs between the five ensemble cells
}

for b in bond_types:
    # Boolean mask rather than a string-built query: a label containing a
    # double quote can no longer break the expression.
    df_bond = df_train_dist[df_train_dist['type'] == b]

    df_train = df_bond[df_bond['molecule_name'].isin(train_mols)]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train[feature_cols].values

    df_val = df_bond[df_bond['molecule_name'].isin(val_mols)]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val[feature_cols].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=rouds,
                    valid_sets=lgb_eval,
                    verbose_eval=100,
                    early_stopping_rounds=10)

    # Predict with the iteration that early stopping selected.
    y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

    score = type_score(y_val, y_pred)
    scores_4 = np.hstack([scores_4, score])
    models_4.append(gbm)


Training until validation scores don't improve for 10 rounds.
[100]	valid_0's l2: 5.04975
[200]	valid_0's l2: 3.74762
[300]	valid_0's l2: 3.20491
[400]	valid_0's l2: 2.8782
[500]	valid_0's l2: 2.64197
[600]	valid_0's l2: 2.4586
[700]	valid_0's l2: 2.31522
[800]	valid_0's l2: 2.19385
[900]	valid_0's l2: 2.09871
[1000]	valid_0's l2: 2.0187
[1100]	valid_0's l2: 1.94478
[1200]	valid_0's l2: 1.88484
[1300]	valid_0's l2: 1.8281
[1400]	valid_0's l2: 1.77741
[1500]	valid_0's l2: 1.73167
[1600]	valid_0's l2: 1.69022
[1700]	valid_0's l2: 1.65209
[1800]	valid_0's l2: 1.61681
[1900]	valid_0's l2: 1.5866
[2000]	valid_0's l2: 1.55562
[2100]	valid_0's l2: 1.52731
[2200]	valid_0's l2: 1.49981
[2300]	valid_0's l2: 1.47878
[2400]	valid_0's l2: 1.45636
[2500]	valid_0's l2: 1.43519
[2600]	valid_0's l2: 1.41616
[2700]	valid_0's l2: 1.39929
[2800]	valid_0's l2: 1.38369
[2900]	valid_0's l2: 1.36728
[3000]	valid_0's l2: 1.35287
[3100]	valid_0's l2: 1.33742
[3200]	valid_0's l2: 1.32181
[3300]	valid_0's l2: 1.3

[2900]	valid_0's l2: 0.430322
[3000]	valid_0's l2: 0.425034
[3100]	valid_0's l2: 0.420347
[3200]	valid_0's l2: 0.4156
[3300]	valid_0's l2: 0.410753
[3400]	valid_0's l2: 0.405972
[3500]	valid_0's l2: 0.402187
[3600]	valid_0's l2: 0.398337
[3700]	valid_0's l2: 0.394731
[3800]	valid_0's l2: 0.391336
[3900]	valid_0's l2: 0.387973
[4000]	valid_0's l2: 0.385139
[4100]	valid_0's l2: 0.381418
[4200]	valid_0's l2: 0.378625
[4300]	valid_0's l2: 0.375844
[4400]	valid_0's l2: 0.373008
[4500]	valid_0's l2: 0.370317
[4600]	valid_0's l2: 0.368006
[4700]	valid_0's l2: 0.365559
[4800]	valid_0's l2: 0.363346
[4900]	valid_0's l2: 0.361082
[5000]	valid_0's l2: 0.358904
[5100]	valid_0's l2: 0.356617
[5200]	valid_0's l2: 0.354302
[5300]	valid_0's l2: 0.352457
[5400]	valid_0's l2: 0.350379
[5500]	valid_0's l2: 0.348078
[5600]	valid_0's l2: 0.345936
[5700]	valid_0's l2: 0.34421
[5800]	valid_0's l2: 0.342431
[5900]	valid_0's l2: 0.340611
[6000]	valid_0's l2: 0.33888
[6100]	valid_0's l2: 0.337305
[6200]	valid_0

In [18]:
# Validation score of each per-type model from the seed-4 cell, then their unweighted mean.
print('type score:', scores_4)
print('total score:', np.mean(scores_4))

type score: [-0.42912339 -1.7950848  -1.01741002 -1.62456846 -1.14469798 -0.53706147
 -0.33108229 -1.78495746]
total score: -1.0829982344359699


In [19]:
# scores_en = np.zeros([0])

# for i, b in enumerate(bond_types):
#     df_bond = df_train_dist.query('type == "{}"'.format(b))
#     df_train = df_bond[df_bond['molecule_name'].isin(mols[pick_train])]
#     y_train = df_train['scalar_coupling_constant'].values
#     X_train = df_train[df_train.columns[6:]].values
    
#     df_val = df_bond[df_bond['molecule_name'].isin(mols[pick_val])]
#     y_val = df_val['scalar_coupling_constant'].values
#     X_val = df_val[df_val.columns[6:]].values
    
#     gbm_0 = models_0[i]
#     gbm_1 = models_1[i]
#     gbm_2 = models_2[i]
#     gbm_3 = models_3[i]
#     gbm_4 = models_4[i]
    
#     y_pred_train_0 = gbm_0.predict(X_train, num_iteration=gbm_0.best_iteration)
#     y_pred_train_1 = gbm_1.predict(X_train, num_iteration=gbm_1.best_iteration)
#     y_pred_train_2 = gbm_2.predict(X_train, num_iteration=gbm_2.best_iteration)
#     y_pred_train_3 = gbm_3.predict(X_train, num_iteration=gbm_3.best_iteration)
#     y_pred_train_4 = gbm_4.predict(X_train, num_iteration=gbm_4.best_iteration)
#     y_pred_train = (y_pred_train_0 + y_pred_train_1 + y_pred_train_2 + y_pred_train_3 + y_pred_train_4) / 5.0
    
#     y_pred_0 = gbm_0.predict(X_val, num_iteration=gbm_0.best_iteration)
#     y_pred_1 = gbm_1.predict(X_val, num_iteration=gbm_1.best_iteration)
#     y_pred_2 = gbm_2.predict(X_val, num_iteration=gbm_2.best_iteration)
#     y_pred_3 = gbm_3.predict(X_val, num_iteration=gbm_3.best_iteration)
#     y_pred_4 = gbm_4.predict(X_val, num_iteration=gbm_4.best_iteration)
#     y_pred_en = (y_pred_0 + y_pred_1 + y_pred_2 + y_pred_3 + y_pred_4) / 5.0
    
#     score = type_score(y_val, y_pred_en)
#     scores_en = np.hstack([scores_en, score])
#     plt.scatter(y_train, y_pred_train)
#     plt.scatter(y_val, y_pred_en)
#     plt.title('{}'.format(b))
#     plt.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()])
#     plt.show()

In [20]:
# print('type score:', scores_en)
# print('total score:', scores_en.mean())

In [21]:
# Write the five per-seed model lists to one file as consecutive pickle
# records; reading them back takes five pickle.load calls in the same order.
with open(OUTPUT + '20190614_champs_models_lgb_en.pickle', 'wb') as f:
    for seed_models in (models_0, models_1, models_2, models_3, models_4):
        pickle.dump(seed_models, f)