In [1]:
import os
import pandas as pd
import numpy as np
import time
import random
import lightgbm as lgb

import pickle

In [2]:
# Base data directory, given relative to the notebook's working directory.
FOLDER = '../../data_kaggle/champs/'
# Sub-directory the training cell reads feature pickles from and writes the
# fitted models to.
OUTPUT = '{}out/'.format(FOLDER)

In [3]:
def type_score(y_val, y_pred):
    """Return the natural log of the mean absolute error.

    Parameters
    ----------
    y_val : array-like
        Ground-truth values. Lists and other sequences are accepted and
        converted to a float ndarray.
    y_pred : array-like
        Predicted values; must broadcast against ``y_val``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``log(sum(|y_val - y_pred|, axis=0) / len(y_val))``. For 1-D input this
        is a scalar; it is ``-inf`` when every prediction is exact.

    Raises
    ------
    ValueError
        If ``y_val`` has no elements, since the mean is undefined.
    """
    y_val = np.asarray(y_val, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if len(y_val) == 0:
        raise ValueError('type_score requires at least one sample')
    # Reduce inside numpy: the builtin sum() would walk the array one element
    # at a time in Python, which is very slow on large validation sets.
    abs_err_total = np.abs(y_val - y_pred).sum(axis=0)
    return np.log(abs_err_total / len(y_val))

In [None]:
# Run configuration consumed by the training cell.
rounds = 10 ** 5  # handed to lgb.train as num_boost_round
num_seed = 5      # the training cell fits one model per seed in range(num_seed)
# One set of models is trained per entry, in this order.
bond_types = [
    '1JHC', '2JHH', '1JHN', '2JHN',
    '2JHC', '3JHH', '3JHC', '3JHN',
]

In [None]:
# Train a seed ensemble of LightGBM regressors for every coupling type, score
# the averaged prediction on held-out molecules, and pickle each type's models.
split_seed = 0        # seed of the molecule-level train/validation split
train_fraction = 0.8  # share of molecules used for training; the rest validate

# LightGBM settings shared by every run; only 'seed' changes inside the loop.
base_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

models = []  # NOTE(review): never appended to in this cell; kept in case later cells expect the name
scores = []  # one mean-ensemble score per entry of bond_types, in order
start0 = time.time()
for b in bond_types:
    print('-'*10, b, '-'*10)

    df_bond = pd.read_pickle(OUTPUT + '20190629_dist_bond_dir_cos3j_train_{}.pickle'.format(b))
    mols = df_bond['molecule_name'].unique()
    num = len(mols)
    num_train = int(num * train_fraction)
    # Split by molecule rather than by row, so that no molecule contributes
    # rows to both sides. A dedicated seeded generator keeps the split
    # identical across kernel restarts and independent of any other use of
    # the global `random` state.
    pickup = random.Random(split_seed).sample(range(num), num)
    pick_train = pickup[:num_train]
    pick_val = pickup[num_train:]

    models_b = []
    y_pred_b = []

    # Features are selected by position; presumably the first six columns hold
    # identifiers and the target -- TODO confirm against the feature pickle.
    feature_cols = df_bond.columns[6:]

    df_train = df_bond[df_bond['molecule_name'].isin(mols[pick_train])]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train[feature_cols].values

    df_val = df_bond[df_bond['molecule_name'].isin(mols[pick_val])]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val[feature_cols].values

    # The Dataset objects are built once per type and reused by every seed.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    for s in range(num_seed):
        print('seed No.{}, bond type {}'.format(s, b))
        start = time.time()

        params = dict(base_params, seed=s)

        # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed
        # from lgb.train() in LightGBM 4.0; switch to the lgb.early_stopping /
        # lgb.log_evaluation callbacks when upgrading.
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=rounds,
                        verbose_eval=0,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=100)

        models_b.append(gbm)
        y_pred_single = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        print(gbm.best_iteration)
        print('single model score:', type_score(y_val, y_pred_single))

        y_pred_b.append(y_pred_single)
        elapsed_time = time.time() - start
        print("elapsed_time:{0}".format(elapsed_time) + "[sec]")

    # Ensemble by averaging the per-seed predictions element-wise.
    y_pred = np.array(y_pred_b).mean(axis=0)
    score = type_score(y_val, y_pred)
    print('mean-ensemble score:', score)

    with open(OUTPUT + '20190629_champs_models_lgb_{}.pickle'.format(b), 'wb') as f:
        pickle.dump(models_b, f)

    scores.append(score)
elapsed_time = time.time() - start0
print("total elapsed_time:{0}".format(elapsed_time/3600) + "[hours]")

---------- 1JHC ----------
seed No.0, bond type 1JHC


In [None]:
# Report the per-type scores collected by the training cell, then their
# unweighted mean.
print('type score:', scores)
print('total score:', np.mean(scores))

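# Output recorded from earlier runs, kept for comparison
# (scores are presumably listed in bond_types order -- confirm).
# type score: [-0.7107571299257489, -2.0767665113807867, -1.2548499603026722, -1.854568916875408, -1.4865422286105732, -2.074127209929504, -0.9917317367984064, -2.101555204837629]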
# total score: -1.568862362332591

# type score: [-0.7395686378862215, -2.0418889965244227, -1.2178209199205257, -1.815672423977162, -1.4683662039626242, -0.705921299898279, -0.6104222463407887, -1.9517610071479379]
# total score: -1.318927716957245