In [1]:
import os
import pandas as pd
import numpy as np
import time
import random
import lightgbm as lgb

import pickle

In [2]:
# Root of the competition data, and the sub-folder that holds the prepared
# per-type CSV files and receives the pickled models.
FOLDER = '../../data_kaggle/champs/'
OUTPUT = '{}out/'.format(FOLDER)

In [3]:
# df_mulliken_charges = pd.read_csv(FOLDER + 'mulliken_charges.csv')
# df_sample =  pd.read_csv(FOLDER + 'sample_submission.csv')
# df_magnetic_shielding_tensors = pd.read_csv(FOLDER + 'magnetic_shielding_tensors.csv')
# df_train = pd.read_csv(FOLDER + 'train.csv')
# df_test = pd.read_csv(FOLDER + 'test.csv')
# df_dipole_moments = pd.read_csv(FOLDER + 'dipole_moments.csv')
# df_potential_energy = pd.read_csv(FOLDER + 'potential_energy.csv')
# df_structures = pd.read_csv(FOLDER + 'structures.csv')
# df_scalar_coupling_contributions = pd.read_csv(FOLDER + 'scalar_coupling_contributions.csv')

In [4]:
def type_score(y_val, y_pred):
    """Return the natural log of the mean absolute error.

    Parameters
    ----------
    y_val : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, aligned positionally with ``y_val``.

    Returns
    -------
    float
        ``log(sum(|y_val - y_pred|) / len(y_val))``.

    Raises
    ------
    ValueError
        If ``y_val`` contains no samples.
    """
    # Coerce to arrays so plain Python lists work as well as ndarrays.
    y_val = np.asarray(y_val, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_val.size == 0:
        raise ValueError('type_score requires at least one sample')
    # Vectorised reduction: the builtin sum() walks the array one element at a
    # time in Python, which is needlessly slow on large validation sets.
    abs_err = np.abs(y_val - y_pred)
    return np.log(abs_err.sum(axis=0) / len(y_val))

In [5]:
# import gc
# import numpy as np
# import pandas as pd
# from multiprocessing import Pool

# df = None

# for tmp in  pd.read_csv(OUTPUT + '20190625_dist_bond_dir_cos3j_train.csv', chunksize=100000):
#     if df is None:
#         df = tmp
#     else:
#         df = df.append(tmp, ignore_index=True)
#     del tmp
#     gc.collect()
    
# bond_types = df['type'].unique()
# for b in bond_types:
#     df.query('type == "{}"'.format(b)).to_csv(OUTPUT + '20190625_dist_bond_dir_cos3j_train_{}.csv'.format(b), index=False)


In [6]:
# Upper bound on boosting iterations handed to LightGBM as num_boost_round.
rounds = 10 ** 5
# Number of differently-seeded models trained per coupling type.
num_seed = 5
# Coupling types; one prepared training CSV is read per entry.
bond_types = '1JHC 2JHH 1JHN 2JHN 2JHC 3JHH 3JHC 3JHN'.split()

In [None]:
# For each coupling type: split molecules into train/validation sets, train
# `num_seed` LightGBM regressors that differ only in their seed, score the
# mean of their validation predictions, and pickle the models for that type.
TRAIN_FRACTION = 0.8        # share of molecules placed in the training set
FIRST_FEATURE_COL = 6       # columns from this position onward are used as features
SPLIT_SEED = 0              # fixes the train/validation molecule split
EARLY_STOPPING_ROUNDS = 100

# Hyper-parameters shared by every model; only 'seed' changes between runs.
base_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

models = []  # defined but not populated here; models are pickled per type below
scores = []
start0 = time.time()
# NOTE(review): only the types from index 3 onward are trained, so `scores`
# covers just those types -- presumably a resumed run; confirm before reuse.
for b in bond_types[3:]:
    print('-'*10, b, '-'*10)

    df_bond = pd.read_csv(OUTPUT + '20190628_dist_bond_dir_cos3j_train_{}.csv'.format(b))
    mols = df_bond['molecule_name'].unique()
    num = len(mols)
    num_train = int(num * TRAIN_FRACTION)
    # A dedicated seeded RNG per type makes the split reproducible and
    # independent of which other types were processed earlier in the run.
    # Splitting by molecule keeps all rows of one molecule on the same side.
    pickup = random.Random(SPLIT_SEED).sample(range(num), num)
    pick_train = pickup[:num_train]
    pick_val = pickup[num_train:]

    models_b = []
    y_pred_b = []

    df_train = df_bond[df_bond['molecule_name'].isin(mols[pick_train])]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train.iloc[:, FIRST_FEATURE_COL:].values

    df_val = df_bond[df_bond['molecule_name'].isin(mols[pick_val])]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val.iloc[:, FIRST_FEATURE_COL:].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    for s in range(num_seed):
        print('seed No.{}, bond type {}'.format(s, b))
        start = time.time()

        params = dict(base_params, seed=s)

        # NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; newer releases expect
        # callbacks instead -- confirm against the installed version.
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=rounds,
                        verbose_eval=0,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=EARLY_STOPPING_ROUNDS)

        models_b.append(gbm)
        # Predict with the iteration chosen by early stopping.
        y_pred_single = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        print(gbm.best_iteration)
        print('single model score:', type_score(y_val, y_pred_single))

        y_pred_b.append(y_pred_single)
        elapsed_time = time.time() - start
        print("elapsed_time:{0}".format(elapsed_time) + "[sec]")

    # Ensemble = element-wise mean of the per-seed validation predictions.
    y_pred = np.array(y_pred_b).mean(axis=0)
    score = type_score(y_val, y_pred)
    print('mean-ensemble score:', score)

    with open(OUTPUT + '20190628_champs_models_lgb_{}.pickle'.format(b), 'wb') as f:
        pickle.dump(models_b, f)

    scores.append(score)
elapsed_time = time.time() - start0
print("total elapsed_time:{0}".format(elapsed_time/3600) + "[hours]")

---------- 2JHN ----------
seed No.0, bond type 2JHN
11156
single model score: -1.6422693536449287
elapsed_time:117.30366539955139[sec]
seed No.1, bond type 2JHN
18827
single model score: -1.653441210165582
elapsed_time:212.20055866241455[sec]
seed No.2, bond type 2JHN
14507
single model score: -1.6464183575525757
elapsed_time:155.2100329399109[sec]
seed No.3, bond type 2JHN
14581
single model score: -1.6753866555627073
elapsed_time:147.53985404968262[sec]
seed No.4, bond type 2JHN
22514
single model score: -1.6503494369905791
elapsed_time:229.83143281936646[sec]
mean-ensemble score: -1.797108905029614
---------- 2JHC ----------
seed No.0, bond type 2JHC
39237
single model score: -1.333874135693143
elapsed_time:2792.0203042030334[sec]
seed No.1, bond type 2JHC
44212
single model score: -1.3376407608862348
elapsed_time:3235.304008245468[sec]
seed No.2, bond type 2JHC
39912
single model score: -1.329049451415722
elapsed_time:2860.482324361801[sec]
seed No.3, bond type 2JHC
42221
single m

In [None]:
# Report the per-type scores gathered by the training loop and their average.
print('type score:', scores)
total_score = np.asarray(scores).mean()
print('total score:', total_score)

# type score: [-0.7107571299257489, -2.0767665113807867, -1.2548499603026722, -1.854568916875408, -1.4865422286105732, -2.074127209929504, -0.9917317367984064, -2.101555204837629]
# total score: -1.568862362332591

# type score: [-0.7395686378862215, -2.0418889965244227, -1.2178209199205257, -1.815672423977162, -1.4683662039626242, -0.705921299898279, -0.6104222463407887, -1.9517610071479379]
# total score: -1.318927716957245

In [None]:
# `gbm` is the variable left over from the training loop, so this displays the
# feature importances of the most recently trained model only.
gbm.feature_importance()