In [1]:
import os
import pandas as pd
import numpy as np
import time
import random
import lightgbm as lgb

import pickle

In [2]:
# Root directory of the competition data, relative to this notebook.
FOLDER = '../../data_kaggle/champs/'
# Sub-directory holding derived artefacts (feature tables are read from here,
# pickled models are written here).
OUTPUT = '{}out/'.format(FOLDER)

In [3]:
# df_mulliken_charges = pd.read_csv(FOLDER + 'mulliken_charges.csv')
# df_sample =  pd.read_csv(FOLDER + 'sample_submission.csv')
# df_magnetic_shielding_tensors = pd.read_csv(FOLDER + 'magnetic_shielding_tensors.csv')
# df_train = pd.read_csv(FOLDER + 'train.csv')
# df_test = pd.read_csv(FOLDER + 'test.csv')
# df_dipole_moments = pd.read_csv(FOLDER + 'dipole_moments.csv')
# df_potential_energy = pd.read_csv(FOLDER + 'potential_energy.csv')
# df_structures = pd.read_csv(FOLDER + 'structures.csv')
# df_scalar_coupling_contributions = pd.read_csv(FOLDER + 'scalar_coupling_contributions.csv')

In [4]:
import gc
import numpy as np
import pandas as pd
from multiprocessing import Pool

# Load the pre-computed training feature table in 100k-row chunks and stitch
# the chunks together exactly once.
#
# The previous version called DataFrame.append() for every chunk, which
# re-copies all rows accumulated so far on each iteration (quadratic work) and
# no longer exists in pandas >= 2.0. Collecting the chunks and concatenating
# them once yields the same frame in linear time.
reader = pd.read_csv(OUTPUT + '20190625_dist_bond_dir_cos3j_train.csv', chunksize=100000)
chunks = list(reader)

# ignore_index=True gives the result one continuous 0..n-1 index.
# `df` stays None when the reader yields no chunks at all, as before.
df = pd.concat(chunks, ignore_index=True) if chunks else None

# Drop the per-chunk frames so only the concatenated table stays alive.
del reader, chunks
gc.collect()

In [5]:
# Bind the loaded table to the name used by the rest of the notebook
# (plain assignment: both names refer to the same object, nothing is copied).
df_train_dist = df

In [6]:
# Sanity check: (rows, columns) of the loaded training table.
df_train_dist.shape

(4658147, 156)

In [7]:
# df_train_dist = pd.read_csv(OUTPUT + '20190625_dist_bond_dir_cos3j_train.csv')

In [8]:
def type_score(y_val, y_pred):
    """Return the natural log of the mean absolute error.

    Parameters
    ----------
    y_val : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, same length as ``y_val``.

    Returns
    -------
    numpy floating scalar
        ``log(mean(|y_val - y_pred|))``. A perfect prediction yields ``-inf``
        (NumPy emits a RuntimeWarning for ``log(0)``).

    Raises
    ------
    ZeroDivisionError
        If ``y_val`` is empty. The mean is undefined for zero samples; the
        exception type matches what the bare division raised previously.
    """
    # Accept plain lists / Series as well as ndarrays.
    y_val = np.asarray(y_val)
    y_pred = np.asarray(y_pred)
    if y_val.size == 0:
        raise ZeroDivisionError('type_score() needs at least one sample')
    # np.mean reduces in C; the builtin sum() would iterate the array element
    # by element in Python, which is very slow on millions of rows.
    return np.log(np.mean(np.abs(y_val - y_pred)))

In [9]:
# Distinct values of the 'type' column; the training loop fits one ensemble
# per entry of this array.
bond_types = df_train_dist['type'].unique()

In [10]:
# Number of rows per coupling type (shows how unbalanced the types are).
df_train_dist['type'].value_counts()

3JHC    1510379
2JHC    1140674
1JHC     709416
3JHH     590611
2JHH     378036
3JHN     166415
2JHN     119253
1JHN      43363
Name: type, dtype: int64

In [11]:
# Split by *molecule*, not by row: 80% of the distinct molecule names go to
# training and the remaining 20% to validation, so all rows of one molecule
# end up on the same side of the split.
#
# The shuffle was previously unseeded, so every kernel restart produced a
# different split and validation scores were not comparable between runs.
SPLIT_SEED = 0

mols = df_train_dist['molecule_name'].unique()
num = len(mols)
num_train = int(num * 0.8)

# A dedicated Random instance makes the permutation reproducible without
# touching the state of the global `random` module.
pickup = random.Random(SPLIT_SEED).sample(range(num), num)
pick_train = pickup[:num_train]  # positions into `mols` used for training
pick_val = pickup[num_train:]    # positions into `mols` held out for validation

In [12]:
# Training-loop settings:
#   rounds   - value passed as num_boost_round to lgb.train (upper bound on
#              boosting iterations; early stopping can end training sooner)
#   num_seed - number of models trained per coupling type, one per seed 0..num_seed-1
rounds = 100000
num_seed = 3

In [13]:
# Display the coupling types in the order the training loop will process them.
bond_types

array(['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN'],
      dtype=object)

In [14]:
# Train an ensemble of LightGBM regressors for every coupling type.
#
# For each type the loop
#   1. filters the feature table to that type,
#   2. splits the rows into train / validation by the held-out molecules,
#   3. fits `num_seed` models that differ only in the LightGBM seed,
#   4. averages their validation predictions and scores the average,
#   5. pickles the list of fitted models for that type.
#
# NOTE(review): `models` is never filled by this loop; the fitted models are
# only kept in `models_b` (overwritten per type) and in the pickle files.
models = []
scores = []  # mean-ensemble validation score per type, in `bond_types` order


def _lgb_params(seed):
    """Return a fresh LightGBM parameter dict; only 'seed' varies between models."""
    # NOTE(review): early stopping monitors 'l2', whereas type_score reports a
    # log mean-absolute-error; confirm this mismatch is intended.
    return {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': seed,
    }


start0 = time.time()

# Loop-invariant inputs: computed once here instead of once per coupling type.
train_mols = mols[pick_train]
val_mols = mols[pick_val]
# Features are selected by position: every column from the 7th onwards.
# NOTE(review): presumably the first six columns hold ids / type / target;
# confirm against the CSV header.
feature_cols = df_train_dist.columns[6:]

for i, b in enumerate(bond_types):
    print('-'*10, b, '-'*10)

    models_b = []  # fitted models for this type (also inspected by later cells)
    y_pred_b = []  # validation predictions, one array per seed

    df_bond = df_train_dist.query('type == "{}"'.format(b))

    # `df_train` keeps its name because a later cell reads its columns.
    df_train = df_bond[df_bond['molecule_name'].isin(train_mols)]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train[feature_cols].values

    df_val = df_bond[df_bond['molecule_name'].isin(val_mols)]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val[feature_cols].values

    # The two Dataset objects are built once and shared by all seeds.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    for s in range(num_seed):
        print('seed No.{}, bond type {}'.format(s, b))
        start = time.time()

        params = _lgb_params(s)

        # NOTE(review): LightGBM >= 4.0 no longer accepts `verbose_eval` /
        # `early_stopping_rounds` here (callbacks replace them); this call
        # targets the older API the notebook was written against.
        gbm = lgb.train(params,
                lgb_train,
                num_boost_round=rounds,
                verbose_eval=0,
                valid_sets=lgb_eval,
                early_stopping_rounds=100)

        models_b.append(gbm)
        # Predict with the iteration early stopping selected, not the last one.
        y_pred_single = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        print(gbm.best_iteration)
        print('single model score:',type_score(y_val, y_pred_single))

        y_pred_b.append(y_pred_single)
        elapsed_time = time.time() - start
        print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

    # Ensemble = unweighted mean of the per-seed predictions.
    y_pred = np.array(y_pred_b).mean(axis=0)
    score = type_score(y_val, y_pred)
    print('mean-ensemble score:', score)

    # NOTE(review): the file name contains literal double quotes around the
    # type (e.g. ..._lgb_"1JHC".pickle). Left unchanged because whatever loads
    # these files must use the same name; confirm before renaming.
    with open(OUTPUT + '20190625_champs_models_lgb_"{}".pickle'.format(b), 'wb') as f:
        pickle.dump(models_b, f)

    scores.append(score)
elapsed_time = time.time() - start0
print ("total elapsed_time:{0}".format(elapsed_time/3600) + "[hours]")

---------- 1JHC ----------
seed No.0, bond type 1JHC
70173
single model score: -0.6165349174333216
elapsed_time:3022.881049156189[sec]
seed No.1, bond type 1JHC


KeyboardInterrupt: 

In [None]:
# Report the per-type validation scores and their unweighted average.
overall_score = np.mean(scores)
print('type score:', scores)
print('total score:', overall_score)

# type score: [-0.7107571299257489, -2.0767665113807867, -1.2548499603026722, -1.854568916875408, -1.4865422286105732, -2.074127209929504, -0.9917317367984064, -2.101555204837629]
# total score: -1.568862362332591


In [None]:
# Feature importances of the first model in `models_b`. The training loop
# rebinds `models_b` for every coupling type, so this refers to the type that
# was processed last.
models_b[0].feature_importance()

In [None]:
# Names of the feature columns (everything from the 7th column onwards), in
# the same order as the columns of the X_train / X_val matrices.
df_train.columns[6:]

In [None]:
# with open(OUTPUT + '20190623_champs_models_lgb_en.pickle', 'wb') as f:
#     pickle.dump(models, f)

In [None]:
# print('type score:', scores)
# print('total score:', np.array(scores).mean())

# type score: [-0.6307462882009246, 
#              -1.9522702266377576, 
#              -1.1847392620329305, 
#              -1.786991824141306, 
#              -1.3634166829730652, 
#              -1.9711113791862445, 
#              -0.8384572680890678, 
#              -2.0103828612544006]
# total score: -1.467264474064462
