In [1]:
import pandas as pd
import numpy as np
import time
import random
import lightgbm as lgb
import time

import pickle
import matplotlib.pyplot as plt

In [2]:
from process import gen_second_data

In [3]:
# Root directory of the competition data, and the sub-directory beneath it
# from which this notebook reads its cached pickle files.
FOLDER = "../../data_kaggle/champs/"
OUTPUT = "{}out/".format(FOLDER)

In [4]:
# File names of the cached pickles. Every name shares the same "20190721"
# prefix. The first_* names contain a "{}" placeholder that must be filled
# in (via str.format) before the name can be used as a path component.
_STAMP = "20190721"
second_train = _STAMP + "_second_train.pickle"
second_val = _STAMP + "_second_val.pickle"
second_test = _STAMP + "_second_test.pickle"
first_train = _STAMP + "_dist_ang_ori_bond_cos_train_{}.pickle"
first_test = _STAMP + "_dist_ang_ori_bond_cos_test_{}.pickle"

In [5]:
# Read the second-stage validation and training frames from OUTPUT
# (validation first, then training, as before).
# NOTE(review): read_pickle unpickles arbitrary objects; only point it at
# files produced by this project.
df_val_temp, df_train_temp = (
    pd.read_pickle(OUTPUT + pickle_name)
    for pickle_name in (second_val, second_train)
)

In [None]:
# Build one feature frame from the per-coupling-type first-stage pickles.
# The file-name template is `first_train`, which carries the "{}" placeholder
# for the coupling type (the previous code referenced an undefined name here).
bond_types = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN']

# Collect the per-type frames and concatenate once: calling pd.concat inside
# the loop re-copies everything accumulated so far on every iteration.
bond_frames = []
for b in bond_types:
    # Missing feature values are replaced by 0 before stacking.
    df_bond = pd.read_pickle(OUTPUT + first_train.format(b)).fillna(0)
    bond_frames.append(df_bond)
df_feat = pd.concat(bond_frames, axis=0)

In [7]:
# Preview the first rows of the stacked feature frame.
df_feat.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,dist_H_0_x,dist_H_1_x,dist_H_2_x,dist_H_3_x,...,orientation_O_2_y,dist_F_0_y,dist_F_1_y,angle_F_0_y,angle_F_1_y,orientation_F_0_y,orientation_F_1_y,cos_3j,cos_3j^2,dist_center
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,0.560815,0.560806,0.560803,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,0.560815,0.560806,0.560803,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,0.560806,0.560806,0.560803,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,0.560806,0.560806,0.560803,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
4,17,dsgdb9nsd_000005,2,0,1JHC,171.22,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0


In [8]:
# Show the first six column labels of the second-stage training frame; the
# merge cells below join on exactly these columns.
df_train_temp.columns[:6]

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant'],
      dtype='object')

In [9]:
# Inner-join the feature frame with the second-stage training frame on the
# identifying columns that both frames share. Any other column present in
# both frames receives pandas' default _x / _y suffix.
df_train = df_feat.merge(
    df_train_temp,
    on=[
        'id',
        'molecule_name',
        'atom_index_0',
        'atom_index_1',
        'type',
        'scalar_coupling_constant',
    ],
)

In [10]:
# Inner-join the feature frame with the second-stage validation frame on the
# identifying columns that both frames share. Any other column present in
# both frames receives pandas' default _x / _y suffix.
df_val = df_feat.merge(
    df_val_temp,
    on=[
        'id',
        'molecule_name',
        'atom_index_0',
        'atom_index_1',
        'type',
        'scalar_coupling_constant',
    ],
)

In [27]:
# Maximum number of boosting rounds handed to lgb.train; the training cell
# also enables early stopping, so this acts as an upper bound.
rounds = 200000

# LightGBM training parameters shared by every per-type model.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'l1'},
    num_leaves=63,
    learning_rate=0.005,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=5,
    seed=0,
)

In [28]:
def type_score(y_val, y_pred):
    """Return the natural log of the mean absolute error.

    Parameters
    ----------
    y_val : array-like
        Ground-truth values (1-D).
    y_pred : array-like
        Predicted values, same length as ``y_val``.

    Returns
    -------
    float
        ``log(mean(|y_val - y_pred|))``. A perfect prediction has a mean
        absolute error of 0, for which the result is ``-inf``.

    Raises
    ------
    ValueError
        If ``y_val`` is empty, since the mean is undefined.
    """
    # Accept plain lists/tuples as well as NumPy arrays.
    y_val = np.asarray(y_val, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_val.size == 0:
        raise ValueError("type_score requires at least one sample")
    # Vectorised mean: the builtin sum() would walk the array element by
    # element in Python, which is very slow on large validation sets.
    return np.log(np.mean(np.abs(y_val - y_pred)))

In [29]:
# Train one LightGBM regressor per coupling type and compare its validation
# score with the score of an existing feature column.
# The loop variable is named `bond` because the body filters on `bond`;
# previously the loop bound `b`, so `bond` was undefined on a fresh kernel.
for bond in bond_types:
    print('-'*10, bond, '-'*10)

    # Select the rows of the current coupling type once per split.
    train_rows = df_train[df_train['type'] == bond]
    val_rows = df_val[df_val['type'] == bond]

    # Columns from position 6 onwards are used as features; the six columns
    # before them are the merge keys (including the target).
    y_train = train_rows['scalar_coupling_constant'].values
    X_train = train_rows[df_train.columns[6:]].values

    y_val = val_rows['scalar_coupling_constant'].values
    X_val = val_rows[df_val.columns[6:]].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    start = time.time()
    # NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; confirm against the installed
    # version (newer releases expect callbacks instead).
    gbm = lgb.train(params,
            lgb_train,
            num_boost_round=rounds,
            verbose_eval=0,
            valid_sets=lgb_eval,
            early_stopping_rounds=30)

    y_val_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    print(gbm.best_iteration)

    # Presumably column -98 of the feature matrix holds the first-stage
    # model's prediction -- TODO confirm this offset against the column list.
    first_model_pred = X_val[:, -98]
    print("score first model: %.2f" %  (type_score(y_val, first_model_pred)))
    print("score second model: %.2f" %  (type_score(y_val, y_val_pred)))
    plt.scatter(y_val, first_model_pred)
    plt.scatter(y_val, y_val_pred)
    plt.show()
    # Elapsed time covers training, prediction and plotting.
    elapsed_time = time.time() - start
    print ("elapsed_time:%.2f" % elapsed_time + "[sec]")


In [36]:
# Feature importances of `gbm` (the model left by the last iteration of the
# training loop), labelled with the feature column names. The labels are
# simply the columns from position 6 onwards, so no per-type row filter (and
# no dependency on a `bond` variable) is needed to obtain them.
df_imp = pd.Series(gbm.feature_importance(), index=df_train.columns[6:])

In [38]:
# Display the per-feature importance values computed in the previous cell.
df_imp

dist_H_0_x            54559
dist_H_1_x            61089
dist_H_2_x            73271
dist_H_3_x            54011
dist_H_4_x            42779
dist_H_5_x            35297
dist_H_6_x            29059
dist_H_7_x            22829
dist_H_8_x            18090
dist_H_9_x            15329
angle_H_0_x           49663
angle_H_1_x           57123
angle_H_2_x           66233
angle_H_3_x           60813
angle_H_4_x           55598
angle_H_5_x           47605
angle_H_6_x           39847
angle_H_7_x           29621
angle_H_8_x           22650
angle_H_9_x           15750
orientation_H_0_x     28887
orientation_H_1_x     58997
orientation_H_2_x     61597
orientation_H_3_x     59795
orientation_H_4_x     55002
orientation_H_5_x     49300
orientation_H_6_x     40888
orientation_H_7_x     31796
orientation_H_8_x     23688
orientation_H_9_x     16617
                      ...  
orientation_F_1_y       722
cos_3j                    0
cos_3j^2                  0
dist_center               0
0                   