In [1]:
import pandas as pd
import numpy as np
import time
import random
import lightgbm as lgb
import time
import gc
import pickle
import matplotlib.pyplot as plt

In [2]:
from process import type_score

In [3]:
# Root folder of the competition data (relative to this notebook).
FOLDER = '../../data_kaggle/champs/'
# Sub-folder that every input/output path below is built from.
OUTPUT = ''.join([FOLDER, 'out/'])

In [4]:
# Identifiers that get embedded in the file names built in the cells below.
VER = '02'               # version tag shared by feature, model and output files
num_rep = 3              # repetition index: selects the model pickle and tags the outputs
DATA_DATE = '20190721'   # date prefix of the molecule-name split file
MAP_DATE = '20190728'    # date prefix of the feature pickles and trained-model pickles
MODEL_DATE = '20190804'  # date prefix of the prediction / submission files written here

In [5]:
# input files
# Each path is OUTPUT + <date> [+ '_' + VER] + '_' + <file name>; the '{}' slots are
# left unfilled here and completed later with str.format (coupling type, and the
# repetition index for the model files).
mols_split = '_'.join([OUTPUT + DATA_DATE, 'molecule_name_split.pickle'])
features_train = '_'.join([OUTPUT + MAP_DATE, VER, 'features_train_{}.pickle'])
features_test = '_'.join([OUTPUT + MAP_DATE, VER, 'features_test_{}.pickle'])
trained_models = '_'.join([OUTPUT + MAP_DATE, VER, 'champs_models_lgb_{}_{}.pickle'])

In [6]:
# output files
# Final artifacts: dated, versioned, and tagged with the repetition index.
pred_train = '_'.join([OUTPUT + MODEL_DATE, VER, 'pred_train_{}.pickle'.format(num_rep)])
submission = '_'.join([OUTPUT + MODEL_DATE, VER, 'submission_{}.csv'.format(num_rep)])

# Per-coupling-type staging files; the '{}' slot is filled with the coupling type later.
pred_train_temp = OUTPUT + 'pred_train_{}.pickle'
submission_temp = OUTPUT + 'submission_{}.pickle'

In [7]:
# Coupling types, in the order they are processed (and in which the training
# predictions are stacked) by the loops below.
bond_types = [
    '3JHN', '3JHC', '1JHC', '2JHH',
    '1JHN', '2JHN', '2JHC', '3JHH',
]

In [8]:
# Predict on the training features, one coupling type at a time. Each per-type
# result is written to a staging pickle; the staged files are then stacked into
# a single frame and saved to `pred_train`.

# Number of leading columns in the training feature frame that are not model inputs.
# presumably id / metadata / target columns -- TODO confirm against the feature builder.
N_NON_FEATURE_COLS_TRAIN = 6

for b in bond_types:
    print('-'*10, b, '-'*10)
    start = time.time()
    df = pd.read_pickle(features_train.format(b))

    # NOTE(review): pickle.load can execute arbitrary code -- only load model
    # files that were produced by this project.
    with open(trained_models.format(b, num_rep), 'rb') as f:
        gbm = pickle.load(f)

    X = df[df.columns[N_NON_FEATURE_COLS_TRAIN:]].values
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)

    # Build the frame column-wise so ids stay integers and predictions stay
    # floats (a list-of-rows + transpose forces both into one common dtype).
    df_pred = pd.DataFrame(
        {
            'id': df['id'].values.astype('int32'),
            'scalar_coupling_constant': y_pred,
        },
        columns=['id', 'scalar_coupling_constant'],
    )
    df_pred.to_pickle(pred_train_temp.format(b))

    elapsed_time = time.time() - start
    print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")
    gc.collect()

# Stack the staged per-type predictions with a single concat (rows stay in
# bond_types order) instead of growing a frame inside a loop.
df_pred_train = pd.concat(
    [pd.read_pickle(pred_train_temp.format(b)) for b in bond_types],
    axis=0,
    sort=False,
    ignore_index=True,
)
df_pred_train['id'] = df_pred_train['id'].astype('int32')

df_pred_train.to_pickle(pred_train)

---------- 3JHN ----------
elapsed_time:6.371759414672852[sec]
---------- 3JHC ----------
elapsed_time:224.6197633743286[sec]
---------- 1JHC ----------
elapsed_time:54.98912787437439[sec]
---------- 2JHH ----------
elapsed_time:17.08229374885559[sec]
---------- 1JHN ----------
elapsed_time:1.3498890399932861[sec]
---------- 2JHN ----------
elapsed_time:5.257400751113892[sec]
---------- 2JHC ----------
elapsed_time:211.59247612953186[sec]
---------- 3JHH ----------
elapsed_time:50.161311626434326[sec]


In [9]:
# Show the first five rows of the stacked training predictions.
df_pred_train.head(n=5)

Unnamed: 0,id,scalar_coupling_constant
0,73,0.774622
1,78,0.750297
2,82,0.774249
3,213,-0.272996
4,218,1.448226


In [10]:
# Show the last five rows of the stacked training predictions.
df_pred_train.tail(n=5)

Unnamed: 0,id,scalar_coupling_constant
4658142,4658101,0.058521
4658143,4658111,0.004197
4658144,4658138,0.324748
4658145,4658109,0.049869
4658146,4658110,0.176755


In [11]:
# The training predictions are already saved to disk; drop the in-memory frame
# and run a garbage-collection pass before the test-set predictions start.
del df_pred_train
# Last expression of the cell, so the number returned by gc.collect() is displayed.
gc.collect()

7

In [12]:
# Predict on the test features, one coupling type at a time. Each per-type
# result is written to a staging pickle; the staged files are then stacked,
# sorted by id and written out as the submission CSV.

# Number of leading columns in the test feature frame that are not model inputs.
# presumably id / metadata columns (one fewer than train, likely no target) -- TODO confirm.
N_NON_FEATURE_COLS_TEST = 5

for b in bond_types:
    print('-'*10, b, '-'*10)
    start = time.time()
    df_bond = pd.read_pickle(features_test.format(b))

    # NOTE(review): pickle.load can execute arbitrary code -- only load model
    # files that were produced by this project.
    with open(trained_models.format(b, num_rep), 'rb') as f:
        gbm = pickle.load(f)

    X_test = df_bond[df_bond.columns[N_NON_FEATURE_COLS_TEST:]].values
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    # Build the frame column-wise so ids stay integers and predictions stay
    # floats (a list-of-rows + transpose forces both into one common dtype).
    df_pred = pd.DataFrame(
        {
            'id': df_bond['id'].values.astype('int32'),
            'scalar_coupling_constant': y_pred,
        },
        columns=['id', 'scalar_coupling_constant'],
    )
    df_pred.to_pickle(submission_temp.format(b))

    elapsed_time = time.time() - start
    print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")
    gc.collect()

# Stack the staged per-type predictions with a single concat instead of
# growing a frame inside a loop.
df_submit = pd.concat(
    [pd.read_pickle(submission_temp.format(b)) for b in bond_types],
    axis=0,
    sort=False,
)
df_submit['id'] = df_submit['id'].astype('int32')
# Rows arrive grouped by coupling type; the submission is ordered by id.
df_submit = df_submit.sort_values('id').reset_index(drop=True)

df_submit.to_csv(submission, index=False)

---------- 3JHN ----------
elapsed_time:3.22182297706604[sec]
---------- 3JHC ----------
elapsed_time:123.84311962127686[sec]
---------- 1JHC ----------
elapsed_time:29.129895448684692[sec]
---------- 2JHH ----------
elapsed_time:8.798673868179321[sec]
---------- 1JHN ----------
elapsed_time:0.72859787940979[sec]
---------- 2JHN ----------
elapsed_time:2.660564661026001[sec]
---------- 2JHC ----------
elapsed_time:114.49173641204834[sec]
---------- 3JHH ----------
elapsed_time:28.194542169570923[sec]


In [13]:
# Show the first five rows of the id-sorted submission frame.
df_submit.head(n=5)

Unnamed: 0,id,scalar_coupling_constant
0,4658147,17.796563
1,4658148,188.519345
2,4658149,13.106957
3,4658150,188.605161
4,4658151,17.796563


In [14]:
# Show the last five rows of the id-sorted submission frame.
df_submit.tail(n=5)

Unnamed: 0,id,scalar_coupling_constant
2505537,7163684,1.054492
2505538,7163685,4.711128
2505539,7163686,5.22985
2505540,7163687,4.03862
2505541,7163688,118.460035
