In [16]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import random
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

import pickle

In [17]:
FOLDER = '../../data_kaggle/champs/'
OUTPUT = FOLDER + 'out/'

In [18]:
# df_mulliken_charges = pd.read_csv(FOLDER + 'mulliken_charges.csv')
# df_sample =  pd.read_csv(FOLDER + 'sample_submission.csv')
# df_magnetic_shielding_tensors = pd.read_csv(FOLDER + 'magnetic_shielding_tensors.csv')
# df_train = pd.read_csv(FOLDER + 'train.csv')
# df_test = pd.read_csv(FOLDER + 'test.csv')
# df_dipole_moments = pd.read_csv(FOLDER + 'dipole_moments.csv')
# df_potential_energy = pd.read_csv(FOLDER + 'potential_energy.csv')
# df_structures = pd.read_csv(FOLDER + 'structures.csv')
# df_scalar_coupling_contributions = pd.read_csv(FOLDER + 'scalar_coupling_contributions.csv')

In [19]:
df_train_dist = pd.read_csv(OUTPUT + '20190618_dist_bond_cos3j_train.csv')

In [20]:
def type_score(y_val, y_pred):
    return np.log(sum(np.abs(y_val- y_pred)) / len(y_val))

In [21]:
bond_types = df_train_dist['type'].unique()

In [41]:
df_train_dist['type'].value_counts()

3JHC    146744
2JHC    114763
1JHC     76490
3JHH     61696
2JHH     45380
3JHN     15854
2JHN     11397
1JHN      4964
0            1
Name: type, dtype: int64

In [43]:
df_train_dist[df_train_dist['type'] == '0']

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,dist_H_0_x,dist_H_1_x,dist_H_2_x,dist_H_3_x,...,dist_F_1_y,dist_F_2_y,dist_F_3_y,dist_F_4_y,num_bond_F_0_y,num_bond_F_1_y,num_bond_F_2_y,num_bond_F_3_y,num_bond_F_4_y,cos_3j
477288,477272,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.925589


In [22]:
mols = df_train_dist['molecule_name'].unique()
num = len(mols)
num_train = int(num * 0.8)
pickup = random.sample(range(num), num)
pick_train = pickup[:num_train]
pick_val = pickup[num_train:]

In [45]:
rounds = 200
num_seed = 10

In [46]:
bond_types

array(['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN',
       '0'], dtype=object)

In [47]:
models = []
scores = []
for i, b in enumerate(bond_types):
    print('-'*10, b, '-'*10)

    models_b = []
    y_pred_b =[]
    
    df_bond = df_train_dist.query('type == "{}"'.format(b))
    
    df_train = df_bond[df_bond['molecule_name'].isin(mols[pick_train])]
    y_train = df_train['scalar_coupling_constant'].values
    X_train = df_train[df_train.columns[6:]].values

    df_val = df_bond[df_bond['molecule_name'].isin(mols[pick_val])]
    y_val = df_val['scalar_coupling_constant'].values
    X_val = df_val[df_val.columns[6:]].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
        
    for s in range(num_seed):
        print('seed No.{}, bond type {}'.format(s, b))
        
        params = {
                'task' : 'train',
                'boosting_type' : 'gbdt',
                'objective' : 'regression',
                'metric' : {'l2'},
                'num_leaves' : 31,
                'learning_rate' : 0.1,
                'feature_fraction' : 0.9,
                'bagging_fraction' : 0.8,
                'bagging_freq': 5,
            'seed':s
        }

        gbm = lgb.train(params,
                lgb_train,
                num_boost_round=rounds,
                verbose_eval=0,
                valid_sets=lgb_eval,
                early_stopping_rounds=100)
        
        models_b.append(gbm)
        y_pred_single = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        print('single model score:',type_score(y_val, y_pred_single))

        y_pred_b.append(y_pred_single)
        
    y_pred = np.array(y_pred_b).mean(axis=0)
    score = type_score(y_val, y_pred)
    print('mean-ensemble score:', score)
    models.append(models_b)
    scores.append(score)

---------- 1JHC ----------
seed No.0, bond type 1JHC
single model score: 0.1647386197949984
seed No.1, bond type 1JHC
single model score: 0.17156603348972602
seed No.2, bond type 1JHC
single model score: 0.17091896794019867
seed No.3, bond type 1JHC
single model score: 0.17690596676717824
seed No.4, bond type 1JHC
single model score: 0.1744645591830072
seed No.5, bond type 1JHC
single model score: 0.17006005353088588
seed No.6, bond type 1JHC
single model score: 0.1726674818376928
seed No.7, bond type 1JHC
single model score: 0.16869616016398595
seed No.8, bond type 1JHC
single model score: 0.16982193632567583
seed No.9, bond type 1JHC
single model score: 0.1789198727150758
mean-ensemble score: 0.13813108272496347
---------- 2JHH ----------
seed No.0, bond type 2JHH
single model score: -1.0693298693779725
seed No.1, bond type 2JHH
single model score: -1.0732400206751656
seed No.2, bond type 2JHH
single model score: -1.0829869323115948
seed No.3, bond type 2JHH
single model score: -1.07

LightGBMError: Check failed: num_data > 0 at D:\a\1\s\python-package\compile\src\io\dataset.cpp, line 27 .


In [48]:
print('type score:', scores)
print('total score:', np.array(scores).mean())

type score: [0.13813108272496347, -1.126661414588477, -0.5937012846751502, -1.2231183982089298, -0.545668172082448, -1.1331700162142355, 0.1360328672884935, -1.3630286469193083]
total score: -0.7138979978343865


In [21]:
with open(OUTPUT + '20190622_champs_models_lgb_en.pickle', 'wb') as f:
    pickle.dump(models, f)