In [1]:
import pickle
import pandas as pd
import numpy as np
import time

import xgboost as xgb
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [2]:
# load the pickled dataset; the following cells use it as a DataFrame
with open('../data/HCD35_pos.pickle', 'rb') as fp:
    df = pickle.loads(fp.read())

In [3]:
# separate the target column from the features
# 'Subclass' labels are mapped to integer codes by the LabelEncoder
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# everything except the label column is used as model input
features = df.drop('Subclass', axis='columns')

In [4]:
# train test split (80 % train / 20 % test)
# np.random.seed() returns None, so the seed is kept as a plain integer and
# handed to train_test_split explicitly; the split then no longer depends on
# whatever has consumed the global NumPy RNG beforehand.
random_state = 42
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [5]:
# define and fit
# NOTE(review): the test split is also the early-stopping validation set, so
# accuracy later computed on (X_test, y_test) is optimistically biased.
# NOTE(review): newer LightGBM releases no longer accept `early_stopping_rounds`
# in fit(); they expect callbacks=[lgb.early_stopping(5)] -- confirm the
# installed version before upgrading.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
# elapsed wall-clock time covers model construction and fitting only
elapsed_time = time.time() - start

# save model to sav; the context manager guarantees the file is flushed and
# closed even if pickling raises
with open('../model/lgbm_HCD35_pos.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

[1]	valid_0's multi_logloss: 1.28656
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.20656
[3]	valid_0's multi_logloss: 1.14126
[4]	valid_0's multi_logloss: 1.08732
[5]	valid_0's multi_logloss: 1.03837
[6]	valid_0's multi_logloss: 0.996364
[7]	valid_0's multi_logloss: 0.956187
[8]	valid_0's multi_logloss: 0.91757
[9]	valid_0's multi_logloss: 0.882987
[10]	valid_0's multi_logloss: 0.855899
[11]	valid_0's multi_logloss: 0.829978
[12]	valid_0's multi_logloss: 0.807145
[13]	valid_0's multi_logloss: 0.785858
[14]	valid_0's multi_logloss: 0.763774
[15]	valid_0's multi_logloss: 0.747096
[16]	valid_0's multi_logloss: 0.729668
[17]	valid_0's multi_logloss: 0.71468
[18]	valid_0's multi_logloss: 0.698034
[19]	valid_0's multi_logloss: 0.681936
[20]	valid_0's multi_logloss: 0.666601
[21]	valid_0's multi_logloss: 0.653423
[22]	valid_0's multi_logloss: 0.640647
[23]	valid_0's multi_logloss: 0.628899
[24]	valid_0's multi_logloss: 0.616486
[25]	valid_0's mult

In [6]:
# start the result tables, keyed by dataset name:
#   o -> test accuracy in percent (rounded to 2 decimals)
#   d -> elapsed time measured around the fit
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o = {'HCD35_Positive': [accuracy_pct]}
d = {'HCD35_Positive': [elapsed_time]}

  if diff:


In [7]:
# load the pickled dataset; the following cells use it as a DataFrame
with open('../data/HCD45_pos.pickle', 'rb') as fp:
    df = pickle.loads(fp.read())

In [8]:
# separate the target column from the features
# 'Subclass' labels are mapped to integer codes by the LabelEncoder
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# everything except the label column is used as model input
features = df.drop('Subclass', axis='columns')

In [9]:
# train test split (80 % train / 20 % test)
# np.random.seed() returns None, so the seed is kept as a plain integer and
# handed to train_test_split explicitly; the split then no longer depends on
# whatever has consumed the global NumPy RNG beforehand.
random_state = 42
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [10]:
# define and fit
# NOTE(review): the test split is also the early-stopping validation set, so
# the accuracy computed on (X_test, y_test) is optimistically biased.
# NOTE(review): newer LightGBM releases no longer accept `early_stopping_rounds`
# in fit(); they expect callbacks=[lgb.early_stopping(5)] -- confirm the
# installed version before upgrading.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
# elapsed wall-clock time covers model construction and fitting only
elapsed_time = time.time() - start

# save model to sav; the context manager guarantees the file is flushed and
# closed even if pickling raises
with open('../model/lgbm_HCD45_pos.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# display mean accuracy on the held-out split
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.29593
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.22366
[3]	valid_0's multi_logloss: 1.16062
[4]	valid_0's multi_logloss: 1.10791
[5]	valid_0's multi_logloss: 1.05615
[6]	valid_0's multi_logloss: 1.01349
[7]	valid_0's multi_logloss: 0.974257
[8]	valid_0's multi_logloss: 0.939148
[9]	valid_0's multi_logloss: 0.909792
[10]	valid_0's multi_logloss: 0.882501
[11]	valid_0's multi_logloss: 0.855557
[12]	valid_0's multi_logloss: 0.830246
[13]	valid_0's multi_logloss: 0.806693
[14]	valid_0's multi_logloss: 0.788011
[15]	valid_0's multi_logloss: 0.76566
[16]	valid_0's multi_logloss: 0.747321
[17]	valid_0's multi_logloss: 0.730759
[18]	valid_0's multi_logloss: 0.715679
[19]	valid_0's multi_logloss: 0.701715
[20]	valid_0's multi_logloss: 0.691165
[21]	valid_0's multi_logloss: 0.678869
[22]	valid_0's multi_logloss: 0.669394
[23]	valid_0's multi_logloss: 0.660105
[24]	valid_0's multi_logloss: 0.650777
[25]	valid_0's mult

  if diff:


0.8125

In [11]:
# record test accuracy (percent, 2 decimals) and elapsed time for this dataset
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD45_Positive'] = [accuracy_pct]
d['HCD45_Positive'] = [elapsed_time]

  if diff:


In [12]:
# load the pickled dataset; the following cells use it as a DataFrame
with open('../data/HCD65_pos.pickle', 'rb') as fp:
    df = pickle.loads(fp.read())

In [13]:
# separate the target column from the features
# 'Subclass' labels are mapped to integer codes by the LabelEncoder
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# everything except the label column is used as model input
features = df.drop('Subclass', axis='columns')

In [14]:
# train test split (80 % train / 20 % test)
# np.random.seed() returns None, so the seed is kept as a plain integer and
# handed to train_test_split explicitly; the split then no longer depends on
# whatever has consumed the global NumPy RNG beforehand.
random_state = 42
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [15]:
# define and fit
# NOTE(review): the test split is also the early-stopping validation set, so
# the accuracy computed on (X_test, y_test) is optimistically biased.
# NOTE(review): newer LightGBM releases no longer accept `early_stopping_rounds`
# in fit(); they expect callbacks=[lgb.early_stopping(5)] -- confirm the
# installed version before upgrading.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
# elapsed wall-clock time covers model construction and fitting only
elapsed_time = time.time() - start

# save model to sav; the context manager guarantees the file is flushed and
# closed even if pickling raises
with open('../model/lgbm_HCD65_pos.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# display mean accuracy on the held-out split
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.29837
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.22412
[3]	valid_0's multi_logloss: 1.16172
[4]	valid_0's multi_logloss: 1.10862
[5]	valid_0's multi_logloss: 1.05928
[6]	valid_0's multi_logloss: 1.01723
[7]	valid_0's multi_logloss: 0.975638
[8]	valid_0's multi_logloss: 0.940134
[9]	valid_0's multi_logloss: 0.910396
[10]	valid_0's multi_logloss: 0.880453
[11]	valid_0's multi_logloss: 0.85347
[12]	valid_0's multi_logloss: 0.829439
[13]	valid_0's multi_logloss: 0.806678
[14]	valid_0's multi_logloss: 0.785287
[15]	valid_0's multi_logloss: 0.767483
[16]	valid_0's multi_logloss: 0.747339
[17]	valid_0's multi_logloss: 0.733292
[18]	valid_0's multi_logloss: 0.718219
[19]	valid_0's multi_logloss: 0.704397
[20]	valid_0's multi_logloss: 0.69108
[21]	valid_0's multi_logloss: 0.677567
[22]	valid_0's multi_logloss: 0.665974
[23]	valid_0's multi_logloss: 0.656456
[24]	valid_0's multi_logloss: 0.648125
[25]	valid_0's multi

  if diff:


0.8035714285714286

In [16]:
# record test accuracy (percent, 2 decimals) and elapsed time for this dataset
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD65_Positive'] = [accuracy_pct]
d['HCD65_Positive'] = [elapsed_time]

  if diff:


In [17]:
# load the pickled dataset; the following cells use it as a DataFrame
with open('../data/HCD35_neg.pickle', 'rb') as fp:
    df = pickle.loads(fp.read())

In [18]:
# separate the target column from the features
# 'Subclass' labels are mapped to integer codes by the LabelEncoder
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# everything except the label column is used as model input
features = df.drop('Subclass', axis='columns')

In [19]:
# train test split (80 % train / 20 % test)
# np.random.seed() returns None, so the seed is kept as a plain integer and
# handed to train_test_split explicitly; the split then no longer depends on
# whatever has consumed the global NumPy RNG beforehand.
random_state = 42
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [20]:
# define and fit
# NOTE(review): the test split is also the early-stopping validation set, so
# the accuracy computed on (X_test, y_test) is optimistically biased.
# NOTE(review): newer LightGBM releases no longer accept `early_stopping_rounds`
# in fit(); they expect callbacks=[lgb.early_stopping(5)] -- confirm the
# installed version before upgrading.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
# elapsed wall-clock time covers model construction and fitting only
elapsed_time = time.time() - start

# save model to sav; the context manager guarantees the file is flushed and
# closed even if pickling raises
with open('../model/lgbm_HCD35_neg.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# display mean accuracy on the held-out split
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.30076
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.22521
[3]	valid_0's multi_logloss: 1.16091
[4]	valid_0's multi_logloss: 1.10669
[5]	valid_0's multi_logloss: 1.06334
[6]	valid_0's multi_logloss: 1.01917
[7]	valid_0's multi_logloss: 0.984905
[8]	valid_0's multi_logloss: 0.955252
[9]	valid_0's multi_logloss: 0.927404
[10]	valid_0's multi_logloss: 0.90556
[11]	valid_0's multi_logloss: 0.881257
[12]	valid_0's multi_logloss: 0.863964
[13]	valid_0's multi_logloss: 0.844036
[14]	valid_0's multi_logloss: 0.827341
[15]	valid_0's multi_logloss: 0.80775
[16]	valid_0's multi_logloss: 0.793169
[17]	valid_0's multi_logloss: 0.777887
[18]	valid_0's multi_logloss: 0.765466
[19]	valid_0's multi_logloss: 0.756627
[20]	valid_0's multi_logloss: 0.743157
[21]	valid_0's multi_logloss: 0.731932
[22]	valid_0's multi_logloss: 0.723996
[23]	valid_0's multi_logloss: 0.715226
[24]	valid_0's multi_logloss: 0.706638
[25]	valid_0's multi

  if diff:


0.7340425531914894

In [21]:
# record test accuracy (percent, 2 decimals) and elapsed time for this dataset
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD35_Negative'] = [accuracy_pct]
d['HCD35_Negative'] = [elapsed_time]

  if diff:


In [22]:
# load the pickled dataset; the following cells use it as a DataFrame
with open('../data/HCD45_neg.pickle', 'rb') as fp:
    df = pickle.loads(fp.read())

In [23]:
# separate the target column from the features
# 'Subclass' labels are mapped to integer codes by the LabelEncoder
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# everything except the label column is used as model input
features = df.drop('Subclass', axis='columns')

In [24]:
# train test split (80 % train / 20 % test)
# np.random.seed() returns None, so the seed is kept as a plain integer and
# handed to train_test_split explicitly; the split then no longer depends on
# whatever has consumed the global NumPy RNG beforehand.
random_state = 42
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [25]:
# define and fit
# NOTE(review): the test split is also the early-stopping validation set, so
# the accuracy computed on (X_test, y_test) is optimistically biased.
# NOTE(review): newer LightGBM releases no longer accept `early_stopping_rounds`
# in fit(); they expect callbacks=[lgb.early_stopping(5)] -- confirm the
# installed version before upgrading.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
# elapsed wall-clock time covers model construction and fitting only
elapsed_time = time.time() - start

# save model to sav; the context manager guarantees the file is flushed and
# closed even if pickling raises
with open('../model/lgbm_HCD45_neg.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# display mean accuracy on the held-out split
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.30802
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.24697
[3]	valid_0's multi_logloss: 1.19617
[4]	valid_0's multi_logloss: 1.152
[5]	valid_0's multi_logloss: 1.1143
[6]	valid_0's multi_logloss: 1.07895
[7]	valid_0's multi_logloss: 1.04932
[8]	valid_0's multi_logloss: 1.02542
[9]	valid_0's multi_logloss: 1.00082
[10]	valid_0's multi_logloss: 0.978054
[11]	valid_0's multi_logloss: 0.952186
[12]	valid_0's multi_logloss: 0.929902
[13]	valid_0's multi_logloss: 0.911323
[14]	valid_0's multi_logloss: 0.8953
[15]	valid_0's multi_logloss: 0.876641
[16]	valid_0's multi_logloss: 0.863211
[17]	valid_0's multi_logloss: 0.847697
[18]	valid_0's multi_logloss: 0.833415
[19]	valid_0's multi_logloss: 0.824735
[20]	valid_0's multi_logloss: 0.815683
[21]	valid_0's multi_logloss: 0.804287
[22]	valid_0's multi_logloss: 0.797441
[23]	valid_0's multi_logloss: 0.792791
[24]	valid_0's multi_logloss: 0.784498
[25]	valid_0's multi_loglo

  if diff:


0.7021276595744681

In [26]:
# record test accuracy (percent, 2 decimals) and elapsed time for this dataset
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD45_Negative'] = [accuracy_pct]
d['HCD45_Negative'] = [elapsed_time]

  if diff:


In [27]:
# load the pickled dataset; the following cells use it as a DataFrame
with open('../data/HCD65_neg.pickle', 'rb') as fp:
    df = pickle.loads(fp.read())

In [28]:
# separate the target column from the features
# 'Subclass' labels are mapped to integer codes by the LabelEncoder
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# everything except the label column is used as model input
features = df.drop('Subclass', axis='columns')

In [29]:
# train test split (80 % train / 20 % test)
# np.random.seed() returns None, so the seed is kept as a plain integer and
# handed to train_test_split explicitly; the split then no longer depends on
# whatever has consumed the global NumPy RNG beforehand.
random_state = 42
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [30]:
# define and fit
# NOTE(review): the test split is also the early-stopping validation set, so
# the accuracy computed on (X_test, y_test) is optimistically biased.
# NOTE(review): newer LightGBM releases no longer accept `early_stopping_rounds`
# in fit(); they expect callbacks=[lgb.early_stopping(5)] -- confirm the
# installed version before upgrading.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
# elapsed wall-clock time covers model construction and fitting only
elapsed_time = time.time() - start

# save model to sav; the context manager guarantees the file is flushed and
# closed even if pickling raises
with open('../model/lgbm_HCD65_neg.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# display mean accuracy on the held-out split
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.30826
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.24177
[3]	valid_0's multi_logloss: 1.18825
[4]	valid_0's multi_logloss: 1.14339
[5]	valid_0's multi_logloss: 1.10488
[6]	valid_0's multi_logloss: 1.06876
[7]	valid_0's multi_logloss: 1.04488
[8]	valid_0's multi_logloss: 1.02053
[9]	valid_0's multi_logloss: 0.994713
[10]	valid_0's multi_logloss: 0.975622
[11]	valid_0's multi_logloss: 0.959097
[12]	valid_0's multi_logloss: 0.946242
[13]	valid_0's multi_logloss: 0.93064
[14]	valid_0's multi_logloss: 0.915976
[15]	valid_0's multi_logloss: 0.908088
[16]	valid_0's multi_logloss: 0.895834
[17]	valid_0's multi_logloss: 0.883615
[18]	valid_0's multi_logloss: 0.878021
[19]	valid_0's multi_logloss: 0.868379
[20]	valid_0's multi_logloss: 0.863698
[21]	valid_0's multi_logloss: 0.854834
[22]	valid_0's multi_logloss: 0.848577
[23]	valid_0's multi_logloss: 0.842967
[24]	valid_0's multi_logloss: 0.835559
[25]	valid_0's multi_

  if diff:


0.6808510638297872

In [31]:
# record test accuracy (percent, 2 decimals) and elapsed time for this dataset
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD65_Negative'] = [accuracy_pct]
d['HCD65_Negative'] = [elapsed_time]

  if diff:


In [32]:
# turn the two result dicts into one table: one row per dataset key,
# first column from o (accuracy), second column from d (elapsed time)
accuracy_frame = pd.DataFrame(o).T
time_frame = pd.DataFrame(d).T
g = pd.concat([accuracy_frame, time_frame], axis=1)
g.columns = ['Accuracy', 'Time']

In [33]:
# write the summary table (index included) to the results directory
result_csv = '../result/LightGBM.csv'
g.to_csv(result_csv)