In [1]:
import pickle
import pandas as pd
import numpy as np
import time

import xgboost as xgb
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [2]:
# Read the pickled dataset stored in ../data/HCD35_pos.pickle into `df`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../data/HCD35_pos.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [3]:
# Separate the prediction target from the feature columns.
# The 'Subclass' labels are integer-encoded; the fitted encoder stays available as `le`.
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# Everything except the target column is used as model input.
features = df.drop('Subclass', axis='columns')

In [4]:
# train test split
# Hold out 20% of the rows for testing.
# np.random.seed() returns None, so its result must not be used as a seed value;
# the global RNG is still seeded here, and the seed is also handed to
# train_test_split explicitly so the split does not rely on hidden global state.
np.random.seed(42)
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [5]:
# define and fit
# Fit a multiclass LightGBM classifier on GPU device 0 and measure the wall-clock
# time of model construction plus training.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

# NOTE(review): the test split doubles as the early-stopping validation set, so any
# accuracy later measured on X_test/y_test is optimistically biased.
# NOTE(review): LightGBM 4.x dropped the `early_stopping_rounds` fit argument in
# favour of callbacks=[lgb.early_stopping(5)] - confirm against the installed version.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
elapsed_time = time.time() - start

# save model to sav
# A context manager guarantees the file is flushed and closed after the dump.
with open('../model/lgbm_HCD35_pos.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

[1]	valid_0's multi_logloss: 1.28656
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.20656
[3]	valid_0's multi_logloss: 1.14131
[4]	valid_0's multi_logloss: 1.08736
[5]	valid_0's multi_logloss: 1.03841
[6]	valid_0's multi_logloss: 0.996411
[7]	valid_0's multi_logloss: 0.956233
[8]	valid_0's multi_logloss: 0.917615
[9]	valid_0's multi_logloss: 0.883031
[10]	valid_0's multi_logloss: 0.85603
[11]	valid_0's multi_logloss: 0.830104
[12]	valid_0's multi_logloss: 0.807268
[13]	valid_0's multi_logloss: 0.785981
[14]	valid_0's multi_logloss: 0.763894
[15]	valid_0's multi_logloss: 0.747209
[16]	valid_0's multi_logloss: 0.729778
[17]	valid_0's multi_logloss: 0.714786
[18]	valid_0's multi_logloss: 0.698121
[19]	valid_0's multi_logloss: 0.682024
[20]	valid_0's multi_logloss: 0.666685
[21]	valid_0's multi_logloss: 0.653509
[22]	valid_0's multi_logloss: 0.640732
[23]	valid_0's multi_logloss: 0.628841
[24]	valid_0's multi_logloss: 0.616433
[25]	valid_0's mul

In [6]:
# Result containers keyed by dataset name:
#   o -> [test accuracy in percent, rounded to 2 decimals]
#   d -> [training time in seconds]
o, d = {}, {}

accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD35_Positive'] = [accuracy_pct]
d['HCD35_Positive'] = [elapsed_time]

  if diff:


In [7]:
# Read the pickled dataset stored in ../data/HCD45_pos.pickle into `df`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../data/HCD45_pos.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [8]:
# Separate the prediction target from the feature columns.
# The 'Subclass' labels are integer-encoded; the fitted encoder stays available as `le`.
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# Everything except the target column is used as model input.
features = df.drop('Subclass', axis='columns')

In [9]:
# train test split
# Hold out 20% of the rows for testing.
# np.random.seed() returns None, so its result must not be used as a seed value;
# the global RNG is still seeded here, and the seed is also handed to
# train_test_split explicitly so the split does not rely on hidden global state.
np.random.seed(42)
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [10]:
# define and fit
# Fit a multiclass LightGBM classifier on GPU device 0 and measure the wall-clock
# time of model construction plus training.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

# NOTE(review): the test split doubles as the early-stopping validation set, so the
# accuracy measured on X_test/y_test below is optimistically biased.
# NOTE(review): LightGBM 4.x dropped the `early_stopping_rounds` fit argument in
# favour of callbacks=[lgb.early_stopping(5)] - confirm against the installed version.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
elapsed_time = time.time() - start

# save model to sav
# A context manager guarantees the file is flushed and closed after the dump.
with open('../model/lgbm_HCD45_pos.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# Display the mean accuracy on the held-out split as the cell output.
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.29593
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.22366
[3]	valid_0's multi_logloss: 1.16062
[4]	valid_0's multi_logloss: 1.10791
[5]	valid_0's multi_logloss: 1.05615
[6]	valid_0's multi_logloss: 1.0134
[7]	valid_0's multi_logloss: 0.973814
[8]	valid_0's multi_logloss: 0.938707
[9]	valid_0's multi_logloss: 0.909245
[10]	valid_0's multi_logloss: 0.881957
[11]	valid_0's multi_logloss: 0.855009
[12]	valid_0's multi_logloss: 0.82971
[13]	valid_0's multi_logloss: 0.806159
[14]	valid_0's multi_logloss: 0.787479
[15]	valid_0's multi_logloss: 0.765123
[16]	valid_0's multi_logloss: 0.746776
[17]	valid_0's multi_logloss: 0.730379
[18]	valid_0's multi_logloss: 0.715307
[19]	valid_0's multi_logloss: 0.701327
[20]	valid_0's multi_logloss: 0.690784
[21]	valid_0's multi_logloss: 0.678497
[22]	valid_0's multi_logloss: 0.669146
[23]	valid_0's multi_logloss: 0.659797
[24]	valid_0's multi_logloss: 0.650464
[25]	valid_0's multi

  if diff:


0.8125

In [11]:
# Store this dataset's test accuracy (percent, 2 decimals) and training time (seconds)
# in the shared result dicts.
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD45_Positive'] = [accuracy_pct]
d['HCD45_Positive'] = [elapsed_time]

  if diff:


In [12]:
# Read the pickled dataset stored in ../data/HCD65_pos.pickle into `df`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../data/HCD65_pos.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [13]:
# Separate the prediction target from the feature columns.
# The 'Subclass' labels are integer-encoded; the fitted encoder stays available as `le`.
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# Everything except the target column is used as model input.
features = df.drop('Subclass', axis='columns')

In [14]:
# train test split
# Hold out 20% of the rows for testing.
# np.random.seed() returns None, so its result must not be used as a seed value;
# the global RNG is still seeded here, and the seed is also handed to
# train_test_split explicitly so the split does not rely on hidden global state.
np.random.seed(42)
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [15]:
# define and fit
# Fit a multiclass LightGBM classifier on GPU device 0 and measure the wall-clock
# time of model construction plus training.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

# NOTE(review): the test split doubles as the early-stopping validation set, so the
# accuracy measured on X_test/y_test below is optimistically biased.
# NOTE(review): LightGBM 4.x dropped the `early_stopping_rounds` fit argument in
# favour of callbacks=[lgb.early_stopping(5)] - confirm against the installed version.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
elapsed_time = time.time() - start

# save model to sav
# A context manager guarantees the file is flushed and closed after the dump.
with open('../model/lgbm_HCD65_pos.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# Display the mean accuracy on the held-out split as the cell output.
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.29837
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.22455
[3]	valid_0's multi_logloss: 1.16215
[4]	valid_0's multi_logloss: 1.10904
[5]	valid_0's multi_logloss: 1.05967
[6]	valid_0's multi_logloss: 1.01761
[7]	valid_0's multi_logloss: 0.975806
[8]	valid_0's multi_logloss: 0.940529
[9]	valid_0's multi_logloss: 0.910785
[10]	valid_0's multi_logloss: 0.880893
[11]	valid_0's multi_logloss: 0.853897
[12]	valid_0's multi_logloss: 0.829851
[13]	valid_0's multi_logloss: 0.807085
[14]	valid_0's multi_logloss: 0.785691
[15]	valid_0's multi_logloss: 0.767879
[16]	valid_0's multi_logloss: 0.747723
[17]	valid_0's multi_logloss: 0.733658
[18]	valid_0's multi_logloss: 0.718659
[19]	valid_0's multi_logloss: 0.70483
[20]	valid_0's multi_logloss: 0.691515
[21]	valid_0's multi_logloss: 0.677993
[22]	valid_0's multi_logloss: 0.666396
[23]	valid_0's multi_logloss: 0.656875
[24]	valid_0's multi_logloss: 0.648543
[25]	valid_0's mult

  if diff:


0.8035714285714286

In [16]:
# Store this dataset's test accuracy (percent, 2 decimals) and training time (seconds)
# in the shared result dicts.
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD65_Positive'] = [accuracy_pct]
d['HCD65_Positive'] = [elapsed_time]

  if diff:


In [17]:
# Read the pickled dataset stored in ../data/HCD35_neg.pickle into `df`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../data/HCD35_neg.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [18]:
# Separate the prediction target from the feature columns.
# The 'Subclass' labels are integer-encoded; the fitted encoder stays available as `le`.
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# Everything except the target column is used as model input.
features = df.drop('Subclass', axis='columns')

In [19]:
# train test split
# Hold out 20% of the rows for testing.
# np.random.seed() returns None, so its result must not be used as a seed value;
# the global RNG is still seeded here, and the seed is also handed to
# train_test_split explicitly so the split does not rely on hidden global state.
np.random.seed(42)
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [20]:
# define and fit
# Fit a multiclass LightGBM classifier on GPU device 0 and measure the wall-clock
# time of model construction plus training.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

# NOTE(review): the test split doubles as the early-stopping validation set, so the
# accuracy measured on X_test/y_test below is optimistically biased.
# NOTE(review): LightGBM 4.x dropped the `early_stopping_rounds` fit argument in
# favour of callbacks=[lgb.early_stopping(5)] - confirm against the installed version.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
elapsed_time = time.time() - start

# save model to sav
# A context manager guarantees the file is flushed and closed after the dump.
with open('../model/lgbm_HCD35_neg.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# Display the mean accuracy on the held-out split as the cell output.
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.30076
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.22521
[3]	valid_0's multi_logloss: 1.16091
[4]	valid_0's multi_logloss: 1.10669
[5]	valid_0's multi_logloss: 1.06334
[6]	valid_0's multi_logloss: 1.01917
[7]	valid_0's multi_logloss: 0.984907
[8]	valid_0's multi_logloss: 0.955254
[9]	valid_0's multi_logloss: 0.927405
[10]	valid_0's multi_logloss: 0.905561
[11]	valid_0's multi_logloss: 0.881258
[12]	valid_0's multi_logloss: 0.863965
[13]	valid_0's multi_logloss: 0.844037
[14]	valid_0's multi_logloss: 0.827342
[15]	valid_0's multi_logloss: 0.807751
[16]	valid_0's multi_logloss: 0.79317
[17]	valid_0's multi_logloss: 0.777888
[18]	valid_0's multi_logloss: 0.765467
[19]	valid_0's multi_logloss: 0.756628
[20]	valid_0's multi_logloss: 0.743159
[21]	valid_0's multi_logloss: 0.731933
[22]	valid_0's multi_logloss: 0.723997
[23]	valid_0's multi_logloss: 0.715227
[24]	valid_0's multi_logloss: 0.706639
[25]	valid_0's mult

  if diff:


0.7340425531914894

In [21]:
# Store this dataset's test accuracy (percent, 2 decimals) and training time (seconds)
# in the shared result dicts.
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD35_Negative'] = [accuracy_pct]
d['HCD35_Negative'] = [elapsed_time]

  if diff:


In [22]:
# Read the pickled dataset stored in ../data/HCD45_neg.pickle into `df`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../data/HCD45_neg.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [23]:
# Separate the prediction target from the feature columns.
# The 'Subclass' labels are integer-encoded; the fitted encoder stays available as `le`.
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# Everything except the target column is used as model input.
features = df.drop('Subclass', axis='columns')

In [24]:
# train test split
# Hold out 20% of the rows for testing.
# np.random.seed() returns None, so its result must not be used as a seed value;
# the global RNG is still seeded here, and the seed is also handed to
# train_test_split explicitly so the split does not rely on hidden global state.
np.random.seed(42)
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [25]:
# define and fit
# Fit a multiclass LightGBM classifier on GPU device 0 and measure the wall-clock
# time of model construction plus training.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

# NOTE(review): the test split doubles as the early-stopping validation set, so the
# accuracy measured on X_test/y_test below is optimistically biased.
# NOTE(review): LightGBM 4.x dropped the `early_stopping_rounds` fit argument in
# favour of callbacks=[lgb.early_stopping(5)] - confirm against the installed version.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
elapsed_time = time.time() - start

# save model to sav
# A context manager guarantees the file is flushed and closed after the dump.
with open('../model/lgbm_HCD45_neg.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# Display the mean accuracy on the held-out split as the cell output.
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.30802
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.24697
[3]	valid_0's multi_logloss: 1.19617
[4]	valid_0's multi_logloss: 1.152
[5]	valid_0's multi_logloss: 1.1143
[6]	valid_0's multi_logloss: 1.07895
[7]	valid_0's multi_logloss: 1.04932
[8]	valid_0's multi_logloss: 1.02542
[9]	valid_0's multi_logloss: 1.00082
[10]	valid_0's multi_logloss: 0.978054
[11]	valid_0's multi_logloss: 0.952186
[12]	valid_0's multi_logloss: 0.929902
[13]	valid_0's multi_logloss: 0.911323
[14]	valid_0's multi_logloss: 0.8953
[15]	valid_0's multi_logloss: 0.876641
[16]	valid_0's multi_logloss: 0.863211
[17]	valid_0's multi_logloss: 0.847697
[18]	valid_0's multi_logloss: 0.833415
[19]	valid_0's multi_logloss: 0.824735
[20]	valid_0's multi_logloss: 0.815683
[21]	valid_0's multi_logloss: 0.804287
[22]	valid_0's multi_logloss: 0.797426
[23]	valid_0's multi_logloss: 0.792776
[24]	valid_0's multi_logloss: 0.784482
[25]	valid_0's multi_loglo

  if diff:


0.7021276595744681

In [26]:
# Store this dataset's test accuracy (percent, 2 decimals) and training time (seconds)
# in the shared result dicts.
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD45_Negative'] = [accuracy_pct]
d['HCD45_Negative'] = [elapsed_time]

  if diff:


In [27]:
# Read the pickled dataset stored in ../data/HCD65_neg.pickle into `df`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../data/HCD65_neg.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [28]:
# Separate the prediction target from the feature columns.
# The 'Subclass' labels are integer-encoded; the fitted encoder stays available as `le`.
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df['Subclass'])

# Everything except the target column is used as model input.
features = df.drop('Subclass', axis='columns')

In [29]:
# train test split
# Hold out 20% of the rows for testing.
# np.random.seed() returns None, so its result must not be used as a seed value;
# the global RNG is still seeded here, and the seed is also handed to
# train_test_split explicitly so the split does not rely on hidden global state.
np.random.seed(42)
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state
)

In [30]:
# define and fit
# Fit a multiclass LightGBM classifier on GPU device 0 and measure the wall-clock
# time of model construction plus training.
start = time.time()
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu',
    gpu_device_id=0,
    n_jobs=-1
)

# NOTE(review): the test split doubles as the early-stopping validation set, so the
# accuracy measured on X_test/y_test below is optimistically biased.
# NOTE(review): LightGBM 4.x dropped the `early_stopping_rounds` fit argument in
# favour of callbacks=[lgb.early_stopping(5)] - confirm against the installed version.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
elapsed_time = time.time() - start

# save model to sav
# A context manager guarantees the file is flushed and closed after the dump.
with open('../model/lgbm_HCD65_neg.sav', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# Display the mean accuracy on the held-out split as the cell output.
gbm.score(X_test, y_test)

[1]	valid_0's multi_logloss: 1.30826
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.24177
[3]	valid_0's multi_logloss: 1.18825
[4]	valid_0's multi_logloss: 1.14339
[5]	valid_0's multi_logloss: 1.10488
[6]	valid_0's multi_logloss: 1.06876
[7]	valid_0's multi_logloss: 1.04488
[8]	valid_0's multi_logloss: 1.02053
[9]	valid_0's multi_logloss: 0.994713
[10]	valid_0's multi_logloss: 0.975622
[11]	valid_0's multi_logloss: 0.959097
[12]	valid_0's multi_logloss: 0.946242
[13]	valid_0's multi_logloss: 0.93064
[14]	valid_0's multi_logloss: 0.915976
[15]	valid_0's multi_logloss: 0.908088
[16]	valid_0's multi_logloss: 0.895834
[17]	valid_0's multi_logloss: 0.883615
[18]	valid_0's multi_logloss: 0.878021
[19]	valid_0's multi_logloss: 0.868379
[20]	valid_0's multi_logloss: 0.863698
[21]	valid_0's multi_logloss: 0.854834
[22]	valid_0's multi_logloss: 0.848577
[23]	valid_0's multi_logloss: 0.842967
[24]	valid_0's multi_logloss: 0.835559
[25]	valid_0's multi_

  if diff:


0.6808510638297872

In [31]:
# Store this dataset's test accuracy (percent, 2 decimals) and training time (seconds)
# in the shared result dicts.
accuracy_pct = round(gbm.score(X_test, y_test) * 100, 2)
o['HCD65_Negative'] = [accuracy_pct]
d['HCD65_Negative'] = [elapsed_time]

  if diff:


In [32]:
# Turn the two result dicts into one table: one row per dataset key,
# with the accuracy value next to the training time.
accuracy_column = pd.DataFrame(o).T
time_column = pd.DataFrame(d).T
g = pd.concat([accuracy_column, time_column], axis=1)
g.columns = ['Accuracy', 'Time']

In [33]:
# Write the summary table (index included) to the results directory as CSV.
result_csv_path = '../result/LightGBM.csv'
g.to_csv(result_csv_path)