In [1]:
import numpy as np
import pickle
import pandas as pd

import xgboost as xgb
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# read data: the pickled HCD35 table is joined column-wise with the
# decomposition CSV (the 'pos' files)
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open('../data/HCD35_pos.pickle', mode='rb') as pickle_file:
    df_3 = pickle.load(pickle_file)

# The CSV's own 'Subclass' column is dropped before the join, so the combined
# frame carries at most one such column (presumably the one from df_3 -- confirm).
df = pd.concat(
    [df_3, pd.read_csv('../data/decomp_pos.csv').drop('Subclass', axis=1)],
    axis=1,
)

In [3]:
# Separate the target column from the feature columns.
# LabelEncoder maps the Subclass labels to integer codes; `le` is kept so the
# codes can be mapped back to the original labels with `le.inverse_transform`.
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df.Subclass)
features = df.drop('Subclass', axis=1)

# train test split
# `np.random.seed` returns None, so `random_state = np.random.seed(42)` never
# held a usable seed and nothing was passed to `train_test_split`. Keep the
# global seeding (unseeded estimators in later cells draw from it) and pass the
# seed explicitly so the split no longer depends on cell execution order.
random_state = 42
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state,
)

In [4]:
# Random forest with scikit-learn's default hyper-parameters, trained on the
# training split (`fit` returns the fitted estimator itself).
clf = rf().fit(X_train, y_train)

# pickle.dump(clf, open('../model/rf_pos_HCD35+decomp.sav', 'wb'))

In [5]:
# define and fit
# LightGBM multiclass classifier configured to train on the GPU.
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu'
)

# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of `fit` was deprecated and then removed in
# LightGBM 4.0, while the callback works on old and new releases alike.
# NOTE(review): the test split doubles as the early-stopping validation set,
# so any score later reported on X_test is optimistically biased; consider
# carving a separate validation split out of the training data.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.early_stopping(stopping_rounds=5)]
)

# pickle.dump(gbm, open('../model/lgbm_pos_HCD35+decomp.sav', 'wb'))

[1]	valid_0's multi_logloss: 1.28903
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.20755
[3]	valid_0's multi_logloss: 1.14199
[4]	valid_0's multi_logloss: 1.08496
[5]	valid_0's multi_logloss: 1.03621
[6]	valid_0's multi_logloss: 0.991759
[7]	valid_0's multi_logloss: 0.950848
[8]	valid_0's multi_logloss: 0.913097
[9]	valid_0's multi_logloss: 0.880608
[10]	valid_0's multi_logloss: 0.851294
[11]	valid_0's multi_logloss: 0.823468
[12]	valid_0's multi_logloss: 0.798924
[13]	valid_0's multi_logloss: 0.776416
[14]	valid_0's multi_logloss: 0.756633
[15]	valid_0's multi_logloss: 0.735563
[16]	valid_0's multi_logloss: 0.716066
[17]	valid_0's multi_logloss: 0.698056
[18]	valid_0's multi_logloss: 0.679382
[19]	valid_0's multi_logloss: 0.664126
[20]	valid_0's multi_logloss: 0.650303
[21]	valid_0's multi_logloss: 0.634791
[22]	valid_0's multi_logloss: 0.619971
[23]	valid_0's multi_logloss: 0.608476
[24]	valid_0's multi_logloss: 0.596775
[25]	valid_0's mu

In [6]:
# Keyword arguments for the XGBoost classifier (GPU training settings).
# NOTE(review): which of these keys are honoured depends on the installed
# XGBoost version -- confirm against the version in use.
params = {
    'device': 'gpu',
    'gpu_id': 0,
    'updater': 'grow_gpu_hist',
}

In [7]:
# Build the XGBoost classifier from the `params` dict, then train it on the
# training split; the fit call stays last so the cell displays its result.
model = xgb.XGBClassifier(**params)
model.fit(X=X_train, y=y_train)

# pickle.dump(model, open('../model/xgb_pos_HCD35+decomp.sav', 'wb'))

In [8]:
# read data: the pickled HCD35 table is joined column-wise with the
# decomposition CSV (the 'neg' files); this rebinds `df_3` and `df`
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open('../data/HCD35_neg.pickle', mode='rb') as pickle_file:
    df_3 = pickle.load(pickle_file)

# The CSV's own 'Subclass' column is dropped before the join, so the combined
# frame carries at most one such column (presumably the one from df_3 -- confirm).
df = pd.concat(
    [df_3, pd.read_csv('../data/decomp_neg.csv').drop('Subclass', axis=1)],
    axis=1,
)

In [10]:
# Separate the target column from the feature columns.
# LabelEncoder maps the Subclass labels to integer codes; `le` is kept so the
# codes can be mapped back to the original labels with `le.inverse_transform`.
le = preprocessing.LabelEncoder()
objective = le.fit_transform(df.Subclass)
features = df.drop('Subclass', axis=1)

# train test split
# `np.random.seed` returns None, so `random_state = np.random.seed(42)` never
# held a usable seed and nothing was passed to `train_test_split`. Keep the
# global seeding (unseeded estimators in later cells draw from it) and pass the
# seed explicitly so the split no longer depends on cell execution order.
random_state = 42
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state,
)

In [11]:
# Random forest with scikit-learn's default hyper-parameters, trained on the
# training split (`fit` returns the fitted estimator itself).
clf = rf().fit(X_train, y_train)

# pickle.dump(clf, open('../model/rf_ng_HCD35+decomp.sav', 'wb'))

In [13]:
# define and fit
# LightGBM multiclass classifier configured to train on the GPU.
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device='gpu'
)

# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of `fit` was deprecated and then removed in
# LightGBM 4.0, while the callback works on old and new releases alike.
# NOTE(review): the test split doubles as the early-stopping validation set,
# so any score later reported on X_test is optimistically biased; consider
# carving a separate validation split out of the training data.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.early_stopping(stopping_rounds=5)]
)

# pickle.dump(gbm, open('../model/lgbm_ng_HCD35+decomp.sav', 'wb'))

[1]	valid_0's multi_logloss: 1.28919
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.20761
[3]	valid_0's multi_logloss: 1.14127
[4]	valid_0's multi_logloss: 1.08399
[5]	valid_0's multi_logloss: 1.03192
[6]	valid_0's multi_logloss: 0.985134
[7]	valid_0's multi_logloss: 0.948937
[8]	valid_0's multi_logloss: 0.919607
[9]	valid_0's multi_logloss: 0.884904
[10]	valid_0's multi_logloss: 0.857844
[11]	valid_0's multi_logloss: 0.831593
[12]	valid_0's multi_logloss: 0.811628
[13]	valid_0's multi_logloss: 0.793627
[14]	valid_0's multi_logloss: 0.77842
[15]	valid_0's multi_logloss: 0.762973
[16]	valid_0's multi_logloss: 0.746611
[17]	valid_0's multi_logloss: 0.734568
[18]	valid_0's multi_logloss: 0.72581
[19]	valid_0's multi_logloss: 0.715228
[20]	valid_0's multi_logloss: 0.706909
[21]	valid_0's multi_logloss: 0.699649
[22]	valid_0's multi_logloss: 0.692077
[23]	valid_0's multi_logloss: 0.685201
[24]	valid_0's multi_logloss: 0.678678
[25]	valid_0's mult

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        device='gpu', importance_type='split', learning_rate=0.1,
        max_depth=-1, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [1]:
# Keyword arguments for the XGBoost classifier (GPU training settings).
# NOTE(review): which of these keys are honoured depends on the installed
# XGBoost version -- confirm against the version in use.
params = {
    'device': 'gpu',
    'gpu_id': 0,
    'updater': 'grow_gpu_hist',
}

In [3]:
# Build the XGBoost classifier from the `params` dict, then train it on the
# training split; the fit call stays last so the cell displays its result.
model = xgb.XGBClassifier(**params)
model.fit(X=X_train, y=y_train)

# pickle.dump(model, open('../model/xgb_ng_HCD35+decomp.sav', 'wb'))