In [1]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import itertools
import time

%matplotlib inline

In [2]:
# load data
df = pd.concat([
    pd.read_csv('../data/feature_selection_positive.csv', index_col=0),
    pd.read_csv('../data/decomp_pos.csv', index_col=0).drop('Subclass', axis=1)
], axis=1)

In [3]:
# divide objective and target
objective = df.Subclass
le = preprocessing.LabelEncoder()
objective = le.fit_transform(objective)

features = df.drop('Subclass', axis=1)

In [4]:
# train test split
random_state=np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    features, 
    objective,
    test_size=0.2
)

In [5]:
# define and fit
clf = rf()
clf = clf.fit(X_train, y_train)

# pickle.dump(clf, open('../model/rf_pos_fs+decomp.sav', 'wb'))

In [6]:
# setting params
params = {}
params['device'] = 'gpu'
params['gpu_id'] = 0
params['updater'] = 'grow_gpu_hist'

In [7]:
# define and fit
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

# pickle.dump(model, open('../model/xgb_pos_fs+decomp.sav', 'wb'))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, device='gpu', gamma=0, gpu_id=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, updater='grow_gpu_hist')

In [8]:
# define and fit 
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device = 'gpu'
)

gbm.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)
# pickle.dump(gbm, open('../model/lgbm_pos_fs+decomp.sav', 'wb'))

[1]	valid_0's multi_logloss: 1.29069
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.21012
[3]	valid_0's multi_logloss: 1.14197
[4]	valid_0's multi_logloss: 1.08361
[5]	valid_0's multi_logloss: 1.03128
[6]	valid_0's multi_logloss: 0.984376
[7]	valid_0's multi_logloss: 0.940129
[8]	valid_0's multi_logloss: 0.899799
[9]	valid_0's multi_logloss: 0.864108
[10]	valid_0's multi_logloss: 0.832423
[11]	valid_0's multi_logloss: 0.804506
[12]	valid_0's multi_logloss: 0.779539
[13]	valid_0's multi_logloss: 0.755452
[14]	valid_0's multi_logloss: 0.733423
[15]	valid_0's multi_logloss: 0.715317
[16]	valid_0's multi_logloss: 0.697163
[17]	valid_0's multi_logloss: 0.680644
[18]	valid_0's multi_logloss: 0.666459
[19]	valid_0's multi_logloss: 0.649679
[20]	valid_0's multi_logloss: 0.637496
[21]	valid_0's multi_logloss: 0.622585
[22]	valid_0's multi_logloss: 0.61208
[23]	valid_0's multi_logloss: 0.602414
[24]	valid_0's multi_logloss: 0.592352
[25]	valid_0's mul

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        device='gpu', importance_type='split', learning_rate=0.1,
        max_depth=-1, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [9]:
# load data
df = pd.concat([
    pd.read_csv('../data/feature_selection_negative.csv', index_col=0),
    pd.read_csv('../data/decomp_neg.csv', index_col=0).drop('Subclass', axis=1)
], axis=1)

In [10]:
# divide objective and target
objective = df.Subclass
le = preprocessing.LabelEncoder()
objective = le.fit_transform(objective)

features = df.drop('Subclass', axis=1)

In [11]:
# train test split
random_state=np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    features, 
    objective,
    test_size=0.2
)

In [12]:
# define and fit
clf = rf()
clf = clf.fit(X_train, y_train)

# pickle.dump(clf, open('../model/rf_ng_fs+decomp.sav', 'wb'))

In [13]:
# setting params
params = {}
params['device'] = 'gpu'
params['gpu_id'] = 0
params['updater'] = 'grow_gpu_hist'

In [14]:
# define and fit
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

# pickle.dump(model, open('../model/xgb_ng_fs+decomp.sav', 'wb'))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, device='gpu', gamma=0, gpu_id=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, updater='grow_gpu_hist')

In [15]:
# define and fit 
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    device = 'gpu'
)

gbm.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=5
)

# pickle.dump(gbm, open('../model/lgbm_ng_fs+decomp.sav', 'wb'))

[1]	valid_0's multi_logloss: 1.28878
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 1.20847
[3]	valid_0's multi_logloss: 1.13982
[4]	valid_0's multi_logloss: 1.08051
[5]	valid_0's multi_logloss: 1.03088
[6]	valid_0's multi_logloss: 0.98421
[7]	valid_0's multi_logloss: 0.943343
[8]	valid_0's multi_logloss: 0.912084
[9]	valid_0's multi_logloss: 0.886113
[10]	valid_0's multi_logloss: 0.852649
[11]	valid_0's multi_logloss: 0.831805
[12]	valid_0's multi_logloss: 0.804853
[13]	valid_0's multi_logloss: 0.784163
[14]	valid_0's multi_logloss: 0.768031
[15]	valid_0's multi_logloss: 0.752548
[16]	valid_0's multi_logloss: 0.741966
[17]	valid_0's multi_logloss: 0.724925
[18]	valid_0's multi_logloss: 0.718684
[19]	valid_0's multi_logloss: 0.708427
[20]	valid_0's multi_logloss: 0.695185
[21]	valid_0's multi_logloss: 0.689559
[22]	valid_0's multi_logloss: 0.683826
[23]	valid_0's multi_logloss: 0.678237
[24]	valid_0's multi_logloss: 0.671902
[25]	valid_0's mul

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        device='gpu', importance_type='split', learning_rate=0.1,
        max_depth=-1, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)