In [3]:
import numpy as np 
import time

import torch 
import torch.nn as nn
import torch.nn.functional as F 

import matplotlib.pyplot as plt
from collections import OrderedDict

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import RobustScaler

from utils.inference import Trainer, plot_loss
import utils.datasets as d

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
import joblib

In [4]:
import pandas as pd
data = pd.read_csv("data/Original_features.csv").dropna()

In [5]:
y = data.labels.values

In [6]:
X = data.drop(columns=["Unnamed: 0","labels"]).values

In [7]:
print(X.shape)
print(y.shape)

(83998, 29)
(83998,)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [11]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=666)

In [17]:
fs = SelectFromModel(
    estimator=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, subsample=0.7, verbose=True),
    max_features=15)

fs.fit(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       30871.3818        2347.0234            1.82m
         2       27752.8648        1374.6126            1.74m
         3       25245.5208        1021.8296            1.69m
         4       23293.8169         814.0260            1.71m
         5       21708.9774         657.1621            1.69m
         6       20430.4606         551.9214            1.69m
         7       19265.5469         456.5139            1.67m
         8       18385.3029         401.9725            1.65m
         9       17672.0986         325.2529            1.64m
        10       16913.4201         264.5655            1.64m
        20       13697.8162          82.4784            1.43m
        30       12492.1085          31.6525            1.23m
        40       11897.0643          17.7614            1.05m
        50       11517.0476           9.3389           52.28s
        60       11187.5070           8.8801           41.67s
       

SelectFromModel(estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                     init=None,
                                                     learning_rate=0.1,
                                                     loss='deviance',
                                                     max_depth=3,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                            

# lr 0.1

In [18]:
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, subsample=0.7, verbose=True)

In [19]:
clf.fit(fs.transform(X_train), y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       30773.6642        2340.7567            6.54m
         2       27663.0095        1365.6490            6.54m
         3       25295.2184        1032.1304            6.58m
         4       23356.0833         821.9823            6.47m
         5       21846.9085         651.4086            6.38m
         6       20415.6482         559.9768            6.36m
         7       19319.6639         454.4417            6.39m
         8       18539.4859         384.4756            6.40m
         9       17721.4049         316.3855            6.40m
        10       17057.6028         266.2472            6.40m
        20       13832.8070          77.2886            6.33m
        30       12818.5830          18.9161            6.40m
        40       12262.3587          14.3640            6.50m
        50       11883.6820           9.1183            6.43m
        60       11728.1922           4.7716            6.38m
       

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.7, tol=0.0001,
                           validation_fraction=0.1, verbose=True,
                           warm_start=False)

In [20]:
y_pred = clf.predict(fs.transform(X_validation))

In [33]:
y_t_pred = clf.predict(fs.transform(X_test))

In [34]:
f1_score(y_true=y_test, y_pred=y_t_pred,average="macro")

0.7925579414371011

In [35]:
accuracy_score(y_true=y_test, y_pred=y_t_pred)

0.8788690476190476

In [36]:
cf1 = confusion_matrix(y_true=y_test, y_pred=y_t_pred)
cf1

array([[3578,  360,   95,  122],
       [ 713, 9683,   54,  147],
       [  83,   64,  319,   26],
       [ 226,  111,   34, 1185]], dtype=int64)

In [26]:
#joblib.dump(fs, "models_and_losses/Feature_selector_for_original_data.joblib")
#joblib.dump(clf, "models_and_losses/GB_n-1000_lr-1e-1_original_features.joblib")

['models_and_losses/GB_n-1000_lr-1e-1_original_features.joblib']

# lr 0.01

In [27]:
clf1 = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01, subsample=0.7, verbose=True)

In [28]:
clf1.fit(fs.transform(X_train), y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       35724.4091         246.3144            6.40m
         2       35083.7578         231.1587            6.32m
         3       34627.7886         219.5555            6.37m
         4       34197.7124         206.1517            6.40m
         5       33682.6196         196.9984            6.48m
         6       33354.1227         187.9261            6.47m
         7       32932.1697         183.0129            6.45m
         8       32524.8106         171.7025            6.45m
         9       32009.3630         170.1526            6.44m
        10       31612.7020         164.1577            6.49m
        20       28501.3948         121.7397            6.39m
        30       25975.3413          94.5014            6.28m
        40       23858.9451          77.8148            6.25m
        50       22335.9311          64.4500            6.18m
        60       20894.1354          52.2338            6.29m
       

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.7, tol=0.0001,
                           validation_fraction=0.1, verbose=True,
                           warm_start=False)

In [29]:
y_pred = clf1.predict(fs.transform(X_validation))

In [30]:
f1_score(y_true=y_validation, y_pred=y_pred,average="macro")

0.7969165303672667

In [31]:
accuracy_score(y_true=y_validation, y_pred=y_pred)

0.8699404761904762

In [32]:
cf1 = confusion_matrix(y_true=y_validation, y_pred=y_pred)
cf1

array([[2985,  250,   72,   95],
       [ 687, 7502,   47,  128],
       [  64,   48,  299,    6],
       [ 197,  129,   25,  906]], dtype=int64)

In [40]:
y_t_pred = clf1.predict(fs.transform(X_test))

In [41]:
f1_score(y_true=y_test, y_pred=y_t_pred,average="macro")

0.7867529663024644

In [42]:
accuracy_score(y_true=y_test, y_pred=y_t_pred)

0.8719642857142857

In [43]:
cf1 = confusion_matrix(y_true=y_test, y_pred=y_t_pred)
cf1

array([[3666,  271,   91,  127],
       [ 858, 9525,   50,  164],
       [  93,   59,  323,   17],
       [ 231,  154,   36, 1135]], dtype=int64)