In [1]:
import numpy as np 
import time

import torch 
import torch.nn as nn
import torch.nn.functional as F 

import matplotlib.pyplot as plt
from collections import OrderedDict

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import RobustScaler

from utils.inference import Trainer, plot_loss
import utils.datasets as d

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
import joblib

In [3]:
import pandas as pd

# Read the feature table, then discard every row that contains a missing value.
data = pd.read_csv("data/Original_features.csv")
data = data.dropna()

In [4]:
# Target vector: the values of the `labels` column.
y = data["labels"].values

In [5]:
# Feature matrix: every column except "Unnamed: 0" and the target.
# NOTE(review): "Unnamed: 0" is presumably a row index saved into the CSV — confirm.
X = data.drop(["Unnamed: 0", "labels"], axis=1).values

In [6]:
# Sanity check: the feature matrix and the target must have the same number of rows.
print(X.shape, y.shape, sep="\n")

(83998, 29)
(83998,)


In [7]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [8]:
# Fit the scaler on the training rows only, then apply the same transform to
# both the training rows and the held-out test rows.
# NOTE(review): the scaler is fit before the train/validation split that
# follows, so validation rows contribute to the scaling statistics. Consider
# fitting after that split if validation scores must be strictly leak-free.
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Carve a validation set (20% of the current training rows) for model selection.
# NOTE: this rebinds X_train / y_train, so re-running the cell shrinks them again.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train,
    y_train,
    random_state=666,
    test_size=0.2,
)

In [10]:
# Feature selection driven by gradient-boosting feature importances.
# random_state pins the stochastic parts of boosting (subsample=0.7 draws a
# random 70% of the rows per stage), so the selected feature set -- and every
# SVM score computed from it -- is reproducible after a kernel restart.
# NOTE(review): with SelectFromModel's default threshold, max_features=15 is
# only an upper bound on the number of kept features; pass threshold=-np.inf
# if exactly the top 15 are wanted -- confirm the intent.
fs = SelectFromModel(
    estimator=GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        subsample=0.7,
        random_state=666,
        verbose=True,
    ),
    max_features=15,
)

fs.fit(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       30835.8781        2340.7267            1.81m
         2       27586.5825        1353.2530            1.78m
         3       25120.3754        1036.4389            1.79m
         4       23295.6295         820.5288            1.74m
         5       21746.6990         665.0355            1.73m
         6       20415.9660         543.8929            1.71m
         7       19306.0091         455.9853            1.69m
         8       18324.1987         380.7816            1.68m
         9       17650.9108         330.2608            1.66m
        10       16905.3156         274.7611            1.64m
        20       13641.9513          79.0727            1.47m
        30       12465.5415          28.9543            1.28m
        40       11941.6880          15.3354            1.09m
        50       11469.3963          12.5455           54.50s
        60       11233.1364           5.4791           43.72s
       

SelectFromModel(estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                     init=None,
                                                     learning_rate=0.1,
                                                     loss='deviance',
                                                     max_depth=3,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                            

# SVM with RBF kernel, C=100

In [13]:
# (Interactive `?SVC` help lookup removed: it is IPython-only debugging residue
# and produces no result the analysis depends on. See the scikit-learn SVC docs.)

In [11]:
# RBF-kernel SVM, first candidate: C=100.
clf = SVC(kernel="rbf", C=100, gamma="auto", verbose=True)

In [12]:
# Train on the columns kept by the feature selector only.
clf.fit(X=fs.transform(X_train), y=y_train)

[LibSVM]

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

In [13]:
# Validation predictions of the C=100 model (same feature selection as in training).
y_pred = clf.predict(X=fs.transform(X_validation))

In [14]:
# Macro-averaged F1 on the validation split.
f1_score(y_validation, y_pred, average="macro")

0.7824497106489123

In [15]:
# Plain accuracy on the validation split.
accuracy_score(y_validation, y_pred)

0.8640625

In [16]:
# Validation confusion matrix (rows: true class, columns: predicted class).
cf1 = confusion_matrix(y_validation, y_pred)
cf1

array([[3064,  171,   52,  115],
       [ 787, 7377,   23,  177],
       [  69,   49,  259,   40],
       [ 211,  109,   24,  913]], dtype=int64)

In [None]:
# Persistence is disabled here. Uncomment to save the fitted feature selector
# (`fs`) and the C=100 SVM (`clf`) under models_and_losses/.
#joblib.dump(fs, "models_and_losses/Feature_selector_for_original_data.joblib")
#joblib.dump(clf, "models_and_losses/SVM_rbf_C-100_gamma-auto_original_features.joblib")

# SVM with RBF kernel, C=50


In [17]:
# RBF-kernel SVM, second candidate: C=50.
clf1 = SVC(kernel="rbf", C=50, gamma="auto", verbose=True)

In [18]:
# Train the C=50 model on the selected features.
clf1.fit(X=fs.transform(X_train), y=y_train)

[LibSVM]

SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

In [19]:
# Validation predictions of the C=50 model (overwrites the previous y_pred).
y_pred = clf1.predict(X=fs.transform(X_validation))

In [20]:
# Macro-averaged F1 of the C=50 model on the validation split.
f1_score(y_validation, y_pred, average="macro")

0.7848718251632514

In [21]:
# Accuracy of the C=50 model on the validation split.
accuracy_score(y_validation, y_pred)

0.8648809523809524

In [22]:
# Validation confusion matrix of the C=50 model (rows: true, columns: predicted).
cf1 = confusion_matrix(y_validation, y_pred)
cf1

array([[3055,  183,   54,  110],
       [ 783, 7386,   24,  171],
       [  65,   52,  261,   39],
       [ 206,  106,   23,  922]], dtype=int64)

# SVM with RBF kernel, C=10


In [23]:
# RBF-kernel SVM, third candidate: C=10.
clf2 = SVC(kernel="rbf", C=10, gamma="auto", verbose=True)

In [24]:
# Train the C=10 model on the selected features.
clf2.fit(X=fs.transform(X_train), y=y_train)

[LibSVM]

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

In [25]:
# Validation predictions of the C=10 model (overwrites the previous y_pred).
y_pred = clf2.predict(X=fs.transform(X_validation))

In [26]:
# Macro-averaged F1 of the C=10 model on the validation split.
f1_score(y_validation, y_pred, average="macro")

0.7793031023070738

In [27]:
# Accuracy of the C=10 model on the validation split.
accuracy_score(y_validation, y_pred)

0.8617559523809524

In [28]:
# Validation confusion matrix of the C=10 model (rows: true, columns: predicted).
cf1 = confusion_matrix(y_validation, y_pred)
cf1

array([[3021,  226,   54,  101],
       [ 774, 7386,   27,  177],
       [  78,   47,  253,   39],
       [ 214,  100,   21,  922]], dtype=int64)

# Test-set evaluation of the model with the best validation score (C=50)

In [37]:
# Test-set predictions from clf1 (C=50), which had the highest validation macro-F1.
y_t_pred = clf1.predict(X=fs.transform(X_test))

In [38]:
# Macro-averaged F1 on the held-out test set.
f1_score(y_test, y_t_pred, average="macro")

0.7795988674358967

In [39]:
# Accuracy on the held-out test set.
accuracy_score(y_test, y_t_pred)

0.8671428571428571

In [40]:
# Test-set confusion matrix (rows: true class, columns: predicted class).
cf1 = confusion_matrix(y_test, y_t_pred)
cf1

array([[3716,  214,   78,  147],
       [ 954, 9408,   39,  196],
       [  83,   64,  302,   43],
       [ 259,  130,   25, 1142]], dtype=int64)

In [41]:
# Persist the selected C=50 model. This must be `clf1`: `clf` is the C=100
# model, and saving it here would store the wrong estimator under a C-50 filename.
# Requires the models_and_losses/ directory to exist.
joblib.dump(clf1, "models_and_losses/SVM_rbf_C-50_gamma-auto_original_features.joblib")

['models_and_losses/SVM_rbf_C-50_gamma-auto_original_features.joblib']