In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, plot_confusion_matrix,
                             roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier


In [28]:
# Load the three train/test CSV pairs (base, smote, poly) and separate the
# 'match' target column from the remaining feature columns of each frame.
# DataFrame.pop returns the named column and removes it from the frame, so
# every X_* frame ends up holding features only.

# --- base variant ---
X_train_base = pd.read_csv(r'../dataset/train.csv')
y_train_base = X_train_base.pop('match')

X_test_base = pd.read_csv(r'../dataset/test.csv')
y_test_base = X_test_base.pop('match')

# --- smote variant ---
X_train_smote = pd.read_csv(r'../dataset/train_smote.csv')
y_train_smote = X_train_smote.pop('match')

X_test_smote = pd.read_csv(r'../dataset/test_smote.csv')
y_test_smote = X_test_smote.pop('match')

# --- poly variant ---
X_train_poly = pd.read_csv(r'../dataset/train_poly.csv')
y_train_poly = X_train_poly.pop('match')

X_test_poly = pd.read_csv(r'../dataset/test_poly.csv')
y_test_poly = X_test_poly.pop('match')

# Base

In [29]:
# Train an SGD-optimised linear classifier on the base split, then print a
# classification report and a balanced-accuracy score for train and test.
sgd_base = SGDClassifier(
    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version before changing it.
    loss='log',
    class_weight='balanced',
    max_iter=1000,
    tol=1e-4,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=100,
    n_jobs=-1,
    random_state=42,
)
sgd_base.fit(X_train_base, y_train_base)

sgd_base_pred = sgd_base.predict(X_test_base)
sgd_base_pred_train = sgd_base.predict(X_train_base)

# Each entry: (report header, score-line template, true labels, predictions).
# NOTE: the number printed as "accuracy" comes from balanced_accuracy_score,
# so it will not match the "accuracy" row of the classification report.
base_reports = (
    ("\n\nClassification Report on the TRAIN SET\n\n",
     "\n\nModel's accuracy on train: {}",
     y_train_base, sgd_base_pred_train),
    ("\n\nClassification Report on the TEST SET\n\n",
     "\n\nModel's accuracy on test: {}",
     y_test_base, sgd_base_pred),
)
for report_header, score_line, y_true, y_pred in base_reports:
    print(report_header)
    print(classification_report(y_true, y_pred))
    print(score_line.format(balanced_accuracy_score(y_true, y_pred)))



Classification Report on the TRAIN SET


              precision    recall  f1-score   support

           0       1.00      0.71      0.83      4617
           1       0.41      0.98      0.58       936

    accuracy                           0.76      5553
   macro avg       0.70      0.85      0.71      5553
weighted avg       0.90      0.76      0.79      5553



Model's accuracy on train: 0.8482864330817548


Classification Report on the TEST SET


              precision    recall  f1-score   support

           0       0.99      0.70      0.82      1979
           1       0.39      0.97      0.56       402

    accuracy                           0.74      2381
   macro avg       0.69      0.83      0.69      2381
weighted avg       0.89      0.74      0.77      2381



Model's accuracy on test: 0.8322391327847876


# Poly

In [30]:
# Train an SGD-optimised linear classifier on the poly split, then print a
# classification report and a balanced-accuracy score for train and test.
# NOTE(review): the recorded metrics for this cell are identical to those of
# the base run; confirm that train_poly.csv / test_poly.csv really differ
# from train.csv / test.csv.
sgd_poly = SGDClassifier(
    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version before changing it.
    loss='log',
    class_weight='balanced',
    max_iter=1000,
    tol=1e-4,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=100,
    n_jobs=-1,
    random_state=42,
)
sgd_poly.fit(X_train_poly, y_train_poly)

sgd_poly_pred = sgd_poly.predict(X_test_poly)
sgd_poly_pred_train = sgd_poly.predict(X_train_poly)

# Each entry: (report header, score-line template, true labels, predictions).
# NOTE: the number printed as "accuracy" comes from balanced_accuracy_score,
# so it will not match the "accuracy" row of the classification report.
poly_reports = (
    ("\n\nClassification Report on the TRAIN SET\n\n",
     "\n\nModel's accuracy on train: {}",
     y_train_poly, sgd_poly_pred_train),
    ("\n\nClassification Report on the TEST SET\n\n",
     "\n\nModel's accuracy on test: {}",
     y_test_poly, sgd_poly_pred),
)
for report_header, score_line, y_true, y_pred in poly_reports:
    print(report_header)
    print(classification_report(y_true, y_pred))
    print(score_line.format(balanced_accuracy_score(y_true, y_pred)))



Classification Report on the TRAIN SET


              precision    recall  f1-score   support

           0       1.00      0.71      0.83      4617
           1       0.41      0.98      0.58       936

    accuracy                           0.76      5553
   macro avg       0.70      0.85      0.71      5553
weighted avg       0.90      0.76      0.79      5553



Model's accuracy on train: 0.8482864330817548


Classification Report on the TEST SET


              precision    recall  f1-score   support

           0       0.99      0.70      0.82      1979
           1       0.39      0.97      0.56       402

    accuracy                           0.74      2381
   macro avg       0.69      0.83      0.69      2381
weighted avg       0.89      0.74      0.77      2381



Model's accuracy on test: 0.8322391327847876


# SMOTE

In [31]:
# Train an SGD-optimised linear classifier on the smote split, then print a
# classification report and a balanced-accuracy score for train and test.
sgd_smote = SGDClassifier(
    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version before changing it.
    loss='log',
    # NOTE(review): the recorded train report shows equal support for both
    # classes, so 'balanced' weighting presumably changes nothing here; verify.
    class_weight='balanced',
    max_iter=1000,
    tol=1e-4,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=100,
    n_jobs=-1,
    random_state=42,
)
sgd_smote.fit(X_train_smote, y_train_smote)

sgd_smote_pred = sgd_smote.predict(X_test_smote)
sgd_smote_pred_train = sgd_smote.predict(X_train_smote)

# Each entry: (report header, score-line template, true labels, predictions).
# NOTE: the number printed as "accuracy" comes from balanced_accuracy_score,
# so it will not match the "accuracy" row of the classification report.
smote_reports = (
    ("\n\nClassification Report on the TRAIN SET\n\n",
     "\n\nModel's accuracy on train: {}",
     y_train_smote, sgd_smote_pred_train),
    ("\n\nClassification Report on the TEST SET\n\n",
     "\n\nModel's accuracy on test: {}",
     y_test_smote, sgd_smote_pred),
)
for report_header, score_line, y_true, y_pred in smote_reports:
    print(report_header)
    print(classification_report(y_true, y_pred))
    print(score_line.format(balanced_accuracy_score(y_true, y_pred)))



Classification Report on the TRAIN SET


              precision    recall  f1-score   support

           0       0.96      0.76      0.85      4479
           1       0.80      0.97      0.88      4479

    accuracy                           0.86      8958
   macro avg       0.88      0.86      0.86      8958
weighted avg       0.88      0.86      0.86      8958



Model's accuracy on train: 0.8649252065193123


Classification Report on the TEST SET


              precision    recall  f1-score   support

           0       0.98      0.72      0.83      1979
           1       0.40      0.91      0.55       402

    accuracy                           0.75      2381
   macro avg       0.69      0.82      0.69      2381
weighted avg       0.88      0.75      0.78      2381



Model's accuracy on test: 0.8155068517945894
