In [1]:
import sys
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score, accuracy_score, matthews_corrcoef, f1_score
from sklearn.model_selection import (
    GridSearchCV,
    ShuffleSplit,
    cross_validate,
    learning_curve,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Put the directory two levels above this notebook on the import path.
# This must run before the `helpers` imports below, which are presumably
# resolved from that directory (TODO confirm the project layout).
sys.path.append("../../")

from helpers.split import tag_label_feature_split
from helpers.assess import make_classification_report, make_confusion_matrix

# Folder, relative to this notebook, that the pickled datasets are read from.
DATASET_FOLDER = "../../datasets/"

In [2]:
# Load the pickled dataset into a DataFrame.
# NOTE(review): unpickling executes arbitrary code, so only load files from a
# trusted source.
dataset = "dataset_00_all.pickle"
df = pd.read_pickle(f"{DATASET_FOLDER}{dataset}")

In [3]:
# Split the frame into its three parts. The first (tags) is discarded; the
# second unpacks into the encoded labels `y` and the label encoder `le`; the
# third is the feature matrix `X`.
_, (y, le), X = tag_label_feature_split(
    df,
    label_format="encoded",
)

In [4]:
# Hold out 20% of the samples as the final test set. No separate validation
# set is carved out here: validation is done later with k-fold
# cross-validation on the training portion.
# The split is shuffled, stratified on the labels and seeded so that it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=1962,
)

In [5]:
# Standardize the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to both splits, so no
# test-set information reaches the transform.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Baseline: A Default SVC Classifier**

In [6]:
# Baseline model: an SVC with library-default hyper-parameters, except that
# class weights are balanced and the random state is fixed.
classifier = SVC(
    random_state=1962,
    class_weight="balanced",
)

In [7]:
# Metrics computed for every cross-validation fold. The names also determine
# the `test_<name>` / `train_<name>` keys of the results dictionary.
scoring = [
    "matthews_corrcoef",
    "f1_macro",
    "balanced_accuracy",
]

In [8]:
# Cross-validate the baseline on the standardized training data.
# Train scores are kept to show the train/validation gap, and the fitted
# per-fold estimators are kept so they can be evaluated on the test set later.
scores = cross_validate(
    estimator=classifier,
    X=X_train_scaled,
    y=y_train,
    scoring=scoring,
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

In [9]:
# Show the raw cross-validation results: per-fold fit/score times, the fitted
# per-fold estimators, and the train/test value of every metric in `scoring`.
scores

{'fit_time': array([485.37560773, 486.58632159, 485.57206368, 486.34850979,
        488.43695354]),
 'score_time': array([113.92202759, 115.24112821, 111.67763042, 114.13551927,
        111.67986703]),
 'estimator': [SVC(class_weight='balanced', random_state=1962),
  SVC(class_weight='balanced', random_state=1962),
  SVC(class_weight='balanced', random_state=1962),
  SVC(class_weight='balanced', random_state=1962),
  SVC(class_weight='balanced', random_state=1962)],
 'test_matthews_corrcoef': array([0.62210034, 0.6132632 , 0.62760916, 0.60977519, 0.62515211]),
 'train_matthews_corrcoef': array([0.76379666, 0.75849296, 0.76189183, 0.75826136, 0.76077731]),
 'test_f1_macro': array([0.60684832, 0.60074932, 0.61863507, 0.59813889, 0.61117679]),
 'train_f1_macro': array([0.79237095, 0.78491733, 0.78495973, 0.78266746, 0.78897893]),
 'test_balanced_accuracy': array([0.65069484, 0.65629939, 0.66904366, 0.64138003, 0.65898045]),
 'train_balanced_accuracy': array([0.88212305, 0.87604119, 0.8784

In [10]:
# Matthews correlation coefficient on the validation folds, averaged over folds.
np.mean(scores['test_matthews_corrcoef'])

0.6195799990821136

In [11]:
# Macro-averaged F1 on the validation folds, averaged over folds.
np.mean(scores['test_f1_macro'])

0.6071096772126381

In [12]:
# Balanced accuracy on the validation folds, averaged over folds.
np.mean(scores['test_balanced_accuracy'])

0.6552796736976686

In [16]:
# Score every fold's fitted estimator on the held-out test set and collect one
# value per fold for each metric; the cells below report the means.
test_mcc = []
test_f1 = []
test_balanced_accuracy = []

# The loop variable is deliberately NOT named `classifier`: that would
# overwrite the notebook-level baseline estimator with a fold's fitted model,
# and later cells would silently pick up the wrong object.
for fold_estimator in scores['estimator']:
    predictions = fold_estimator.predict(X_test_scaled)
    test_mcc.append(matthews_corrcoef(y_test, predictions))
    test_f1.append(f1_score(y_test, predictions, average='macro'))
    test_balanced_accuracy.append(balanced_accuracy_score(y_test, predictions))

In [19]:
# Test-set Matthews correlation coefficient, averaged over the fold estimators.
np.asarray(test_mcc).mean()

0.612579456874313

In [20]:
# Test-set macro-averaged F1, averaged over the fold estimators.
np.asarray(test_f1).mean()

0.5909841546323944

In [22]:
# Test-set balanced accuracy, averaged over the fold estimators.
np.asarray(test_balanced_accuracy).mean()

0.6282944627005651

**Tune with GridSearchCV**

In [None]:
# Grid search over C and gamma for an RBF-kernel SVC, selecting the candidate
# with the best cross-validated Matthews correlation coefficient.
# `class_weight` and `random_state` are single-valued grid entries, so they
# are fixed for every candidate.
# NOTE(review): the saved notebook shows no results for this search, so it
# presumably never ran to completion (8 candidates x 5 folds of slow SVC fits).
param_grid = [
  {'C': [1, 10, 100, 1000],
   'gamma': [0.001, 0.0001],
   'kernel': ['rbf'],
   'class_weight': ['balanced'],
   'random_state': [1962]
  }]

tuned_classifier = GridSearchCV(SVC(), param_grid, scoring='matthews_corrcoef', n_jobs=-1, verbose=2)
# Fit on the standardized features, like every other model in this notebook.
# An RBF SVC is sensitive to feature scale, and results obtained on the raw
# features would not be comparable with the baseline scores.
tuned_classifier.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
# Best mean cross-validated score found by the grid search. The search was
# configured with scoring='matthews_corrcoef', so this is an MCC value.
tuned_classifier.best_score_

In [None]:
# Parameter combination from `param_grid` that achieved the best score.
tuned_classifier.best_params_

**With PCA**

In [None]:
# Dimensionality reduction: keep as many principal components as are needed
# to explain 95% of the variance. The projection is learned from the
# standardized training features and then applied unchanged to the test
# features.
pca = PCA(
    n_components=0.95,
    random_state=1962,
)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [6]:
# Same configuration as the baseline SVC, to be trained on the PCA-reduced
# features.
pca_classifier = SVC(
    random_state=1962,
    class_weight="balanced",
)

In [8]:
# Cross-validate the SVC on the PCA-reduced training data, using the same
# metrics and options as the baseline run so the two are directly comparable.
pca_scores = cross_validate(
    estimator=pca_classifier,
    X=X_train_pca,
    y=y_train,
    scoring=scoring,
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

In [9]:
# Show the raw cross-validation results for the PCA-reduced features.
# NOTE(review): the output saved with this cell is identical, digit for digit,
# to the baseline `scores` output, so it looks like a stale copy. Re-run the
# PCA cells to refresh it before drawing conclusions.
pca_scores

{'fit_time': array([485.37560773, 486.58632159, 485.57206368, 486.34850979,
        488.43695354]),
 'score_time': array([113.92202759, 115.24112821, 111.67763042, 114.13551927,
        111.67986703]),
 'estimator': [SVC(class_weight='balanced', random_state=1962),
  SVC(class_weight='balanced', random_state=1962),
  SVC(class_weight='balanced', random_state=1962),
  SVC(class_weight='balanced', random_state=1962),
  SVC(class_weight='balanced', random_state=1962)],
 'test_matthews_corrcoef': array([0.62210034, 0.6132632 , 0.62760916, 0.60977519, 0.62515211]),
 'train_matthews_corrcoef': array([0.76379666, 0.75849296, 0.76189183, 0.75826136, 0.76077731]),
 'test_f1_macro': array([0.60684832, 0.60074932, 0.61863507, 0.59813889, 0.61117679]),
 'train_f1_macro': array([0.79237095, 0.78491733, 0.78495973, 0.78266746, 0.78897893]),
 'test_balanced_accuracy': array([0.65069484, 0.65629939, 0.66904366, 0.64138003, 0.65898045]),
 'train_balanced_accuracy': array([0.88212305, 0.87604119, 0.8784

In [10]:
# PCA run: validation Matthews correlation coefficient, averaged over folds.
np.mean(pca_scores['test_matthews_corrcoef'])

0.6195799990821136

In [11]:
# PCA run: validation macro-averaged F1, averaged over folds.
np.mean(pca_scores['test_f1_macro'])

0.6071096772126381

In [12]:
# PCA run: validation balanced accuracy, averaged over folds.
np.mean(pca_scores['test_balanced_accuracy'])

0.6552796736976686

In [16]:
# Score every PCA-run fold estimator on the PCA-projected test set and collect
# one value per fold for each metric; the cells below report the means.
pca_test_mcc = []
pca_test_f1 = []
pca_test_balanced_accuracy = []

# The loop variable is deliberately NOT named `classifier`, so the
# notebook-level baseline estimator is not overwritten by a fold's model.
# `predictions` keeps its name because later cells read it; after the loop it
# holds the LAST fold estimator's test-set predictions.
for fold_estimator in pca_scores['estimator']:
    predictions = fold_estimator.predict(X_test_pca)
    pca_test_mcc.append(matthews_corrcoef(y_test, predictions))
    pca_test_f1.append(f1_score(y_test, predictions, average='macro'))
    pca_test_balanced_accuracy.append(balanced_accuracy_score(y_test, predictions))

In [19]:
# PCA run: test-set Matthews correlation coefficient, averaged over the fold
# estimators.
np.asarray(pca_test_mcc).mean()

0.612579456874313

In [20]:
# PCA run: test-set macro-averaged F1, averaged over the fold estimators.
np.asarray(pca_test_f1).mean()

0.5909841546323944

In [22]:
# PCA run: test-set balanced accuracy, averaged over the fold estimators.
np.asarray(pca_test_balanced_accuracy).mean()

0.6282944627005651

In [None]:
# The report and confusion-matrix cells below pass `classifier_PCA` as their
# model, so the name must be bound to a fitted estimator. Use the last
# cross-validation fold's model from the PCA run: it is the estimator that
# produced the `predictions` left behind by the test-evaluation loop.
# A tuned configuration that was considered but never fitted here:
#   SVC(C=10, kernel='rbf', gamma=.0001, random_state=1962)
classifier_PCA = pca_scores['estimator'][-1]

In [None]:
# Per-class classification report for the PCA model on the test set.
# Both the precomputed predictions and the model/features are handed to the
# helper; which of them it actually uses is decided inside
# `helpers.assess` (TODO confirm).
# The trailing semicolon keeps the helper's return value out of the cell
# output.
make_classification_report(
    y_true=y_test,
    y_pred=predictions,
    model=classifier_PCA,
    x=X_test_pca,
    label_encoder=le,
    print_report=True,
);

In [None]:
# Confusion matrix for the PCA model on the test set, drawn as an 8x8 inch
# figure with normalize="true" and the label encoder supplied for the helper
# to use.
# The trailing semicolon keeps the helper's return value out of the cell
# output.
make_confusion_matrix(
    y_true=y_test,
    y_pred=predictions,
    model=classifier_PCA,
    x=X_test_pca,
    normalize="true",
    autoweight_sample=True,
    label_encoder=le,
    figsize=(8, 8),
);

In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=None):
    """Draw learning-curve diagnostics for `estimator` on three axes.

    Panels: (1) training and cross-validation score versus training-set size,
    (2) fit time versus training-set size, (3) cross-validation score versus
    fit time. Shaded bands are +/- one standard deviation across CV splits.

    Parameters
    ----------
    estimator : scikit-learn estimator (cloned and refitted by `learning_curve`).
    title : str, title of the first panel.
    X, y : training features and labels.
    axes : sequence of three matplotlib axes; created when None.
    ylim : optional (ymin, ymax) for the first panel.
    cv : cross-validation splitter or fold count, forwarded to `learning_curve`.
    n_jobs : number of parallel jobs, forwarded to `learning_curve`.
    train_sizes : training-set fractions; defaults to five points in [0.1, 1.0].

    Returns
    -------
    The three axes that were drawn on.
    """
    if axes is None:
        _, axes = plt.subplots(3, 1, figsize=(6, 15))
    if train_sizes is None:
        train_sizes = np.linspace(0.1, 1.0, 5)

    sizes, train_scores, valid_scores, fit_times, _ = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        return_times=True,
    )
    train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
    valid_mean, valid_std = valid_scores.mean(axis=1), valid_scores.std(axis=1)
    fit_mean, fit_std = fit_times.mean(axis=1), fit_times.std(axis=1)

    # Panel 1: score versus number of training examples.
    score_ax, time_ax, tradeoff_ax = axes[0], axes[1], axes[2]
    score_ax.set(title=title, xlabel="Training examples", ylabel="Score")
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.fill_between(sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color="r")
    score_ax.fill_between(sizes, valid_mean - valid_std, valid_mean + valid_std, alpha=0.1, color="g")
    score_ax.plot(sizes, train_mean, "o-", color="r", label="Training score")
    score_ax.plot(sizes, valid_mean, "o-", color="g", label="Cross-validation score")
    score_ax.legend(loc="best")

    # Panel 2: fit time versus number of training examples.
    time_ax.set(title="Scalability of the model", xlabel="Training examples", ylabel="Fit time (s)")
    time_ax.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std, alpha=0.1)
    time_ax.plot(sizes, fit_mean, "o-")

    # Panel 3: score versus fit time (sorted so the line runs left to right).
    order = fit_mean.argsort()
    tradeoff_ax.set(title="Performance of the model", xlabel="Fit time (s)", ylabel="Score")
    tradeoff_ax.fill_between(
        fit_mean[order],
        valid_mean[order] - valid_std[order],
        valid_mean[order] + valid_std[order],
        alpha=0.1,
    )
    tradeoff_ax.plot(fit_mean[order], valid_mean[order], "o-")

    return axes


# SVC is expensive to fit, so use a small number of shuffled CV splits.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
estimator = classifier

# Build the title from the estimator's actual hyper-parameters so that the
# figure cannot claim settings that differ from the model being plotted.
svc_params = estimator.get_params()
title = (
    rf"Learning Curves (SVM, {str(svc_params['kernel']).upper()} kernel, "
    rf"C={svc_params['C']}, $\gamma$={svc_params['gamma']})"
)

# One column of three panels: only a single estimator is plotted.
fig, axes = plt.subplots(3, 1, figsize=(6, 15))
# ylim is left unset so that scores below 0.7 are not clipped out of view.
plot_learning_curve(
    estimator, title, X_train_scaled, y_train, axes=axes, cv=cv, n_jobs=4
)

plt.show()