In [7]:
from cleaning import clean_data, SMOTE_resample
import numpy as np
from sklearn.svm import SVC, LinearSVC
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from collections import Counter
from evaluate import qwk
from sklearn.metrics import cohen_kappa_score

In [8]:
# Load Data
X_train, y_train, X_test, y_test = clean_data('')
print(Counter(y_train))
print(Counter(y_test))

# SMOTE resampling
X_train, y_train, X_test, y_test = SMOTE_resample(X_train, y_train, X_test, y_test)
print(Counter(y_train))
print(Counter(y_test))

# Feature selection
pca = PCA(svd_solver='full')
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

Counter({4: 2939, 2: 2790, 3: 2300, 1: 2183, 0: 283})
Counter({4: 1258, 2: 1247, 3: 959, 1: 907, 0: 127})
Counter({1: 3002, 4: 2953, 2: 2932, 0: 2903, 3: 2899})
Counter({3: 1298, 0: 1294, 2: 1265, 4: 1244, 1: 1195})


In [9]:
losses = ["hinge", "squared_hinge"]
learning_rates = [1e-15, 1e-8, 1e-4, 1e-2, 1e-1, 1]
penalties = ["l1", "l2"]
max_iters = [1000, 5000, 10000, 20000, 50000, 150000]

random_grid = {
    "C": learning_rates,
    "penalty": penalties,
    "max_iter": max_iters,
    "loss": losses,
    "dual": [False]
}

svc = LinearSVC()
svc_random = RandomizedSearchCV(estimator=svc, param_distributions=random_grid, n_iter=50, cv=3, verbose=0, n_jobs=-1, error_score=np.NINF)
svc_random.fit(X_train, y_train)
print(f'Best Parameters = {svc_random.best_params_}')
print(f'Best Score = {svc_random.best_score_}')

60 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to -inf.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
33 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Noah\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Noah\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "c:\Users\Noah\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\svm\_base.py", line 1204, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, d

Best Parameters = {'penalty': 'l1', 'max_iter': 5000, 'loss': 'squared_hinge', 'dual': False, 'C': 1}
Best Score = 0.3529169992254384


In [10]:
# Linear SVC
# best_params = {'penalty': 'l1', 'max_iter': 5000, 'loss': 'squared_hinge', 'dual': False, 'C': 1}
best_params = svc_random.best_params_
lsvc = LinearSVC(penalty=best_params['penalty'], max_iter=best_params['max_iter'], loss=best_params['loss'], dual=best_params['dual'], C=best_params['C'])
lsvc.fit(X_train, y_train)
y_pred = lsvc.predict(X_test)
print(f'Train Score = {lsvc.score(X_train, y_train)}')
print(f'Test Score = {lsvc.score(X_test, y_test)}')
print(f'QWK = {qwk(y_test, y_pred)}')
print(f'Cohens Kappa Score = {cohen_kappa_score(y_test, y_pred)}')

Train Score = 0.3836884743685751
Test Score = 0.35578144853875476
QWK = 0.999999983322588
Cohens Kappa Score = 0.1946032062622729


In [11]:
# SVC
learning_rates = [1e-15, 1e-8, 1e-4, 1e-2, 1e-1, 1]
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
gammas = ['scale', 'auto']

random_grid = {
    "C": learning_rates,
    "kernel": kernels,
    "gamma": gammas,
    "class_weight": [None]
}

svc = SVC()
svc_random = RandomizedSearchCV(estimator=svc, param_distributions=random_grid, n_iter=24, cv=3, verbose=0, n_jobs=-1, error_score=np.NINF)
svc_random.fit(X_train, y_train)
print(f'Best Parameters = {svc_random.best_params_}')
print(f'Best Score = {svc_random.best_score_}')


Best Parameters = {'kernel': 'rbf', 'gamma': 'scale', 'class_weight': None, 'C': 1}
Best Score = 0.46476902681068794


In [12]:
# SVC
# best_params = {'kernel': 'rbf', 'gamma': 'scale', 'class_weight': None, 'C': 1}
best_params = svc_random.best_params_
svc = SVC(kernel=best_params['kernel'], gamma=best_params['gamma'], C=best_params['C'])
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(f'Train Score = {svc.score(X_train, y_train)}')
print(f'Test Score = {svc.score(X_test, y_test)}')
print(f'QWK = {qwk(y_test, y_pred)}')
print(f'Cohens Kappa Score = {cohen_kappa_score(y_test, y_pred)}')

Train Score = 0.6015385662740826
Test Score = 0.48014612452350697
QWK = 0.9999999886313515
Cohens Kappa Score = 0.35058205811088883
