# Introduction

# Setup
Let us import the required modules.

In [1]:
import pandas as pd
import seaborn as sns
import os
import joblib

import src.data_split as ds
import src.evaluation as eva

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

## Load Data
Note that the dataset is already split into Train-Test sets.

In [None]:
# Build the train/test container through the project helper in src.data_split.
# NOTE(review): presumably reads the already-split CSV files found under ../data — confirm in src/data_split.py.
data_split = ds.TrainTestSplit.from_csv_directory(dir_path="../data")

In [None]:
# Quick structural overview of the training features.
# NOTE(review): assumes x_train is a pandas DataFrame (it exposes .info()) — confirm.
data_split.x_train.info()

In [None]:
# Display the dimensions of the training targets (last expression of the cell is shown as output).
data_split.y_train.shape

# Support Vector Machine Classifier Model
## SVM using the RBF kernel


In [None]:
# Baseline model: SVC with scikit-learn defaults.
# kernel="rbf" is the library default; it is spelled out to match the section title.
clf = SVC(kernel="rbf")
clf.fit(X=data_split.x_train, y=data_split.y_train)

In [None]:
# Predict with the baseline SVC on the train split first, then on the test split.
# NOTE(review): the `rf_` prefix looks like a leftover from another notebook; these
# are predictions of the SVC `clf`. The names are kept because they are notebook-level variables.
rf_train_pred, rf_test_pred = (
    clf.predict(features)
    for features in (data_split.x_train, data_split.x_test)
)

# Wrap ground truth and predictions in the project's evaluation helper.
rf_train_eval = eva.Evaluation(y_pred=rf_train_pred, y_real=data_split.y_train)
rf_test_eval = eva.Evaluation(y_pred=rf_test_pred, y_real=data_split.y_test)

In [None]:
# Report the baseline evaluation: training split first, then the test split.
for heading, evaluation in (("Training:", rf_train_eval), ("Testing:", rf_test_eval)):
    print(heading)
    evaluation.print_eval()

In [None]:
import time

# Automatic hyper-parameter tuning for the RBF-kernel SVC.
# The grid may only contain parameters that SVC accepts. Tree/forest options such as
# n_estimators, min_samples_leaf or max_leaf_nodes make GridSearchCV.fit fail with
# "Invalid parameter ... for estimator SVC".
svcclsf_rbf = SVC(kernel="rbf", random_state=28)
properties = {
    "C": [0.1, 1, 10, 100],          # regularisation parameter
    "gamma": [0.001, 0.01, 0.1, 1],  # RBF kernel coefficient
}

# perf_counter is monotonic, so the elapsed time is not affected by system clock adjustments.
start_time = time.perf_counter()
tuned_svcclsf_rbf = GridSearchCV(
    svcclsf_rbf,
    properties,
    scoring="accuracy",
    cv=10,
    return_train_score=True,  # needed to compare mean train/test scores afterwards
    verbose=6,
    n_jobs=4,
)
tuned_svcclsf_rbf.fit(data_split.x_train, data_split.y_train)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

print("Best Score: {:.3f}".format(tuned_svcclsf_rbf.best_score_))
print("Best Params: ", tuned_svcclsf_rbf.best_params_)

In [None]:
# Save the fitted grid-search object so the (slow) tuning does not have to be re-run.
# makedirs(exist_ok=True) replaces the check-then-create pair, which could fail if the
# directory appeared between the existence check and the mkdir call.
os.makedirs('../tuned_models', exist_ok=True)
joblib.dump(tuned_svcclsf_rbf, '../tuned_models/svc_rbf_tuning_results.pkl')

In [None]:
# Reload previously saved tuning results instead of re-running the grid search.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load
# files that were produced by this notebook or another trusted source.
tuned_svcclsf_rbf = joblib.load("../tuned_models/svc_rbf_tuning_results.pkl")

In [None]:
# Dump the raw cross-validation results, then tabulate them for easier inspection.
cv_results = tuned_svcclsf_rbf.cv_results_
print(cv_results)
tuned_svcclsf_rbf_results = pd.DataFrame(cv_results)

In [None]:
# Show every tuned hyper-parameter next to its mean cross-validated test/train score.
# The "param_*" columns are discovered from the results rather than hard-coded:
# "param_n_estimators" cannot exist for an SVC search, because SVC has no such parameter.
param_columns = [
    column
    for column in tuned_svcclsf_rbf_results.columns
    if column.startswith("param_")
]
tuned_svcclsf_rbf_results[param_columns + ["mean_test_score", "mean_train_score"]]

In [None]:
sns.set(rc={"figure.figsize": (12, 8)})

# Plot the mean CV test score against the tuned hyper-parameters. The parameter columns
# are taken from the results instead of hard-coding "param_n_estimators", which is not
# an SVC parameter. The first parameter goes on the x axis; a second one (if any) is
# shown as separate hue levels.
plot_params = sorted(
    column
    for column in tuned_svcclsf_rbf_results.columns
    if column.startswith("param_")
)
x_param = plot_params[0]
hue_param = plot_params[1] if len(plot_params) > 1 else None
sns.lineplot(
    data=tuned_svcclsf_rbf_results,
    x=x_param,
    y="mean_test_score",
    hue=hue_param,
)

In [None]:
# Best estimator found by the search, followed by its mean cross-validated score.
print(
    tuned_svcclsf_rbf.best_estimator_,
    tuned_svcclsf_rbf.best_score_,
    sep="\n",
)

In [None]:
# Accuracy of the refit best estimator on both splits.
# The training accuracy used to be stored in a variable called `test_acc`; it now lives in
# `train_acc`, and `test_acc` holds the accuracy on the held-out test split.
train_acc = accuracy_score(
    y_true=data_split.y_train,
    y_pred=tuned_svcclsf_rbf.predict(data_split.x_train),
)
print("Train Accuracy: {}".format(train_acc))

test_acc = accuracy_score(
    y_true=data_split.y_test,
    y_pred=tuned_svcclsf_rbf.predict(data_split.x_test),
)
print("Test Accuracy: {}".format(test_acc))

In [None]:
# Rebuild the classifier from the best parameters found by the automatic tuning.
# best_params_ only contains the searched keys, so random_state is re-applied to keep
# the final model configured like the estimator that was tuned. (The former commented-out
# alternative passed max_leaf_nodes / min_samples_leaf / n_estimators, which SVC rejects.)
svc_rbf_clsf = SVC(**{"random_state": 28, **tuned_svcclsf_rbf.best_params_})

svc_rbf_clsf.fit(data_split.x_train, data_split.y_train)

In [None]:
# Predictions of the tuned SVC on both splits.
svc_rbf_train_pred = svc_rbf_clsf.predict(data_split.x_train)
svc_rbf_test_pred = svc_rbf_clsf.predict(data_split.x_test)

# Evaluate the tuned model on its OWN predictions. Previously the baseline predictions
# (rf_train_pred / rf_test_pred) were passed here, so the reported metrics were those
# of the untuned classifier.
svc_rbf_train_eval = eva.Evaluation(y_real=data_split.y_train, y_pred=svc_rbf_train_pred)
svc_rbf_test_eval = eva.Evaluation(y_real=data_split.y_test, y_pred=svc_rbf_test_pred)

In [None]:
# Report the evaluation objects built for the tuned SVC: training split, then test split.
tuned_reports = (
    ("Training:", svc_rbf_train_eval),
    ("Testing:", svc_rbf_test_eval),
)
for title, report in tuned_reports:
    print(title)
    report.print_eval()

In [None]:
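# NOTE(review): stale leftover — `forest_clsf` is not defined in the visible part of this
# notebook and the `rf_` names do not belong to this SVM section; update or remove.
# rf_model = eva.EvaluatedModel(forest_clsf, train_eval=svc_rbf_train_eval, test_eval=rf_test_eval)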

Accuracy is not a great measure of classifier performance when the classes are imbalanced,
but as we can see from the plots, the classes are more or less balanced.

# Performance Visualization

### Setup