In [1]:
# Import libraries
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [2]:
# Preliminaries
# ---------------------------------------------
print('Loading data from file ...')  
dataset = pd.read_csv('winequality-white.csv')
print('done \n')

print('Removing rows with missing data ...')
dataset = dataset.dropna() 
print('done \n')

Loading data from file ...
done 

Removing rows with missing data ...
done 



In [3]:
# Previewing Data
print('Sample rows from the dataset (top and bottom five):')  # Spot checks
display(dataset.head(5))
display(dataset.tail(5))
print('\n')

Sample rows from the dataset (top and bottom five):


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7
4897,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6






In [4]:
# Setting the X(features) and y(target) 
#In this case, the features: 
#(fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, density, pH, sulphates,and alcohol) 
#will be used to predict FlowPattern
print('Reading list of problem variables X and Y...')
X_name = [ 'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 
          'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'] # columns to focus on as predictors
X = dataset[X_name]   # only keep these columns as features
y_name = 'quality'     # column to focus on as target
y = dataset[y_name]   # only keep this column as label 
print('done \n')

Reading list of problem variables X and Y...
done 



In [5]:
# Setting up a classification problem

# Split data into training and testing datasets

print('Partitioning data into parts: formative (for development) and summative (for testing) ...')
test_pct = 0.20   # reserve 20% of the data points for testing performance
seed = 42          # specifying the seed allows for repeatability
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=test_pct, random_state=seed)
print('done with setting aside data for testing')

Partitioning data into parts: formative (for development) and summative (for testing) ...
done with setting aside data for testing


In [6]:
# Question 1

# Chose the formative scoring method

print('Reading list of scoring methods to use during model development ...')
scoring = 'accuracy'
print('done \n')

Reading list of scoring methods to use during model development ...
done 



In [7]:
# QUESTION 1
# Design the classifier neural network
# Chose the Algorithms

seed = 42 # setting the seed allows for repeatability

print('Reading list of algorithms to train ...')
models = []
#mlp = MLPClassifier(hidden_layer_sizes=(30,30), activation = 'relu', alpha=0.005, learning_rate_init=0.1 , max_iter=20, random_state=42, verbose=10,)
models.append(( 'MLP_1', MLPClassifier(hidden_layer_sizes=(70,70), activation = 'relu', alpha=0.0001, learning_rate_init=0.1 , 
                                       max_iter=20, random_state=42, verbose=10,)) )
models.append(( 'MLP_2', MLPClassifier(hidden_layer_sizes=(50,50,50), activation = 'relu', alpha=0.002, learning_rate_init=0.3, 
                                       max_iter=30, random_state=42, verbose=10,)))
models.append(( 'MLP_3', MLPClassifier(hidden_layer_sizes=(60,60,60), activation = 'relu', alpha=0.05, learning_rate_init=0.5, 
                                       max_iter=50, random_state=42, verbose=10,)))
print(models)
print('done \n')

Reading list of algorithms to train ...
[('MLP_1', MLPClassifier(hidden_layer_sizes=(70, 70), learning_rate_init=0.1, max_iter=20,
              random_state=42, verbose=10)), ('MLP_2', MLPClassifier(alpha=0.002, hidden_layer_sizes=(50, 50, 50),
              learning_rate_init=0.3, max_iter=30, random_state=42, verbose=10)), ('MLP_3', MLPClassifier(alpha=0.05, hidden_layer_sizes=(60, 60, 60),
              learning_rate_init=0.5, max_iter=50, random_state=42, verbose=10))]
done 



In [8]:
from sklearn.exceptions import ConvergenceWarning

k4folds = 5
results = []
names = []
for name, model in models:   # Select each model in turn
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
        print(" ++ NOW WORKING ON ALGORITHM %s ++" % name)
        print("Splitting data into %s folds" % k4folds)
        kfold = model_selection.KFold(n_splits=k4folds, random_state=42, shuffle=True)
        print("Training model on each split ...")
        cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring, verbose=3)
        print("5-fold cross validation results:", cv_results)
        print()
        msg = "algorithm %s %s results: mean = %f (std = %f)" % (name, scoring, cv_results.mean(), cv_results.std())
        print(msg)
        print()
print('all model trainings have been completed \n')

 ++ NOW WORKING ON ALGORITHM MLP_1 ++
Splitting data into 5 folds
Training model on each split ...
Iteration 1, loss = 8.27411351
Iteration 2, loss = 1.37084187


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Iteration 3, loss = 1.29283614
Iteration 4, loss = 1.28403020
Iteration 5, loss = 1.27211429
Iteration 6, loss = 1.25865449
Iteration 7, loss = 1.25534199
Iteration 8, loss = 1.25251031
Iteration 9, loss = 1.24689365
Iteration 10, loss = 1.24873412
Iteration 11, loss = 1.24520102
Iteration 12, loss = 1.25624579
Iteration 13, loss = 1.25276250
Iteration 14, loss = 1.25124502
Iteration 15, loss = 1.26922609
Iteration 16, loss = 1.24724925
Iteration 17, loss = 1.23596862
Iteration 18, loss = 1.23909679
Iteration 19, loss = 1.24166524
Iteration 20, loss = 1.23388548
[CV] END ................................ score: (test=0.444) total time=   0.9s
Iteration 1, loss = 6.66073191
Iteration 2, loss = 1.32394567
Iteration 3, loss = 1.31065670
Iteration 4, loss = 1.30915616


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


Iteration 5, loss = 1.30356741
Iteration 6, loss = 1.29909064
Iteration 7, loss = 1.30201436
Iteration 8, loss = 1.30041553
Iteration 9, loss = 1.29991461
Iteration 10, loss = 1.29938755
Iteration 11, loss = 1.30491803
Iteration 12, loss = 1.29781664
Iteration 13, loss = 1.30074827
Iteration 14, loss = 1.30222259
Iteration 15, loss = 1.29966010
Iteration 16, loss = 1.30126271
Iteration 17, loss = 1.30083111
Iteration 18, loss = 1.30391769
Iteration 19, loss = 1.30042454
Iteration 20, loss = 1.30158801
[CV] END ................................ score: (test=0.420) total time=   0.5s
Iteration 1, loss = 8.11985935
Iteration 2, loss = 1.34168901
Iteration 3, loss = 1.31827942
Iteration 4, loss = 1.28625502
Iteration 5, loss = 1.27861969
Iteration 6, loss = 1.28641936


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s


Iteration 7, loss = 1.26830981
Iteration 8, loss = 1.25605636
Iteration 9, loss = 1.25320065
Iteration 10, loss = 1.25255955
Iteration 11, loss = 1.25239344
Iteration 12, loss = 1.25591477
Iteration 13, loss = 1.26084777
Iteration 14, loss = 1.25003979
Iteration 15, loss = 1.24859248
Iteration 16, loss = 1.24736876
Iteration 17, loss = 1.24519164
Iteration 18, loss = 1.25035963
Iteration 19, loss = 1.24572962
Iteration 20, loss = 1.24553440
[CV] END ................................ score: (test=0.474) total time=   0.5s
Iteration 1, loss = 9.07495662
Iteration 2, loss = 1.31238087
Iteration 3, loss = 1.29511538
Iteration 4, loss = 1.29108314
Iteration 5, loss = 1.29148326
Iteration 6, loss = 1.29213315
Iteration 7, loss = 1.28898614
Iteration 8, loss = 1.29320991
Iteration 9, loss = 1.28997585
Iteration 10, loss = 1.29101923
Iteration 11, loss = 1.28785366
Iteration 12, loss = 1.29069230
Iteration 13, loss = 1.29400047
Iteration 14, loss = 1.29048021
Iteration 15, loss = 1.29058682
Ite

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Iteration 7, loss = 1.34646344
Iteration 8, loss = 1.34848868
Iteration 9, loss = 1.34684224
Iteration 10, loss = 1.34721571
Iteration 11, loss = 1.34705469
Iteration 12, loss = 1.34654871
Iteration 13, loss = 1.34537715
Iteration 14, loss = 1.34712809
Iteration 15, loss = 1.34800935
Iteration 16, loss = 1.34528976
Iteration 17, loss = 1.35036549
Iteration 18, loss = 1.34694213
Iteration 19, loss = 1.34665759
Iteration 20, loss = 1.34474403
Iteration 21, loss = 1.34276915
Iteration 22, loss = 1.34323214
Iteration 23, loss = 1.34581123
Iteration 24, loss = 1.34441727
Iteration 25, loss = 1.34439538
Iteration 26, loss = 1.34359008
Iteration 27, loss = 1.34332339
Iteration 28, loss = 1.34253573
Iteration 29, loss = 1.34283632
Iteration 30, loss = 1.34684812
[CV] END ................................ score: (test=0.450) total time=   0.9s
Iteration 1, loss = 12.07481848
Iteration 2, loss = 1.38437010
Iteration 3, loss = 1.36104676
Iteration 4, loss = 1.34943256
Iteration 5, loss = 1.3485072

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


Iteration 7, loss = 1.34821361
Iteration 8, loss = 1.34759213
Iteration 9, loss = 1.34580819
Iteration 10, loss = 1.34659682
Iteration 11, loss = 1.34796888
Iteration 12, loss = 1.34528015
Iteration 13, loss = 1.34584480
Iteration 14, loss = 1.34800642
Iteration 15, loss = 1.35004659
Iteration 16, loss = 1.34764380
Iteration 17, loss = 1.34753187
Iteration 18, loss = 1.34532028
Iteration 19, loss = 1.34557088
Iteration 20, loss = 1.34621611
Iteration 21, loss = 1.34523260
Iteration 22, loss = 1.34368769
Iteration 23, loss = 1.34497358
Iteration 24, loss = 1.34293240
Iteration 25, loss = 1.34452790
Iteration 26, loss = 1.34425097
Iteration 27, loss = 1.34368869
Iteration 28, loss = 1.34686683
Iteration 29, loss = 1.34997642
Iteration 30, loss = 1.34631184
[CV] END ................................ score: (test=0.420) total time=   0.9s
Iteration 1, loss = 13.72857776
Iteration 2, loss = 1.37759816
Iteration 3, loss = 1.36461795
Iteration 4, loss = 1.35859649
Iteration 5, loss = 1.3516033

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


Iteration 7, loss = 1.35171353
Iteration 8, loss = 1.35026728
Iteration 9, loss = 1.35095138
Iteration 10, loss = 1.34990090
Iteration 11, loss = 1.35198333
Iteration 12, loss = 1.35052659
Iteration 13, loss = 1.35153337
Iteration 14, loss = 1.35140124
Iteration 15, loss = 1.35027098
Iteration 16, loss = 1.34978011
Iteration 17, loss = 1.35025657
Iteration 18, loss = 1.34827326
Iteration 19, loss = 1.34869665
Iteration 20, loss = 1.34995614
Iteration 21, loss = 1.34871566
Iteration 22, loss = 1.35267575
Iteration 23, loss = 1.35378607
Iteration 24, loss = 1.35389428
Iteration 25, loss = 1.35317480
Iteration 26, loss = 1.35090034
Iteration 27, loss = 1.35149944
Iteration 28, loss = 1.34603104
Iteration 29, loss = 1.34641913
Iteration 30, loss = 1.34975185
[CV] END ................................ score: (test=0.490) total time=   0.9s
Iteration 1, loss = 11.17213402
Iteration 2, loss = 1.35390916
Iteration 3, loss = 1.34121346
Iteration 4, loss = 1.34417612
Iteration 5, loss = 1.3417988

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Iteration 7, loss = 5.60385873
Iteration 8, loss = 5.51091578
Iteration 9, loss = 5.42241786
Iteration 10, loss = 5.34312568
Iteration 11, loss = 5.26754920
Iteration 12, loss = 5.19508642
Iteration 13, loss = 5.13357031
Iteration 14, loss = 5.06489202
Iteration 15, loss = 5.00377367
Iteration 16, loss = 4.94700607
Iteration 17, loss = 4.89017547
Iteration 18, loss = 4.83535333
Iteration 19, loss = 4.78496482
Iteration 20, loss = 4.73253518
Iteration 21, loss = 4.68583003
Iteration 22, loss = 4.64852908
Iteration 23, loss = 4.60565642
Iteration 24, loss = 4.55160390
Iteration 25, loss = 4.51335191
Iteration 26, loss = 4.47777917
Iteration 27, loss = 4.43588409
Iteration 28, loss = 4.40108970
Iteration 29, loss = 4.36089190
Iteration 30, loss = 4.32754512
Iteration 31, loss = 4.29519202
Iteration 32, loss = 4.27395606
Iteration 33, loss = 4.24030822
Iteration 34, loss = 4.19898098
Iteration 35, loss = 4.17202961
Iteration 36, loss = 4.13833150
Iteration 37, loss = 4.11139564
Iteration 3

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


Iteration 6, loss = 6.07769414
Iteration 7, loss = 5.96638799
Iteration 8, loss = 5.86599366
Iteration 9, loss = 5.76927458
Iteration 10, loss = 5.68387636
Iteration 11, loss = 5.60005194
Iteration 12, loss = 5.52231016
Iteration 13, loss = 5.45045806
Iteration 14, loss = 5.37741468
Iteration 15, loss = 5.31138303
Iteration 16, loss = 5.24665119
Iteration 17, loss = 5.18521950
Iteration 18, loss = 5.12716338
Iteration 19, loss = 5.07322206
Iteration 20, loss = 5.01996006
Iteration 21, loss = 4.96951750
Iteration 22, loss = 4.92654265
Iteration 23, loss = 4.87473474
Iteration 24, loss = 4.83027360
Iteration 25, loss = 4.78638178
Iteration 26, loss = 4.74227144
Iteration 27, loss = 4.69858094
Iteration 28, loss = 4.66121476
Iteration 29, loss = 4.62693859
Iteration 30, loss = 4.58982362
Iteration 31, loss = 4.55572875
Iteration 32, loss = 4.51936961
Iteration 33, loss = 4.48710622
Iteration 34, loss = 4.45529537
Iteration 35, loss = 4.42538591
Iteration 36, loss = 4.39820023
Iteration 37

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.2s remaining:    0.0s


Iteration 4, loss = 6.34395159
Iteration 5, loss = 6.11750972
Iteration 6, loss = 5.99140120
Iteration 7, loss = 5.87340959
Iteration 8, loss = 5.77127877
Iteration 9, loss = 5.68001269
Iteration 10, loss = 5.59432736
Iteration 11, loss = 5.51529331
Iteration 12, loss = 5.44106374
Iteration 13, loss = 5.36608228
Iteration 14, loss = 5.29930869
Iteration 15, loss = 5.23964696
Iteration 16, loss = 5.17402017
Iteration 17, loss = 5.11326584
Iteration 18, loss = 5.05602205
Iteration 19, loss = 5.00129657
Iteration 20, loss = 4.94556580
Iteration 21, loss = 4.89805832
Iteration 22, loss = 4.84784954
Iteration 23, loss = 4.80333042
Iteration 24, loss = 4.75314508
Iteration 25, loss = 4.70999677
Iteration 26, loss = 4.66670599
Iteration 27, loss = 4.63170176
Iteration 28, loss = 4.59022024
Iteration 29, loss = 4.55212544
Iteration 30, loss = 4.51594664
Iteration 31, loss = 4.47916811
Iteration 32, loss = 4.44820825
Iteration 33, loss = 4.41336072
Iteration 34, loss = 4.38253322
Iteration 35, 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.9s finished


In [9]:
##### QUESTION 2 - Modifying MLP Optimizers using GridSearch

# Tuning Decision Tree Classifier hyperparameters using GridSearch

selected_model = MLPClassifier()
hyperparameters = {'solver': ['sgd', 'adam'], 'alpha':[0.01, 0.05], 'max_iter': [80, 100]}
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
    print("Now tuning hyperparameters...")
    clf = GridSearchCV(selected_model, hyperparameters, cv=5, scoring=scoring, verbose=4)
    clf.fit(X_train, y_train)

    print("Best hyperparameters found on development set:")
    print(clf.best_params_)
    print("Grid scores on development set:")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))
    print('done \n')

    MLP_tuned_model = clf.best_estimator_

Now tuning hyperparameters...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END alpha=0.01, max_iter=80, solver=sgd;, score=0.503 total time=   1.2s
[CV 2/5] END alpha=0.01, max_iter=80, solver=sgd;, score=0.462 total time=   1.2s
[CV 3/5] END alpha=0.01, max_iter=80, solver=sgd;, score=0.445 total time=   1.2s
[CV 4/5] END alpha=0.01, max_iter=80, solver=sgd;, score=0.490 total time=   1.2s
[CV 5/5] END alpha=0.01, max_iter=80, solver=sgd;, score=0.451 total time=   1.2s
[CV 1/5] END alpha=0.01, max_iter=80, solver=adam;, score=0.485 total time=   1.3s
[CV 2/5] END alpha=0.01, max_iter=80, solver=adam;, score=0.467 total time=   1.3s
[CV 3/5] END alpha=0.01, max_iter=80, solver=adam;, score=0.510 total time=   1.1s
[CV 4/5] END alpha=0.01, max_iter=80, solver=adam;, score=0.497 total time=   1.3s
[CV 5/5] END alpha=0.01, max_iter=80, solver=adam;, score=0.496 total time=   1.3s
[CV 1/5] END alpha=0.01, max_iter=100, solver=sgd;, score=0.481 total time=   1.5s
[C

#### Through the use of Grid search, the variation of the optimizer parameters influenced the performance as shown above. The best optimizer parameters found on the development set include an MLP(alpha = 0.01, learning_rate_init = 0.02, max_iter (epoch) = 100, and solver = adam). 

In [10]:
# QUESTION 3 - TESTING THE PERFORMANCE OF THE BEST MODEL

print("Now testing the best tuned model on the separate test set...")
print("Detailed classification report:")
print('\n')
y_true, y_pred = y_test, MLP_tuned_model.predict(X_test)
print(classification_report(y_true, y_pred))
print('Cohen Kappa Score:', cohen_kappa_score(y_true, y_pred))
print('done \n')

#print(f'Tuned decision tree has {tuned_model.tree_.node_count} nodes with maximum depth {tuned_model.tree_.max_depth}.')

Now testing the best tuned model on the separate test set...
Detailed classification report:


              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.50      0.08      0.14        25
           5       0.49      0.70      0.58       291
           6       0.50      0.59      0.54       432
           7       0.46      0.11      0.18       192
           8       0.00      0.00      0.00        35

    accuracy                           0.49       980
   macro avg       0.32      0.25      0.24       980
weighted avg       0.47      0.49      0.45       980

Cohen Kappa Score: 0.19728829139377224
done 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##### As expected, the tuned model accuracy on the test data closely match that obtained from the trained model which shows that overfitting is very limited. 
##### Also, from inspection, the 51% accuracy can be termed fair. 

##### PS: the accuracy is slightly unstable as it varies slightly whenever it is re-run (so restarting the kernel might chane the answer between + or - 1 to 2%

In [None]:
# QUESTION 4 - training a different classifier 
# Tuning Support Vector Machine Classifier hyperparameters using GridSearch

from sklearn.svm import SVC

selected_model = SVC()
hyperparameters = {'kernel':['linear', 'rbf'], 'C':[0.01, 1] }

print("Now tuning hyperparameters...")
clf = GridSearchCV(selected_model, hyperparameters, cv=5, scoring=scoring, verbose=4)
clf.fit(X_train, y_train)
print("Best hyperparameters found on development set:")
print(clf.best_params_)
print("Grid scores on development set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print('done \n')
SVC_tuned_model = clf.best_estimator_

Now tuning hyperparameters...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END .............C=0.01, kernel=linear;, score=0.511 total time=   4.9s
[CV 2/5] END .............C=0.01, kernel=linear;, score=0.497 total time=   5.5s
[CV 3/5] END .............C=0.01, kernel=linear;, score=0.515 total time=   6.0s
[CV 4/5] END .............C=0.01, kernel=linear;, score=0.494 total time=   7.2s
[CV 5/5] END .............C=0.01, kernel=linear;, score=0.484 total time=   7.1s
[CV 1/5] END ................C=0.01, kernel=rbf;, score=0.452 total time=   1.0s
[CV 2/5] END ................C=0.01, kernel=rbf;, score=0.450 total time=   1.0s
[CV 3/5] END ................C=0.01, kernel=rbf;, score=0.450 total time=   1.0s
[CV 4/5] END ................C=0.01, kernel=rbf;, score=0.451 total time=   1.0s
[CV 5/5] END ................C=0.01, kernel=rbf;, score=0.451 total time=   1.0s


In [None]:
# QUESTION 4 - TESTING THE PERFORMANCE OF THE SVC BEST MODEL

print("Now testing the best tuned model on the separate test set...")
print("Detailed classification report:")
print('\n')
y_true, y_pred = y_test, SVC_tuned_model.predict(X_test)
print(classification_report(y_true, y_pred))
print('Cohen Kappa Score:', cohen_kappa_score(y_true, y_pred))
print('done \n')

#print(f'Tuned decision tree has {tuned_model.tree_.node_count} nodes with maximum depth {tuned_model.tree_.max_depth}.')

### From inspection, the SVC model with 51% which turns out to be the same as the MLP model with 50% accuracy. 
### It is worth to note that the SVC model training were less interrupted when compared to the MLP model which had covergence warnings due to the limited number of iterations. With further training, I believe the MLP model could beat the performance of the SVC model.  