In [5]:
import numpy as np
import pandas as pd 
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# import the winequality-bin.csv dataset into a dataframe called wine
wine = pd.read_csv('winequality-bin.csv')

In [7]:
wine.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,0
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,1
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,0


In [8]:
# assign the features to the X and the labels to y
X = wine.drop('quality', axis = 1)
y = wine['quality']
print("X values: ",X)
print("\n\n\nY values: ",y)

X values:        fixed acidity  volatile acidity  citric acid  ...    pH  sulphates  alcohol
0               7.4             0.700         0.00  ...  3.51       0.56      9.4
1               7.8             0.880         0.00  ...  3.20       0.68      9.8
2               7.8             0.760         0.04  ...  3.26       0.65      9.8
3              11.2             0.280         0.56  ...  3.16       0.58      9.8
4               7.4             0.700         0.00  ...  3.51       0.56      9.4
...             ...               ...          ...  ...   ...        ...      ...
1594            6.2             0.600         0.08  ...  3.45       0.58     10.5
1595            5.9             0.550         0.10  ...  3.52       0.76     11.2
1596            6.3             0.510         0.13  ...  3.42       0.75     11.0
1597            5.9             0.645         0.12  ...  3.57       0.71     10.2
1598            6.0             0.310         0.47  ...  3.39       0.66     11.0

[159

In [9]:
# split X and y into X_train, X_test, y_train and y_test using a test_size of 30% and a random_state of 42
X_train, X_test, y_train, y_test = train_test_split (X,y, test_size=0.3, random_state=42)

In [None]:
# Exercise 1 
# training SVM models

In [10]:
# Task 1.1
# what is the size of y_train?
# what is the size of y_test?
print("Size of y_train is ", y_train.size)
print("Size of y_test is ", y_test.size)

Size of y_train is  1119
Size of y_test is  480


In [13]:
# Task 1.2
# create a SVM model called svm_linear_clf using the pipeline pattern shown in O'Reilly textbook page 156 
# add a max_iter of 20000 to ensure the training converges
# What value of C did you choose?
svm_linear_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svc",SVC(C = 1, max_iter = 20000)),
])
print("C = 1 is chosen as it generalizes well")

C = 1 is chosen as it generalizes well


In [14]:
# Task 1.3
# train the svm_linear_clf model using the fit function on X_train and y_train
svm_linear_clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=20000,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [15]:
# Task 1.4
# using the predict function, calculate the svm_linear_clf model predictions based on X_test
# hint: use the model.predict(X_test) pattern
y_pred = svm_linear_clf.predict(X_test)

In [16]:
# Task 1.5
# Calculate the accuracy of svm_linear_clf by comparing y_pred to y_test (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
# what is the accuracy? 
svm_linear_accuracy = accuracy_score(y_test, y_pred)
print(svm_linear_accuracy)

0.88125


In [17]:
# Task 1.6 
# create a SVM model called svm_poly_clf using the pipeline pattern shown in O'Reilly textbook page 158
# add a max_iter of 20000 to ensure the training converges
svm_poly_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svc",SVC(kernel = "poly", C = 5, degree=3, coef0=1, max_iter = 20000)),
])

In [18]:
# Task 1.7 
# train the svm_poly_clf model using the fit function on X_train and y_train
svm_poly_clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=5, break_ties=False, cache_size=200, class_weight=None,
                     coef0=1, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='poly', max_iter=20000,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [19]:
# Task 1.8
# using the predict function, calculate the svm_poly_clf model predictions based on X_test
# hint: use the model.predict(X_test) pattern
y_pred = svm_poly_clf.predict(X_test)

In [20]:
# Task 1.9
# Calculate the accuracy of svm_poly_clf by comparing y_pred to y_test (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
# what is the accuracy? 
svm_poly_accuracy = accuracy_score(y_test, y_pred)
print(svm_poly_accuracy)

0.8770833333333333


In [21]:
# Task 1.10
# create a SVM model called svm_rbf_clf using the pipeline pattern shown in O'Reilly textbook page 160
# add a max_iter of 20000 to ensure the training converges
svm_rbf_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svc",SVC(kernel = "rbf", C = 0.001,gamma=5, max_iter = 20000)),
])

In [22]:
# Task 1.11
# train the svm_rbf_clf model using the fit function  on X_train and y_train
svm_rbf_clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=0.001, break_ties=False, cache_size=200,
                     class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=5,
                     kernel='rbf', max_iter=20000, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [23]:
# Task 1.12
# using the predict function, calculate the svm_rbf_clf model predictions based on X_test
# hint: use the model.predict(X_test) pattern
y_pred = svm_rbf_clf.predict(X_test)

In [24]:
# Task 1.13
# Calculate the accuracy of svm_rbf_clf by comparing y_pred to y_test (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
svm_rbf_accuracy = accuracy_score(y_test, y_pred)
print(svm_rbf_accuracy)

0.8604166666666667


In [25]:
# Exercise 2
# let's us grid search to find the best SVM model for this dataset

In [26]:
# task 2.1 
# import the required sklearn library for performing a grid search
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [27]:
# Task 2.2
# Set the grid search parameters as follows to help find the best parameters for our SVC model
param = {
    'C': [0.5,1,1.3,1.5],
    'kernel':['linear', 'rbf'],
    'gamma' :[0.5,1,1.3,1.5],
}

# or use:

param2 = {
    'C': [0.5,1,1.3,1.5],
    'kernel':['poly', 'rbf'],
    'gamma' :[0.5,1,1.3,1.5],
    'degree':[2,3,4]
}

In [40]:
# Task 2.3
# Setup and run the svm classifier model

svc = SVC()
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [41]:
# Task 2.4
# use fit on X_train and y_train to train the grid_svc model
#  It is recommended to run it on Google Colab or Kaggle with GPU to make the training go faster
grid_svc.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.5, 1, 1.3, 1.5], 'gamma': [0.5, 1, 1.3, 1.5],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [42]:
# task 2.5
# use grid_svc.best_params_ to find the best model
# what is the accuracy for the best model?
print(grid_svc.best_params_)

{'C': 1.3, 'gamma': 1, 'kernel': 'rbf'}


In [31]:
# Exercise 3
# Let's run a random forest classifier on the same wine dataset
# Task 3.1
# import the required scikit-learn library for running random forest classification
from sklearn.ensemble import RandomForestClassifier

In [32]:
# Task 3.2
# define a model called rfc as a random forest classifier with 200 estimators
rfc = RandomForestClassifier(n_estimators=200)

In [34]:
#Applying Standard scaling to get optimized result
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [35]:
# Task 3.3
# train the rfc random forest classifier on X_train and y_train
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
# Task 3.4
# calculate the predictions for the X_test values. Call the ouput y_pred
y_pred = rfc.predict(X_test)

In [38]:
# Task 3.5
# what is the accuracy? 
rfc_accuracy = accuracy_score(y_test, y_pred)
print(rfc_accuracy)

0.88125


In [39]:
# Task 3.6
#Let's see how our model performed by running print(classification_report(y_test, y_pred))
# What is the accuracy of the model?
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93       413
           1       0.61      0.42      0.50        67

    accuracy                           0.88       480
   macro avg       0.76      0.69      0.71       480
weighted avg       0.87      0.88      0.87       480

