# Wine Class Prediction
Predicts the wine class using linear and kernelized Support Vector Machines.

In [91]:
# Imports required modules and methods

import pandas as pd

from sklearn.datasets import load_wine

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC, SVC

from scipy.stats import loguniform, uniform

## Retrieving & Analyzing the Data

In [92]:
# Loads the wine dataset as a Bunch; as_frame=True exposes the data as pandas objects
# (return_X_y is left at its default of False, so the Bunch itself is returned)
wine = load_wine(as_frame=True)

# Lists the keys available on the returned Bunch object
print(wine.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


In [93]:
# Displays the feature DataFrame (the rendered output is truncated to the
# first and last rows)
display(wine.data)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [94]:
# Counts the number of samples per wine class to check how balanced the classes are
wine.target.value_counts()

target
1    71
0    59
2    48
Name: count, dtype: int64

In [95]:
# Prints column names, non-null counts and dtypes to check for missing values
# and non-numerical features
wine.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

All 13 features are numerical, and none of them has any missing values.

In [96]:
# Summary statistics (count, mean, std, min, quartiles, max), transposed so that
# each feature is shown as one row
wine.data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,178.0,13.000618,0.811827,11.03,12.3625,13.05,13.6775,14.83
malic_acid,178.0,2.336348,1.117146,0.74,1.6025,1.865,3.0825,5.8
ash,178.0,2.366517,0.274344,1.36,2.21,2.36,2.5575,3.23
alcalinity_of_ash,178.0,19.494944,3.339564,10.6,17.2,19.5,21.5,30.0
magnesium,178.0,99.741573,14.282484,70.0,88.0,98.0,107.0,162.0
total_phenols,178.0,2.295112,0.625851,0.98,1.7425,2.355,2.8,3.88
flavanoids,178.0,2.02927,0.998859,0.34,1.205,2.135,2.875,5.08
nonflavanoid_phenols,178.0,0.361854,0.124453,0.13,0.27,0.34,0.4375,0.66
proanthocyanins,178.0,1.590899,0.572359,0.41,1.25,1.555,1.95,3.58
color_intensity,178.0,5.05809,2.318286,1.28,3.22,4.69,6.2,13.0


## Data Preparation

### Splitting Data
Splits the data into train and test sets so that model performance can be evaluated on data not used for training.

In [97]:
# Holds out 20% of the samples as the test set. Stratifying on the wine class
# keeps the class proportions similar in both subsets, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.2,
    stratify=wine.target,
    random_state=42,
)

print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Train set shape: (142, 13)
Test set shape: (36, 13)


In [98]:
# Verifies that the stratified split kept the class proportions nearly equal:
# for each subset, the per-class count is divided by the subset size
class_shares = {
    subset_name: labels.value_counts() / labels.size
    for subset_name, labels in (("Train set", y_train), ("Test set", y_test))
}
pd.DataFrame(class_shares)

Unnamed: 0_level_0,Train set,Test set
target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.401408,0.388889
0,0.330986,0.333333
2,0.267606,0.277778


### Transforming Data
Standardizes the data because the features are on different scales.

In [99]:
# Initializes the standard scaler
std_scaler = StandardScaler()

# Learns the scaling statistics from the train set only and standardizes the
# train set in the same step
X_train_transformed = std_scaler.fit_transform(X_train)

In [100]:
# The scaler fitted on the train set is reused to transform the test set
# (it is not refitted on the test data)
X_test_transformed = std_scaler.transform(X_test)

## Modeling

### Linear SVC

In [101]:
# Initializes the linear support vector classifier with a fixed random seed
lin_svc = LinearSVC(random_state=42)

# Fits the model on the standardized train set
lin_svc.fit(X_train_transformed, y_train)

In [102]:
# Estimates the linear SVC's accuracy by cross-validation on the standardized
# train set (the number of folds is left at the cross_val_score default).
# NOTE(review): the scaler was fitted on the whole train set before this step,
# so each validation fold has already contributed to the scaling statistics;
# wrapping the scaler and the model in a Pipeline would avoid that.
lin_svc_cv = cross_val_score(
    estimator=LinearSVC(random_state=42),
    X=X_train_transformed,
    y=y_train,
)
lin_svc_cv_mean = lin_svc_cv.mean()

print("Linear SVC CV Score: {:.3f}".format(lin_svc_cv_mean))

Linear SVC CV Score: 0.979


In [103]:
# Predicts on the standardized test set with the fitted linear SVC and reports
# the resulting accuracy
lin_svc_test_predictions = lin_svc.predict(X_test_transformed)
lin_svc_test_accuracy = accuracy_score(y_test, lin_svc_test_predictions)

print("Linear SVC Test Score: {:.3f}".format(lin_svc_test_accuracy))

Linear SVC Test Score: 0.972


### Kernelized SVM

In [104]:
# Initializes the kernelized SVC with default hyperparameters and a fixed random seed
svm_clf = SVC(random_state=42)

# Fits the model on the standardized train set
svm_clf.fit(X_train_transformed, y_train)

In [105]:
# Predicts on the same standardized train set the model was fitted on and
# reports the resulting (training) accuracy
svm_clf_train_predictions = svm_clf.predict(X_train_transformed)
svm_clf_train_accuracy = accuracy_score(y_train, svm_clf_train_predictions)

print("SVM Classifier Train Score: {:.3f}".format(svm_clf_train_accuracy))

SVM Classifier Train Score: 0.993


In [106]:
# Estimates the default kernelized SVC's accuracy by cross-validation on the
# standardized train set (the number of folds is left at the default).
# NOTE(review): the scaler was fitted on the whole train set before this step,
# so each validation fold has already contributed to the scaling statistics;
# wrapping the scaler and the model in a Pipeline would avoid that.
svm_clf_cv = cross_val_score(
    estimator=SVC(random_state=42),
    X=X_train_transformed,
    y=y_train,
)
svm_clf_cv_mean = svm_clf_cv.mean()

print("SVM Classifier CV Score: {:.3f}".format(svm_clf_cv_mean))

SVM Classifier CV Score: 0.986


In [107]:
# Shows the hyperparameters of the trained SVM classifier (all defaults apart
# from random_state)
svm_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

If the model performs well below expectations, the important hyperparameters **C** and **gamma** can be fine-tuned using an appropriate search method, such as the randomized search below.

In [108]:
# Sampling distributions for the two hyperparameters being tuned
param_distributions = dict(
    gamma=loguniform(0.001, 0.1),  # log-uniform between 0.001 and 0.1
    # scipy's uniform takes (loc, scale), so this samples C from [1, 11], not [1, 10]
    C=uniform(1, 10),
)

# Randomized search: 100 sampled (gamma, C) candidates, each scored with 5-fold
# cross-validation on the standardized train set; the seed fixes the sampling.
# NOTE(review): the scaler was fitted on the whole train set beforehand, so the
# validation folds have already contributed to the scaling statistics.
base_svc = SVC(random_state=42)
rnd_search_cv = RandomizedSearchCV(
    estimator=base_svc,
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    random_state=42,
)
rnd_search_cv.fit(X_train_transformed, y_train)

In [109]:
# Shows the mean cross-validated score of the best hyperparameter combination found
rnd_search_cv.best_score_

0.993103448275862

The CV performance of the tuned SVM classifier (0.993) was observed to be a little better than that of the default SVM classifier (0.986).

In [110]:
# Shows all parameters of the best estimator, including the best values found
# for the tuned parameters ('C' and 'gamma')
rnd_search_cv.best_estimator_.get_params()

{'C': 4.042422429595377,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.01120760621186057,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [111]:
# Evaluates the best estimator found by the search on the standardized test set
tuned_svc = rnd_search_cv.best_estimator_
tuned_svc_test_predictions = tuned_svc.predict(X_test_transformed)
accuracy_score(y_test, tuned_svc_test_predictions)

0.9722222222222222

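Overall, the kernelized support vector classifier achieved a slightly higher cross-validation score than the linear support vector classifier in this experiment (0.986 with default settings and 0.993 after tuning, versus 0.979), while the tuned kernelized classifier and the linear classifier reached the same accuracy on the test set (0.972).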