In [1]:
# Import the libraries 

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

### Reading in the data

In [2]:
# Load the data
df = pd.read_csv('https://raw.githubusercontent.com/Explore-AI/Public-Data/master/Data/classification_sprint/winequality.csv')
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


###  Data preprocessing

In [3]:
def data_preprocess(df):
    """
    Preprocesses the dataset by handling missing values, 
    converting quality labels to binary, standardizing features, 
    and splitting the data into training and testing sets.

    Parameters:
    df (DataFrame): The input dataset.

    Returns:
    tuple: ((X_train, y_train), (X_test, y_test)) - Processed training and testing sets.
    """

    # Fill NaN values with zeros
    df = df.fillna(0)

    # Convert wine quality to binary labels
    df['quality'] = df['quality'].apply(lambda x: 0 if x <= 4 else 1)
    
    # Split the data into features and labels
    X = df.drop('quality', axis=1)
    y = df['quality']

    # Standardize the features
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split the data into 75% training and 25% testing sets
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

    return (X_train, y_train), (X_test, y_test)

In [4]:
(X_train, y_train), (X_test, y_test) = data_preprocess(df)

In [5]:
print(X_train[:2])

[[-0.57136659  0.07127869 -0.48054096  1.17914161 -0.09303318 -0.79974133
   0.0830898  -0.15472329 -0.36573452  0.13010447  0.06101473  0.25842195]
 [-0.57136659  1.50396711 -0.72301571  0.56008035 -0.63948302 -0.05776881
  -0.70572997  0.62379657  0.16787589 -0.86828773 -0.47467813 -0.99931317]]


### Model training

In [6]:
def train_SVC_model(X_train, y_train):
    """
    Trains a Support Vector Classifier (SVC) model on the given training data.

    Parameters:
    X_train (array-like): The training features.
    y_train (array-like): The training labels.

    Returns:
    SVC: The trained SVC model.
    """

    # Instantiate the SVC model with a fixed random state
    model = SVC(random_state=40, gamma='auto')

    # Train the model on the training data
    model.fit(X_train, y_train)

    return model


In [7]:
svc = train_SVC_model(X_train,y_train)
svc.classes_

array([0, 1], dtype=int64)

### Model testing

In [8]:
def custom_scoring_function(y_true, y_pred):
    """
    Computes the log loss for the given true and predicted values.

    Parameters:
    y_true (array-like): The true labels.
    y_pred (array-like): The predicted probabilities.

    Returns:
    float: The log loss value rounded to 7 decimal places.
    """

    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Ensure predictions are within valid range

    # Compute log loss
    N = len(y_true)
    log_loss = - (1 / N) * np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    return np.round(log_loss, 7)


In [9]:
# Calculates log loss and accuracy

y_pred = svc.predict(X_test)
print('Log Loss value: ', custom_scoring_function(y_test, y_pred))
print('Accuracy: ',round(accuracy_score(y_test,y_pred),4))

Log Loss value:  1.2540518
Accuracy:  0.9637


### Getting model parameters

In [10]:
def get_model_hyperparams(model):
    """
    Retrieves the hyperparameter names of a given model.

    Parameters:
    model (object): A trained machine learning model.

    Returns:
    list: A list of hyperparameter names.
    """
    return list(model.get_params().keys())

In [11]:
# Get hyperparamters
get_model_hyperparams(svc)

['C',
 'break_ties',
 'cache_size',
 'class_weight',
 'coef0',
 'decision_function_shape',
 'degree',
 'gamma',
 'kernel',
 'max_iter',
 'probability',
 'random_state',
 'shrinking',
 'tol',
 'verbose']

In [12]:
# Hyperparameter search

def tune_SVC_model(X_train, y_train):
    """
    Performs hyperparameter tuning for a Support Vector Classifier (SVC) using GridSearchCV.

    Parameters:
    X_train (array-like): The training features.
    y_train (array-like): The training labels.

    Returns:
    GridSearchCV: The fitted GridSearchCV object containing the best hyperparameters.
    """
    
    # Define the parameter grid
    D = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
    
    # Create the custom scorer
    from sklearn.metrics import make_scorer
    scorer = make_scorer(custom_scoring_function, greater_is_better=False)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=SVC(), param_grid=D, scoring=scorer, cv=5)
    
    # Fit the grid search to the training data
    grid_search.fit(X_train, y_train)
    
    return grid_search



In [13]:
# Tune the SVC model using GridSearchCV to find the best hyperparameters

svc_tuned = tune_SVC_model(X_train, y_train)
y_pred = svc_tuned.predict(X_test)
print('Log Loss value: ',custom_scoring_function(y_test,y_pred))
print('Accuracy: ',round(accuracy_score(y_test,y_pred),4))

Log Loss value:  1.2115421
Accuracy:  0.9649


In [14]:
# Optimal model parameters

def get_best_params(grid_search):
    """
    Retrieves the best hyperparameters from a trained GridSearchCV object.

    Parameters:
    grid_search (GridSearchCV): The fitted GridSearchCV object.

    Returns:
    dict: A dictionary containing the optimal hyperparameters.
    """
    
    # Get the best hyperparameters from the GridSearchCV object
    best_params = grid_search.best_params_
    
    # Return the dictionary of optimal parameters
    return best_params


In [15]:
# The best parameter for the model 

get_best_params(svc_tuned)

{'C': 1, 'gamma': 1}