In [1]:
import pandas as pd
import numpy as np

# Load the white-wine (ww) and red-wine (rw) data sets; both files are semicolon-separated.
# Raw strings keep the Windows backslashes literal: without the r-prefix, sequences such
# as "\A" or "\w" are invalid escapes that newer Python versions warn about.
# TODO: replace this absolute, machine-specific location with a configurable data directory.
ww = pd.read_csv(r'C:\Asus WebStorage\psabin@gmail.com\MySyncFolder\Data Science Course\BlueBerry Winery\winequality-white.csv', sep=';')
rw = pd.read_csv(r'C:\Asus WebStorage\psabin@gmail.com\MySyncFolder\Data Science Course\BlueBerry Winery\winequality-red.csv', sep=';')

In [2]:
def _quality_label(value):
    """Map a numeric quality score to 'low' (<= 5), 'medium' (<= 7) or 'high' (anything else)."""
    if value <= 5:
        return 'low'
    if value <= 7:
        return 'medium'
    return 'high'


# Label the white-wine frame first, then the red-wine frame, with the same rule.
for wine_df in (ww, rw):
    labels = wine_df['quality'].apply(_quality_label)
    # Store the labels as a pandas categorical with a fixed category list
    # instead of plain strings.
    wine_df['quality_label'] = pd.Categorical(labels, categories=['low', 'medium', 'high'])

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Learn the set of label classes from the white-wine quality labels.
le.fit(ww['quality_label'])
# Integer code for each row's label, numbered in the encoder's class order (le.classes_).
ww['encoded_quality_label'] = le.transform(ww['quality_label'])


In [None]:
# LabelEncoder numbers the classes in sorted order (high=0, low=1, medium=2), so move
# 'high' from 0 to 3 to get an ascending scale: low=1 < medium=2 < high=3.
# The result is assigned back rather than using replace(..., inplace=True) on the
# selected column: that chained in-place form is deprecated and does not update `ww`
# when pandas Copy-on-Write is enabled.
ww['encoded_quality_label'] = ww['encoded_quality_label'].replace(0, 3)
display(ww.head(20))

In [None]:
# The text label is no longer needed once it has been encoded as an integer.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError.
ww = ww.drop(columns=['quality_label'], errors='ignore')

In [7]:
# Remove the raw numeric score so only the encoded label remains as the target.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError.
ww = ww.drop(columns=['quality'], errors='ignore')
display(ww.head())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,encoded_quality_label
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,2
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,2
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,2
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,2
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,2


## Select the features to use as inputs

In [17]:
# Columns of the white-wine frame that are fed to the models as inputs.
feature_columns = ['fixed acidity', 'volatile acidity', 'chlorides', 'alcohol']
ww_select = ww[feature_columns]
display(ww_select.head())

Unnamed: 0,fixed acidity,volatile acidity,chlorides,alcohol
0,7.0,0.27,0.045,8.8
1,6.3,0.3,0.049,9.5
2,8.1,0.28,0.05,10.1
3,7.2,0.23,0.058,9.9
4,7.2,0.23,0.058,9.9


## Create column-vector for quality_label (y)

In [20]:
# Target vector: the encoded quality label of every white wine.
y_ww_quality = ww['encoded_quality_label']
# Preview the first values, then show the length of the target.
display(y_ww_quality.head(), y_ww_quality.shape)

0    2
1    2
2    2
3    2
4    2
Name: encoded_quality_label, dtype: int32

(4898,)

## Split into train and test data

In [44]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
(
    ww_features_train,
    ww_features_test,
    ww_quality_train,
    ww_quality_test,
) = train_test_split(
    ww_select,
    y_ww_quality,
    test_size=0.2,
    random_state=19,
)

## Scale wine features to even out their impact on the model

In [45]:

# Also try different methods of normalization/standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on the training split only, then apply the same fitted scaler to both splits,
# so the test features are transformed with the training-derived parameters.
scaler.fit(ww_features_train)
ww_features_train_scaled = scaler.transform(ww_features_train)
ww_features_test_scaled = scaler.transform(ww_features_test)

## Create a function with several ML models at once

In [78]:
# Import the relevant models:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [105]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


def models(X_train, y_train, X_test, y_test):
    """Fit several classifiers and compare them on the train and test splits.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; every classifier is fitted on them.
    X_test, y_test
        Held-out features and labels used for the test metrics.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns 'Classifier', 'Test Accuracy',
        'Train Accuracy', 'Precision', 'Recall' and 'F1-Score'. All metrics are
        percentages rounded to two decimals; precision, recall and F1 are
        weighted averages computed on the test split. Rows are sorted by
        'Test Accuracy', best first.
    """
    # Classification models (classifiers) to compare, as (display name, estimator) pairs.
    classifiers = [
        ('Logistic Regression', LogisticRegression(max_iter=1000)),
        ('Gaussian Naive Bayes', GaussianNB()),
        ('Support Vector Classification', SVC()),
        ('Decision Tree', DecisionTreeClassifier(criterion='gini', random_state=0)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)),
        ('Neural Network', MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, learning_rate_init=0.001, early_stopping=True, activation='relu', solver='adam', random_state=0)),
        ('K Nearest Neighbors', KNeighborsClassifier()),
    ]
    columns = ['Classifier', 'Test Accuracy', 'Train Accuracy', 'Precision', 'Recall', 'F1-Score']

    def as_percent(score):
        # Built-in round() accepts both Python floats and NumPy scalars, whereas
        # calling `.round(2)` on the score fails when the metric returns a plain float.
        return round(score * 100, 2)

    # Collect one record per classifier and build the frame once at the end,
    # instead of concatenating onto an (initially empty) frame on every iteration.
    records = []
    for name, clf in classifiers:
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_train)
        test_predictions = clf.predict(X_test)

        # zero_division=0 scores an undefined per-class metric as 0 (the same value the
        # default setting uses) without emitting a warning for every classifier.
        records.append({
            'Classifier': name,
            'Test Accuracy': as_percent(accuracy_score(y_test, test_predictions)),
            'Train Accuracy': as_percent(accuracy_score(y_train, train_predictions)),
            'Precision': as_percent(precision_score(y_test, test_predictions, average='weighted', zero_division=0)),
            'Recall': as_percent(recall_score(y_test, test_predictions, average='weighted', zero_division=0)),
            'F1-Score': as_percent(f1_score(y_test, test_predictions, average='weighted', zero_division=0)),
        })

    metrics_df = pd.DataFrame(records, columns=columns)
    # Sort once, after all classifiers have been evaluated.
    metrics_df = metrics_df.sort_values(by='Test Accuracy', ascending=False).reset_index(drop=True)

    return metrics_df




Call the big function on the white wine features (input) and quality labels (output)

In [106]:
# Evaluate every classifier on the scaled white-wine splits and show the ranking.
results_df = models(
    ww_features_train_scaled,
    ww_quality_train,
    ww_features_test_scaled,
    ww_quality_test,
)
display(results_df)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Classifier,Test Accuracy,Train Accuracy,Precision,Recall,F1-Score
0,Random Forest,79.29,99.92,79.09,79.29,78.81
1,Decision Tree,74.8,99.92,74.89,74.8,74.83
2,Support Vector Classification,73.27,72.21,69.39,73.27,70.75
3,Neural Network,72.24,71.44,68.28,72.24,69.72
4,Logistic Regression,71.43,70.09,67.43,71.43,69.11
5,Gaussian Naive Bayes,70.92,67.71,68.46,70.92,67.17
6,K Nearest Neighbors,68.57,79.53,68.44,68.57,67.78


In [107]:
from sklearn.model_selection import cross_val_score

def models_cross_val(X_train, y_train, X_test, y_test):
    """Score several classifiers with 5-fold cross-validation on the training data.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to ``cross_val_score`` for every classifier.
    X_test, y_test
        Accepted for signature compatibility with existing callers; not used here.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns 'Classifier', 'Mean Accuracy' and
        'Standard Deviation' (mean and standard deviation of the five fold scores,
        rounded to three decimals), sorted by 'Mean Accuracy', best first.
    """
    # Classification models (classifiers) to compare, as (display name, estimator) pairs.
    classifiers = [
        ('Logistic Regression', LogisticRegression(max_iter=1000)),
        ('Gaussian Naive Bayes', GaussianNB()),
        ('Support Vector Classification', SVC()),
        ('Decision Tree', DecisionTreeClassifier(criterion='gini', random_state=0)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)),
        ('Neural Network', MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, learning_rate_init=0.001, early_stopping=True, activation='relu', solver='adam', random_state=0)),
        ('K Nearest Neighbors', KNeighborsClassifier()),
    ]
    columns = ['Classifier', 'Mean Accuracy', 'Standard Deviation']

    # Collect one record per classifier and build the frame once at the end,
    # instead of concatenating onto an (initially empty) frame on every iteration.
    records = []
    for name, clf in classifiers:
        # Perform 5-fold cross-validation
        scores = cross_val_score(clf, X_train, y_train, cv=5)

        records.append({
            'Classifier': name,
            'Mean Accuracy': scores.mean().round(3),
            'Standard Deviation': scores.std().round(3),
        })

    cross_val_metrics_df = pd.DataFrame(records, columns=columns)
    # Sort once, after all classifiers have been scored.
    cross_val_metrics_df = cross_val_metrics_df.sort_values(by='Mean Accuracy', ascending=False).reset_index(drop=True)

    return cross_val_metrics_df

In [108]:
# Cross-validate every classifier on the scaled white-wine splits and show the ranking.
models_cross_val_df = models_cross_val(
    ww_features_train_scaled,
    ww_quality_train,
    ww_features_test_scaled,
    ww_quality_test,
)
display(models_cross_val_df)

Unnamed: 0,Classifier,Mean Accuracy,Standard Deviation
0,Random Forest,0.764,0.013
1,Support Vector Classification,0.715,0.006
2,Neural Network,0.709,0.012
3,K Nearest Neighbors,0.704,0.008
4,Decision Tree,0.703,0.01
5,Logistic Regression,0.702,0.012
6,Gaussian Naive Bayes,0.676,0.011
