In [None]:
import pandas as pd 
import numpy as np
import sys
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Load the red-wine quality data; the file is semicolon-delimited, hence the explicit separator.
df = pd.read_csv('data/winequality-red.csv', sep = ';')

In [None]:
# Preview the first rows to confirm the columns parsed as expected.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()


In [None]:
# Summary statistics for each numeric column.
df.describe()

In [None]:
# Reduce in two steps: first per column, then across columns.
column_has_na = df.isna().any()
has_na = column_has_na.any()
has_na

In [None]:
# Separate the target from the predictors, then hold out 30% of the rows for testing.
y = df['quality']
X = df.drop(columns='quality')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=522,
)


In [None]:
# Candidate models for the baseline comparison, keyed by display name.
# NOTE(review): Ridge is a regressor, so its default score is R^2, whereas every
# other entry is a classifier scored by accuracy. The Ridge row is therefore not
# directly comparable with the others.
models1 = {
    "dummy": DummyClassifier(random_state=522),
    "KNN": KNeighborsClassifier(),
    "RBF SVM": SVC(random_state=123),
    'Ridge model': Ridge(),
    'linear SVC': SVC(kernel='linear'),
    # Seeded so the tree's cross-validation scores are reproducible across runs.
    'decision tree': DecisionTreeClassifier(random_state=522),
}

In [None]:
from sklearn.compose import make_column_transformer


def mean_cv_scores(models, X, y):
    """Cross-validate each model behind a StandardScaler and average the results.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted scikit-learn estimator.
    X, y
        Features and target handed unchanged to ``cross_validate``.

    Returns
    -------
    list of dict
        One record per model holding its name and the mean ``test_score``,
        ``train_score``, ``fit_time`` and ``score_time`` across the CV folds.
    """
    metrics = ('test_score', 'train_score', 'fit_time', 'score_time')
    records = []
    for name, model in models.items():
        # Scaling lives inside the pipeline so it is re-fit on each training fold.
        pipeline = make_pipeline(StandardScaler(), model)
        scores = cross_validate(pipeline, X, y, return_train_score=True, n_jobs=-1)
        record = {'model': name}
        record.update({metric: np.mean(scores[metric]) for metric in metrics})
        records.append(record)
    return records


results1 = mean_cv_scores(models1, X_train, y_train)

# Build the table in one expression rather than mutating it in place.
results_df1 = pd.DataFrame(results1).set_index('model')
results_df1

In [None]:
# Fit Ridge on standardized features so coefficient magnitudes are comparable across columns.
pipe_ridge = make_pipeline(StandardScaler(), Ridge()).fit(X_train, y_train)
coeffs = pipe_ridge.named_steps["ridge"].coef_

# Label each coefficient with its feature name and list the largest first.
coeff_df = pd.Series(coeffs, index=X_train.columns, name="Coefficients").to_frame()
sorted_coeff_df = coeff_df.sort_values("Coefficients", ascending=False)

sorted_coeff_df

In [None]:
# Drop the variables with small coefficients (< 0.05):
# free sulfur dioxide, residual sugar, density, citric acid.
y = df['quality']
X = df.drop(columns=['quality', 'free sulfur dioxide', 'residual sugar', 'density', 'citric acid'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=522)

# Repeat the cross-validation comparison on the reduced feature set.
results1 = []
for name, model in models1.items():
    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_validate(pipeline, X_train, y_train, return_train_score=True, n_jobs=-1)
    row = {'model': name}
    for metric in ('test_score', 'train_score', 'fit_time', 'score_time'):
        row[metric] = np.mean(scores[metric])
    results1.append(row)

results_df1 = pd.DataFrame(results1).set_index('model')
results_df1

In [None]:
# Ridge coefficients, refit on the current (reduced) training features.
pipe_ridge = make_pipeline(StandardScaler(), Ridge()).fit(X_train, y_train)
coeffs = pipe_ridge.named_steps["ridge"].coef_

# One row per feature, ordered from most positive to most negative coefficient.
coeff_df = pd.Series(coeffs, index=X_train.columns, name="Coefficients").to_frame()
sorted_coeff_df = coeff_df.sort_values("Coefficients", ascending=False)

sorted_coeff_df

# ANALYSIS:

In this study, we employed various machine learning models to predict the quality of wine based on its chemical properties. The models used included a Dummy model, K-Nearest Neighbors (KNN), Support Vector Machine (SVM) with Radial Basis Function (RBF) kernel, Ridge Regression, Linear Support Vector Classification (SVC), and a Decision Tree. These models were rigorously cross-validated with 5 folds to assess their performance. The analysis was conducted in Python, using packages such as Pandas and scikit-learn.

The initial performance of each model, as measured by mean cross-validation score (accuracy for the classifiers, R² for the Ridge model), was as follows:

- Dummy Model: 0.437059
- KNN: 0.562917
- RBF SVM: 0.615313
- Ridge Model: 0.340925
- Linear SVC: 0.566094

To improve model performance and streamline the feature set, we conducted a coefficient analysis using the Ridge model. This analysis led to the exclusion of variables whose coefficient magnitudes were below 0.05: 'free sulfur dioxide', 'residual sugar', 'density', and 'citric acid'. The updated model performances were:

- Dummy Model: 0.437059 (unchanged)
- KNN: 0.571541
- RBF SVM: 0.606740
- Ridge Model: 0.344543
- Linear SVC: 0.576244

The new coefficients for the remaining variables were:

- Alcohol: 0.322230
- Sulphates: 0.145794
- Fixed Acidity: 0.011367
- pH: -0.059857
- Chlorides: -0.067214
- Total Sulfur Dioxide: -0.089573
- Volatile Acidity: -0.179928

This refined analysis suggests a more focused model, with the reduced feature set enhancing the predictive accuracy of certain models, notably the KNN and Linear SVC. The data utilized for this study encompass various physicochemical properties of wine, such as acidity, sulfur dioxide levels, and alcohol content, which are believed to influence its quality. Based on the above results and information, we choose to optimize the RBF SVM, Ridge, and decision tree models.


# Hyperparameter Tuning

We tune three models: decision tree, linear SVC, and RBF SVM. **TODO:** add justifications for these three choices.

In [None]:
from sklearn.model_selection import GridSearchCV
import altair as alt
import matplotlib.pyplot as plt

In [None]:
#Decision Tree Tuning
# The tree is seeded so the grid-search ranking (and hence the reported best
# parameters) is reproducible from one run to the next.
pipe_dt = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=522))

# Search both split criteria over max_depth = 1, 2, 4, ..., 128.
dt_param_grid = {
    "decisiontreeclassifier__criterion": ['gini', 'entropy'],
    "decisiontreeclassifier__max_depth": 2 ** np.arange(8)
}

dt_gs = GridSearchCV(pipe_dt, param_grid=dt_param_grid, n_jobs=-1, return_train_score=True)

In [None]:
# Run the decision-tree grid search on the training split.
dt_gs.fit(X_train, y_train)

In [None]:
# Keep only the columns needed for reporting, ordered best-ranked first.
dt_report_columns = [
    "mean_test_score",
    "mean_train_score",
    "param_decisiontreeclassifier__criterion",
    "param_decisiontreeclassifier__max_depth",
    "mean_fit_time",
    "rank_test_score",
]
dt_cv_results = pd.DataFrame(dt_gs.cv_results_)
dt_df = dt_cv_results[dt_report_columns].set_index("rank_test_score").sort_index()

In [None]:
# dt_df is sorted by rank, so the first row is the best-scoring configuration.
dt_df.head(1)

In [None]:
# Validation score against max_depth, one line per split criterion.
plot = (
    alt.Chart(dt_df, title="Validation Score for Different Parameters for Decision Tree")
    .mark_line()
    .encode(
        x=alt.X('param_decisiontreeclassifier__max_depth', title='max_depth'),
        y=alt.Y('mean_test_score', title='Validation Score').scale(zero=False),
        color=alt.Color('param_decisiontreeclassifier__criterion', title='criterion'),
    )
)

# Text marker on the top-ranked configuration (first row of the rank-sorted table).
dt_best_label = (
    alt.Chart(dt_df.head(1))
    .mark_text(dy=-5)
    .encode(
        x='param_decisiontreeclassifier__max_depth',
        y="mean_test_score",
        text=alt.value('Max'),
    )
)

plot + dt_best_label

In [None]:
#Linear SVC Tuning
# Regularisation strengths spanning 1e-3 to 1e3 in powers of ten.
lsvc_param_grid = {"svc__C": [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]}

pipe_lsvc = make_pipeline(StandardScaler(), SVC(kernel='linear'))
lsvc_gs = GridSearchCV(
    pipe_lsvc,
    param_grid=lsvc_param_grid,
    n_jobs=-1,
    return_train_score=True,
)

In [None]:
# Run the linear-SVC grid search on the training split.
lsvc_gs.fit(X_train, y_train)

In [None]:
# Keep only the columns needed for reporting, ordered best-ranked first.
lsvc_report_columns = [
    "mean_test_score",
    "mean_train_score",
    "param_svc__C",
    "mean_fit_time",
    "rank_test_score",
]
lsvc_cv_results = pd.DataFrame(lsvc_gs.cv_results_)
lsvc_df = lsvc_cv_results[lsvc_report_columns].set_index("rank_test_score").sort_index()

In [None]:
# lsvc_df is sorted by rank, so the first row is the best-scoring configuration.
lsvc_df.head(1)

In [None]:
# Validation score against C; C is on a log axis because the grid is in powers of ten.
plot = (
    alt.Chart(lsvc_df, title="Validation Score for Different Parameters for Linear SVM")
    .mark_line()
    .encode(
        x=alt.X('param_svc__C', title='C (log scale)', scale=alt.Scale(type='log')),
        y=alt.Y('mean_test_score', title='Validation Score').scale(zero=False),
    )
)

# Text marker on the top-ranked configuration (first row of the rank-sorted table).
lsvc_best_label = (
    alt.Chart(lsvc_df.head(1))
    .mark_text(dy=-5)
    .encode(
        x='param_svc__C',
        y="mean_test_score",
        text=alt.value('Max'),
    )
)

plot + lsvc_best_label

In [None]:
#RBF SVM Tuning
# Joint grid over kernel width (gamma) and regularisation strength (C), both in powers of ten.
rbf_param_grid = {
    "svc__gamma": [0.001, 0.01, 0.1, 1.0, 10, 100],
    "svc__C": [0.001, 0.01, 0.1, 1.0, 10, 100],
}

pipe_rbf = make_pipeline(StandardScaler(), SVC(random_state=123))
rbf_gs = GridSearchCV(
    pipe_rbf,
    param_grid=rbf_param_grid,
    n_jobs=-1,
    return_train_score=True,
)

In [None]:
# Run the RBF-SVM grid search on the training split.
rbf_gs.fit(X_train, y_train)

In [None]:
# Keep only the columns needed for reporting, ordered best-ranked first.
rbf_report_columns = [
    "mean_test_score",
    "mean_train_score",
    "param_svc__gamma",
    "param_svc__C",
    "mean_fit_time",
    "rank_test_score",
]
rbf_cv_results = pd.DataFrame(rbf_gs.cv_results_)
rbf_df = rbf_cv_results[rbf_report_columns].set_index("rank_test_score").sort_index()

In [None]:
# rbf_df is sorted by rank, so the first row is the best-scoring configuration.
rbf_df.head(1)

In [None]:
# Validation score against C (log axis), with one line per gamma value.
plot = (
    alt.Chart(rbf_df, title="Validation Score for Different Parameters for RBF SVM")
    .mark_line()
    .encode(
        x=alt.X('param_svc__C', title='C (log scale)', scale=alt.Scale(type='log')),
        y=alt.Y('mean_test_score', title='Validation Score').scale(zero=False),
        color=alt.Color('param_svc__gamma:O', title='gamma').scale(scheme='tableau20'),
    )
)

# Text marker on the top-ranked configuration (first row of the rank-sorted table).
rbf_best_label = (
    alt.Chart(rbf_df.head(1))
    .mark_text(dy=-5)
    .encode(
        x='param_svc__C',
        y="mean_test_score",
        text=alt.value('Max'),
    )
)

plot + rbf_best_label

Based on the above findings, the best hyperparameters in terms of validation score are as follows: for the decision tree, `gini` as the criterion with a `max_depth` of 128; for the linear SVM, a `C` of 0.1; and for the RBF SVM, both `gamma` and `C` set to 1.0. We now evaluate these three best models on the test set to assess their performance.

In [None]:
# Take the winning pipelines straight from the grid searches instead of retyping
# their hyperparameters by hand. GridSearchCV refits the best configuration on
# the full training split by default (refit=True), so these are already fitted
# and always match whatever the searches actually selected.
best_dt_pipe = dt_gs.best_estimator_
best_lsvm_pipe = lsvc_gs.best_estimator_
best_rbf_pipe = rbf_gs.best_estimator_

# Show the selected hyperparameters so they can be checked against the write-up.
{
    'decision tree': dt_gs.best_params_,
    'linear SVM': lsvc_gs.best_params_,
    'RBF SVM': rbf_gs.best_params_,
}

In [None]:
# Accuracy of the tuned decision tree on the held-out test split.
best_dt_pipe.score(X_test, y_test)

In [None]:
# Accuracy of the tuned linear SVM on the held-out test split.
best_lsvm_pipe.score(X_test, y_test)

In [None]:
# Accuracy of the tuned RBF SVM on the held-out test split.
best_rbf_pipe.score(X_test, y_test)

Among all three models, RBF SVM is the best in terms of test set performance.