In [17]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import KFold
import warnings
from warnings import simplefilter

# Seed NumPy's global RNG (the shuffle below passes no random_state of its
# own) and suppress FutureWarning messages.
np.random.seed(500)
warnings.simplefilter('ignore', FutureWarning)

# 2. Load the red and white wine-quality CSV files (semicolon-delimited)
Red = pd.read_csv('/content/winequality-red.csv', sep=';')
White = pd.read_csv('/content/winequality-white.csv', sep=';')

# 3. Label every row with its wine type, then stack both frames into one
Red['type'] = 'red'
White['type'] = 'white'
wine = pd.concat([Red, White])

# Shuffle all rows (frac=1 keeps every row) and replace the duplicated
# per-file index with a fresh 0..n-1 index
wine = wine.sample(frac=1)
wine = wine.reset_index(drop=True)

# 4. Binary target derived from the raw score:
#    quality < 7 -> 0 ('Bad Quality'), otherwise -> 1 ('Good Quality')
wine['def_quality'] = np.where(wine['quality'] < 7, 0, 1)

# The target is the derived label; the features are every column except the
# raw score, the derived label and the wine-type tag
Y = wine['def_quality']
X = wine.drop(columns=['quality', 'def_quality', 'type'])

# 5. Train-test split: 25% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=7,
)

# 6. Standardise the features. The scaler is fitted on the training rows
#    only, and the same fitted scaler is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7. Model 1: Logistic Regression
# Every candidate in the search is fitted with an iteration budget of 200.
log_reg = LogisticRegression(max_iter=200)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1', 'l2']
c_values = [50, 10, 1.0, 0.1, 0.01]

# Hyperparameter tuning for Logistic Regression.
# Of the solvers listed, only 'liblinear' accepts the 'l1' penalty. A single
# cross-product grid would therefore contain combinations that cannot be
# fitted, and error_score=0 would silently record them as accuracy 0. The grid
# is split so that only valid solver/penalty pairs are evaluated.
l1_capable_solvers = ['liblinear']
l2_only_solvers = [s for s in solvers if s not in l1_capable_solvers]
grid_lr = [
    dict(solver=l2_only_solvers, penalty=['l2'], C=c_values),
    dict(solver=l1_capable_solvers, penalty=penalty, C=c_values),
]

# Shared 5-fold splitter (shuffled, fixed seed) used for cross-validation
Kcv = KFold(n_splits=5, shuffle=True, random_state=100)

grid_search_lr = GridSearchCV(
    estimator=log_reg,
    param_grid=grid_lr,
    n_jobs=-1,
    cv=Kcv,
    scoring='accuracy',
    error_score=0,
)
grid_result_lr = grid_search_lr.fit(X_train_scaled, y_train)

# Best Logistic Regression hyperparameters
print(f"Best LR: {grid_result_lr.best_score_:f} using {grid_result_lr.best_params_}")

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the full training set. Reusing that estimator keeps
# max_iter=200; rebuilding the model from best_params_ alone would drop it
# and evaluate a different configuration from the one that was tuned.
log_reg_best = grid_result_lr.best_estimator_

# Predicting Test Set for Logistic Regression
y_pred_log_reg = log_reg_best.predict(X_test_scaled)

# Evaluate Logistic Regression on the held-out test set
acc_log_reg = accuracy_score(y_test, y_pred_log_reg)
prec_log_reg = precision_score(y_test, y_pred_log_reg)
rec_log_reg = recall_score(y_test, y_pred_log_reg)
f1_log_reg = f1_score(y_test, y_pred_log_reg)

# 8. Model 2: Support Vector Machine (SVM)
# Linear-kernel SVC with otherwise default settings, trained on the
# standardised training features (no hyperparameter search for this model).
# NOTE(review): the recorded run below shows this model predicting class 0 for
# every test row (confusion matrix [[1310, 0], [315, 0]]). Consider tuning C
# and/or class_weight before relying on these numbers.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_scaled, y_train)

# Predicting Test Set for SVM
y_pred_svm = svm_model.predict(X_test_scaled)

# Evaluate SVM
# zero_division=0 states explicitly that an undefined ratio (for example
# precision when no positive is predicted) is reported as 0, instead of
# relying on the default behaviour that returns 0 and emits a warning.
acc_svm = accuracy_score(y_test, y_pred_svm)
prec_svm = precision_score(y_test, y_pred_svm, zero_division=0)
rec_svm = recall_score(y_test, y_pred_svm, zero_division=0)
f1_svm = f1_score(y_test, y_pred_svm, zero_division=0)

# 9. Model 3: Decision Tree
# random_state is fixed so that every candidate tree in the search is built
# reproducibly.
dt_model = DecisionTreeClassifier(random_state=100)
param_tuning_dt = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
}

# Hyperparameter tuning for Decision Tree (same folds and metric as the
# other tuned models)
grid_search_dt = GridSearchCV(
    estimator=dt_model,
    param_grid=param_tuning_dt,
    n_jobs=-1,
    cv=Kcv,
    scoring='accuracy',
    error_score=0,
)
grid_result_dt = grid_search_dt.fit(X_train_scaled, y_train)

# Best Decision Tree hyperparameters
print(f"Best Decision Tree: {grid_result_dt.best_score_:f} using {grid_result_dt.best_params_}")

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the full training set. Reusing that estimator keeps
# random_state=100; rebuilding the tree from best_params_ alone would drop
# the seed and evaluate a different configuration from the one tuned.
dt_best = grid_result_dt.best_estimator_

# Predicting Test Set for Decision Tree
y_pred_dt = dt_best.predict(X_test_scaled)

# Evaluate Decision Tree on the held-out test set
acc_dt = accuracy_score(y_test, y_pred_dt)
prec_dt = precision_score(y_test, y_pred_dt)
rec_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

# 10. Model 4: K-Nearest Neighbors (KNN)
knn_model = KNeighborsClassifier()
param_tuning_knn = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Exhaustive search over the KNN grid, scored by accuracy on the shared folds
grid_search_knn = GridSearchCV(
    estimator=knn_model,
    param_grid=param_tuning_knn,
    scoring='accuracy',
    cv=Kcv,
    n_jobs=-1,
    error_score=0,
)
grid_result_knn = grid_search_knn.fit(X_train_scaled, y_train)

# Report the winning configuration, then train a fresh classifier with it
print("Best KNN: %f using %s" % (grid_result_knn.best_score_, grid_result_knn.best_params_))
knn_best = KNeighborsClassifier(**grid_result_knn.best_params_).fit(X_train_scaled, y_train)

# Predictions for the held-out test rows
y_pred_knn = knn_best.predict(X_test_scaled)

# Accuracy, precision, recall and F1 on the test set, computed in that order
acc_knn, prec_knn, rec_knn, f1_knn = (
    score(y_test, y_pred_knn)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# 11. Results: one row per model, one column per test-set metric
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'SVM', 'Decision Tree', 'KNN'],
    'Accuracy': [acc_log_reg, acc_svm, acc_dt, acc_knn],
    'Precision': [prec_log_reg, prec_svm, prec_dt, prec_knn],
    'Recall': [rec_log_reg, rec_svm, rec_dt, rec_knn],
    'F1 Score': [f1_log_reg, f1_svm, f1_dt, f1_knn],
})

print(results)

# 12. Confusion matrices (true labels vs. predictions), in the same model order
for _heading, _predicted in (
    ("\nConfusion Matrix for Logistic Regression\n", y_pred_log_reg),
    ("\nConfusion Matrix for SVM\n", y_pred_svm),
    ("\nConfusion Matrix for Decision Tree\n", y_pred_dt),
    ("\nConfusion Matrix for KNN\n", y_pred_knn),
):
    print(_heading, confusion_matrix(y_test, _predicted))


Best LR: 0.817938 using {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
Best Decision Tree: 0.812604 using {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 20}
Best KNN: 0.856530 using {'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'distance'}
                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.826462   0.636364  0.244444  0.353211
1                  SVM  0.806154   0.000000  0.000000  0.000000
2        Decision Tree  0.837538   0.642458  0.365079  0.465587
3                  KNN  0.886154   0.722603  0.669841  0.695222

Confusion Matrix for Logistic Regression
 [[1266   44]
 [ 238   77]]

Confusion Matrix for SVM
 [[1310    0]
 [ 315    0]]

Confusion Matrix for Decision Tree
 [[1246   64]
 [ 200  115]]

Confusion Matrix for KNN
 [[1229   81]
 [ 104  211]]
