In [96]:
from path import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [97]:
# Load the wine table and preview the first rows.
# The file name and the later `color == "red"` filter suggest that, despite the
# `red_` prefix, this frame initially holds wines of every colour.
# NOTE(review): in the preview the `ph` column shows values around 9.4-9.8 while
# `sulphates` shows 3.2-3.5, so the last three measurement labels may be rotated
# relative to the data -- verify the CSV header before interpreting those columns.
red_data = Path('all_wines.csv')
red_df = pd.read_csv(red_data)
red_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph,quality,id,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,2,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,3,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,4,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,5,red


In [98]:
# Count missing values per column; a column of zeros means nothing needs imputing.
print(red_df.isna().sum())

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
sulphates               0
alcohol                 0
ph                      0
quality                 0
id                      0
color                   0
dtype: int64


In [104]:
# Pairwise correlations between the numeric columns.
# The string `color` column is excluded explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently, so a bare `red_df.corr()` raises on it. Selecting
# the numeric dtypes first gives the same table on old and new pandas versions.
corr = red_df.select_dtypes(include="number").corr()
corr

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph,quality,id
fixed_acidity,1.0,0.219008,0.324436,-0.111981,0.298195,-0.282735,-0.329054,0.45891,-0.2527,0.299568,-0.095452,-0.076743,-0.50565
volatile_acidity,0.219008,1.0,-0.377981,-0.196011,0.377124,-0.352557,-0.414476,0.271296,0.261454,0.225984,-0.03764,-0.265699,-0.647175
citric_acid,0.324436,-0.377981,1.0,0.142451,0.038998,0.133126,0.195242,0.096154,-0.329808,0.056197,-0.010493,0.085532,0.169111
residual_sugar,-0.111981,-0.196011,0.142451,1.0,-0.12894,0.402871,0.495482,0.552517,-0.26732,-0.185927,-0.359415,-0.03698,0.346423
chlorides,0.298195,0.377124,0.038998,-0.12894,1.0,-0.195045,-0.27963,0.362615,0.044708,0.395593,-0.256916,-0.200666,-0.513311
free_sulfur_dioxide,-0.282735,-0.352557,0.133126,0.402871,-0.195045,1.0,0.720934,0.025717,-0.145854,-0.188457,-0.179838,0.055463,0.466819
total_sulfur_dioxide,-0.329054,-0.414476,0.195242,0.495482,-0.27963,0.720934,1.0,0.032395,-0.238413,-0.275727,-0.26574,-0.041385,0.679535
density,0.45891,0.271296,0.096154,0.552517,0.362615,0.025717,0.032395,1.0,0.011686,0.259478,-0.686745,-0.305858,-0.411002
sulphates,-0.2527,0.261454,-0.329808,-0.26732,0.044708,-0.145854,-0.238413,0.011686,1.0,0.192123,0.121248,0.019506,-0.336958
alcohol,0.299568,0.225984,0.056197,-0.185927,0.395593,-0.188457,-0.275727,0.259478,0.192123,1.0,-0.003029,0.038485,-0.483778


#### Filter to red wines only

In [4]:
# Keep only the red wines. `.copy()` makes the result an independent frame, so
# adding columns to it in later cells does not trigger SettingWithCopyWarning
# and does not depend on pandas' view-vs-copy behaviour.
red_df = red_df[red_df["color"] == "red"].copy()
red_df.shape

(1599, 14)

#### Categorize quality: >= 7 as Good and < 7 as Not good

In [5]:
# Binary label: 1 when the quality score is at least 7, otherwise 0.
red_df['quality_2'] = red_df['quality'].ge(7).astype(int)

In [6]:
red_df.head(10)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph,quality,id,color,quality_2
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,red,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,2,red,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,3,red,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,4,red,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,5,red,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,6,red,0
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5,7,red,0
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7,8,red,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7,9,red,1
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,10,red,0


#### Create Target and Features

In [7]:
# Features: every measurement column, i.e. everything except the raw score,
# the derived binary label, the row id and the colour tag.
non_feature_columns = ["quality", "quality_2", "id", "color"]
X = red_df.drop(columns=non_feature_columns)

# Target: the binary good / not-good label.
y = red_df["quality_2"]

In [8]:
X.head(10)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5


In [9]:
y.value_counts()

0    1382
1     217
Name: quality_2, dtype: int64

#### Scale the data

In [10]:
# Scaler used to standardise the feature columns.
data_scaler = StandardScaler()

In [11]:
# Standardise all feature columns in one pass.
# NOTE(review): the scaler is fitted on every row of X here, i.e. before any
# train/test split is made, so rows that later land in the test split contribute
# to the statistics used to compute X_scaled.
X_scaled = data_scaler.fit_transform(X)

#### Split the data into test and train

In [12]:
# Split the *unscaled* features first so the scaler never sees test rows.
# The split depends only on the number of rows and random_state, so this selects
# exactly the same train/test rows as splitting the pre-scaled array did.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=7)

# Fit the scaler on the training rows only, then apply that same transform to
# both splits. This avoids leaking test-set means/variances into training.
X_train = data_scaler.fit_transform(X_train_raw)
X_test = data_scaler.transform(X_test_raw)

In [13]:
y_train.value_counts()

0    1033
1     166
Name: quality_2, dtype: int64

In [14]:
X_scaled.shape

(1599, 11)

## Logistic model

In [15]:
# Baseline classifier: logistic regression (L-BFGS solver), to be trained on the
# training split as-is, without any class re-balancing.
model1 = LogisticRegression(solver='lbfgs',random_state=7)

In [16]:
model1.fit(X_train, y_train)

LogisticRegression(random_state=7)

#### Model predictions

In [17]:
y_pred = model1.predict(X_test)

In [18]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[329  20]
 [ 29  22]]


In [19]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       349
           1       0.52      0.43      0.47        51

    accuracy                           0.88       400
   macro avg       0.72      0.69      0.70       400
weighted avg       0.87      0.88      0.87       400



## SVM

In [20]:
# Support-vector classifier with an RBF kernel.
# Author's note: a linear kernel was tried and did not work (no details recorded).
model2 = SVC(kernel='rbf',random_state=7)

In [21]:
model2.fit(X_train, y_train)

SVC(random_state=7)

In [22]:
y_pred = model2.predict(X_test)

In [23]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[336  13]
 [ 34  17]]


In [24]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.96      0.93       349
           1       0.57      0.33      0.42        51

    accuracy                           0.88       400
   macro avg       0.74      0.65      0.68       400
weighted avg       0.86      0.88      0.87       400



In [25]:
accuracy_score(y_test, y_pred)

0.8825

## Decision tree

In [26]:
# Single decision tree with default hyper-parameters, seeded for reproducibility.
model3 = tree.DecisionTreeClassifier(random_state=7)

In [27]:
model3 = model3.fit(X_train, y_train)

In [28]:
y_pred = model3.predict(X_test)

In [29]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[305  44]
 [ 24  27]]


In [30]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.87      0.90       349
           1       0.38      0.53      0.44        51

    accuracy                           0.83       400
   macro avg       0.65      0.70      0.67       400
weighted avg       0.86      0.83      0.84       400



In [31]:
accuracy_score(y_test, y_pred)

0.83

In [32]:
# Rank the features by the importance the fitted decision tree assigns them,
# most important first.
# NOTE(review): the data preview shows values of 9.4-10.5 under the `ph` label,
# so that label may not actually hold pH -- confirm before interpreting the ranking.
importance = model3.feature_importances_
columns = X.columns
import_df = (
    pd.DataFrame({'feature': columns, 'importance': importance})
    .sort_values(by="importance", ascending=False)
)
import_df

Unnamed: 0,feature,importance
10,ph,0.24958
8,sulphates,0.13608
9,alcohol,0.114116
0,fixed_acidity,0.087731
1,volatile_acidity,0.081758
5,free_sulfur_dioxide,0.081022
2,citric_acid,0.078951
3,residual_sugar,0.045753
7,density,0.045384
6,total_sulfur_dioxide,0.040393


## Random Forest Classifier

In [33]:
# Random forest of 128 trees, seeded for reproducibility.
model4 = RandomForestClassifier(
    n_estimators=128,
    random_state=7,
)

In [34]:
model4 = model4.fit(X_train, y_train)

In [35]:
y_pred = model4.predict(X_test)

In [36]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[333  16]
 [ 25  26]]


In [37]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       349
           1       0.62      0.51      0.56        51

    accuracy                           0.90       400
   macro avg       0.77      0.73      0.75       400
weighted avg       0.89      0.90      0.89       400



In [38]:
accuracy_score(y_test, y_pred)

0.8975

In [39]:
# Rank the features by the importance the fitted random forest assigns them,
# most important first.
importance = model4.feature_importances_
columns = X.columns
import_df = (
    pd.DataFrame({'feature': columns, 'importance': importance})
    .sort_values(by="importance", ascending=False)
)
import_df

Unnamed: 0,feature,importance
10,ph,0.169798
1,volatile_acidity,0.120064
9,alcohol,0.112396
7,density,0.092389
6,total_sulfur_dioxide,0.083003
2,citric_acid,0.079479
0,fixed_acidity,0.075408
4,chlorides,0.07215
3,residual_sugar,0.071893
8,sulphates,0.062759


## Gradient Boosted Tree

In [40]:
# Sweep the boosting learning rate and, for each value, report accuracy on the
# training split and on the held-out split.
# NOTE(review): the "validation" score is computed on X_test / y_test, the same
# split used for the final reported metrics. The sweep also runs with
# max_features=5 and random_state=0, whereas the model5 cell that follows uses
# max_features=10 and random_state=7 -- confirm which configuration is intended.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    # Build and fit one classifier per candidate learning rate.
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=0,
    ).fit(X_train, y_train)
    print("Learning rate: ", learning_rate)

    # Mean accuracy on each split.
    train_accuracy = classifier.score(X_train, y_train)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    validation_accuracy = classifier.score(X_test, y_test)
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))
    print()

Learning rate:  0.05
Accuracy score (training): 0.887
Accuracy score (validation): 0.880

Learning rate:  0.1
Accuracy score (training): 0.914
Accuracy score (validation): 0.882

Learning rate:  0.25
Accuracy score (training): 0.944
Accuracy score (validation): 0.882

Learning rate:  0.5
Accuracy score (training): 0.959
Accuracy score (validation): 0.892

Learning rate:  0.75
Accuracy score (training): 0.973
Accuracy score (validation): 0.865

Learning rate:  1
Accuracy score (training): 0.973
Accuracy score (validation): 0.858



In [41]:
# Final gradient-boosting model, using learning_rate=0.5.
# NOTE(review): the learning-rate sweep in the preceding cell ran with
# max_features=5 and random_state=0, while this model uses max_features=10 and
# random_state=7, so the sweep scores do not describe this exact configuration.
model5 = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.5, max_features=10, max_depth=3, random_state=7)

In [42]:
model5 = model5.fit(X_train, y_train)

In [43]:
y_pred = model5.predict(X_test)

In [44]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[332  17]
 [ 29  22]]


In [45]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.95      0.94       349
           1       0.56      0.43      0.49        51

    accuracy                           0.89       400
   macro avg       0.74      0.69      0.71       400
weighted avg       0.87      0.89      0.88       400



In [46]:
accuracy_score(y_test, y_pred)

0.885

In [47]:
# Rank the features by the importance the fitted gradient-boosting model assigns
# them, most important first.
importance = model5.feature_importances_
columns = X.columns
import_df = (
    pd.DataFrame({'feature': columns, 'importance': importance})
    .sort_values(by="importance", ascending=False)
)
import_df

Unnamed: 0,feature,importance
10,ph,0.306515
6,total_sulfur_dioxide,0.131908
9,alcohol,0.12608
1,volatile_acidity,0.122331
5,free_sulfur_dioxide,0.097373
3,residual_sugar,0.054741
4,chlorides,0.04501
2,citric_acid,0.040504
7,density,0.038264
8,sulphates,0.018796


## Random Over Sampling

In [48]:
# Random over-sampler used to balance the classes of the training split;
# seeded for reproducibility.
ros = RandomOverSampler(random_state=7)

In [49]:
Counter(y_train)

Counter({0: 1033, 1: 166})

In [50]:
# Resample the training split only; the test split keeps its original class ratio.
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [51]:
Counter(y_resampled)

Counter({0: 1033, 1: 1033})

In [52]:
# Logistic regression that will be trained on the randomly over-sampled training data.
model6 = LogisticRegression(solver='lbfgs', random_state=7)

In [53]:
model6.fit(X_resampled, y_resampled)

LogisticRegression(random_state=7)

In [54]:
y_pred = model6.predict(X_test)

In [55]:
print(balanced_accuracy_score(y_test, y_pred))

0.8306365526153154


In [56]:
confusion_matrix(y_test, y_pred)

array([[265,  84],
       [  5,  46]], dtype=int64)

In [57]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.76      0.90      0.86      0.83      0.68       349
          1       0.35      0.90      0.76      0.51      0.83      0.69        51

avg / total       0.90      0.78      0.88      0.81      0.83      0.68       400



## SMOTE oversampling

In [58]:
# Balance the training split with SMOTE; the test split is left untouched.
smote = SMOTE(random_state=7, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [59]:
Counter(y_resampled)

Counter({0: 1033, 1: 1033})

In [60]:
# Logistic regression trained on the SMOTE-resampled training data.
# `fit` returns the estimator itself, so the trailing expression displays the
# same fitted model the cell showed before.
model7 = LogisticRegression(solver='lbfgs', random_state=7).fit(X_resampled, y_resampled)
model7

LogisticRegression(random_state=7)

In [61]:
# Predict on the original (un-resampled) test split and report balanced accuracy.
y_pred = model7.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8447384684532839

In [62]:
confusion_matrix(y_test, y_pred)

array([[268,  81],
       [  4,  47]], dtype=int64)

In [63]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.77      0.92      0.86      0.84      0.70       349
          1       0.37      0.92      0.77      0.53      0.84      0.72        51

avg / total       0.91      0.79      0.90      0.82      0.84      0.70       400



## Cluster Centroid Undersampling

In [64]:
# Cluster-centroid under-sampler used to balance the training split;
# seeded for reproducibility.
cc = ClusterCentroids(random_state=7)

In [65]:
# Under-sample the training split only; the test split keeps its original class ratio.
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [66]:
# Logistic regression trained on the cluster-centroid under-sampled data, with
# the iteration cap raised to 1000.
# `fit` returns the estimator itself, so the trailing expression displays the
# same fitted model the cell showed before.
model8 = LogisticRegression(
    solver='lbfgs',
    random_state=7,
    max_iter=1000,
).fit(X_resampled, y_resampled)
model8

LogisticRegression(max_iter=1000, random_state=7)

In [67]:
y_pred = model8.predict(X_test)

In [68]:
balanced_accuracy_score(y_test, y_pred)

0.8165346367773471

In [69]:
confusion_matrix(y_test, y_pred)

array([[262,  87],
       [  6,  45]], dtype=int64)

In [70]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.75      0.88      0.85      0.81      0.65       349
          1       0.34      0.88      0.75      0.49      0.81      0.67        51

avg / total       0.90      0.77      0.87      0.80      0.81      0.66       400



##  Balanced Random Forest Classifier

In [71]:
# Balanced random forest with 100 trees, seeded for reproducibility.
brf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=7,
)

In [72]:
# Fit on the original training split (no separate resampling step is applied here).
# The fitted estimator returned by `fit` is kept under the name `model9`.
model9 = brf_model.fit(X_train, y_train)

In [73]:
y_pred = model9.predict(X_test)

In [74]:
balanced_accuracy_score(y_test, y_pred)

0.8433058036968368

In [75]:
confusion_matrix(y_test, y_pred)

array([[267,  82],
       [  4,  47]], dtype=int64)

In [76]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.77      0.92      0.86      0.84      0.69       349
          1       0.36      0.92      0.77      0.52      0.84      0.72        51

avg / total       0.91      0.79      0.90      0.82      0.84      0.70       400



In [77]:
# Rank the features by the importance the fitted balanced random forest assigns
# them, most important first.
importance = model9.feature_importances_
columns = X.columns
import_df = (
    pd.DataFrame({'feature': columns, 'importance': importance})
    .sort_values(by="importance", ascending=False)
)
import_df

Unnamed: 0,feature,importance
10,ph,0.233499
1,volatile_acidity,0.128257
9,alcohol,0.116748
2,citric_acid,0.083079
7,density,0.076444
6,total_sulfur_dioxide,0.074452
0,fixed_acidity,0.070034
4,chlorides,0.067913
8,sulphates,0.053655
3,residual_sugar,0.049013


## Easy Ensemble AdaBoost Classifier

In [78]:
# Easy Ensemble classifier with 100 estimators, seeded for reproducibility.
eec_model = EasyEnsembleClassifier(n_estimators=100, random_state=7)

In [79]:
# Fit on the original training split (no separate resampling step is applied here).
# The fitted estimator returned by `fit` is kept under the name `model10`.
model10 = eec_model.fit(X_train, y_train)

In [80]:
y_pred = model10.predict(X_test)

In [81]:
balanced_accuracy_score(y_test, y_pred)

0.8177425698072925

In [82]:
confusion_matrix(y_test, y_pred)

array([[256,  93],
       [  5,  46]], dtype=int64)

In [83]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.73      0.90      0.84      0.81      0.65       349
          1       0.33      0.90      0.73      0.48      0.81      0.67        51

avg / total       0.90      0.76      0.88      0.79      0.81      0.65       400

