In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the red-wine quality data, then discard columns that are entirely
# null followed by any row that still contains a null value.
df = (
    pd.read_csv("Data/winequality-red-reformatted.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Categorize good and bad quality wines: anything over 5 is considered
# "good" wine (label 1) and the rest is "bad" wine (label 0).
#
# This is done with a single vectorized replace. The previous version looped
# over every row and re-ran a full-column replace on each iteration, which is
# quadratic in the number of rows and reassigns the column while iterating it.
# Scores outside 3-8 are left untouched, exactly as the original replace
# lists did, and re-running the cell is a no-op because 0 and 1 are not keys.
quality_to_label = {3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1}
df['quality'] = df['quality'].replace(quality_to_label)

In [4]:
# Tally how many wines carry each quality label.
quality_counts = df.loc[:, "quality"].value_counts()
quality_counts

1    855
0    744
Name: quality, dtype: int64

In [5]:
# Display names for the labels, ordered by label value (0 -> "bad", 1 -> "good").
target_names = ["bad", "good"]
# The binarized quality column is what the models predict.
target = df.loc[:, "quality"]

In [6]:
# Every column except the label is used as model input.
features = df.drop(columns="quality")
feature_names = features.columns

## Logistic Regression

In [7]:
from sklearn.model_selection import train_test_split

# Hold out a test set; the fixed random_state keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    features,
    target,
    random_state=42,
)

In [8]:
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training split only and then applied to both
# splits, so the test data never influences the learned min/max.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

In [9]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with default settings on the scaled training
# data; fit() returns the estimator itself, so it can be chained.
classifier = LogisticRegression().fit(X_train_scaled, y_train)
classifier

LogisticRegression()

In [10]:
# Mean accuracy of the logistic regression on each split.
lr_train_score = classifier.score(X_train_scaled, y_train)
lr_test_score = classifier.score(X_test_scaled, y_test)
print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}")

Training Data Score: 0.7522935779816514
Testing Data Score: 0.72


In [11]:
# Predict the held-out wines and eyeball the first ten against the truth.
predictions = classifier.predict(X_test_scaled)
first_predicted = predictions[:10]
first_actual = y_test.iloc[:10].tolist()
print(f"First 10 Predictions:   {first_predicted}")
print(f"First 10 Actual labels: {first_actual}")

First 10 Predictions:   [0 0 1 0 1 0 0 0 1 1]
First 10 Actual labels: [1, 0, 1, 0, 1, 0, 0, 0, 0, 1]


## SVM

In [27]:
from sklearn.model_selection import train_test_split

# Re-create the train/test split for the SVM section, using the same inputs
# and the same random_state=42.
X_train, X_test, y_train, y_test = train_test_split(
    features, target,
    random_state=42,
)

In [28]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split, then scale both splits with it.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the scaled
# training data; fit() returns the estimator, so the cell still displays it.
model = SVC(kernel='linear').fit(X_train_scaled, y_train)
model

SVC(kernel='linear')

In [30]:
# Mean accuracy of the SVM on the held-out split, shown to three decimals.
svm_test_accuracy = model.score(X_test_scaled, y_test)
print('Test Acc: %.3f' % svm_test_accuracy)

Test Acc: 0.730


In [31]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the SVM on the test split, labelled
# with the human-readable class names.
predictions = model.predict(X_test_scaled)
svm_report = classification_report(
    y_test,
    predictions,
    target_names=target_names,
)
print(svm_report)

              precision    recall  f1-score   support

         bad       0.67      0.77      0.72       178
        good       0.79      0.70      0.74       222

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.74      0.73      0.73       400



In [17]:
# Put the true labels next to the current `predictions` array, keeping the
# original row index of the test split.
actual = y_test.to_frame().assign(Predicted=predictions)
comparison = actual.rename(columns={"quality": "Actual"})
comparison.head()

Unnamed: 0,Actual,Predicted
803,1,0
124,0,0
350,1,1
682,0,0
1326,1,1


## Random Forest


In [23]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [24]:
from sklearn.model_selection import train_test_split

# Re-create the train/test split for the random forest section, using the
# same inputs and the same random_state=42.
split_kwargs = {"random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(features, target, **split_kwargs)

In [25]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: parameters come from the training split and are reused
# unchanged for the test split.
X_scaler = MinMaxScaler()
X_scaler = X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = [
    X_scaler.transform(split) for split in (X_train, X_test)
]

In [26]:
# Random forest of 200 trees, trained on the scaled training data.
# random_state is pinned (matching the seed used for the train/test split)
# so the bootstrap samples and feature sub-sampling, and therefore the
# reported score, are the same on every Restart & Run All.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train_scaled, y_train)
# Mean accuracy on the held-out split; displayed as the cell output.
rf.score(X_test_scaled, y_test)

0.8025