In [160]:
import pandas as pd
import seaborn as sns
import sklearn as sk
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE


In [170]:
wine_df = pd.read_csv("./Data/wine_extracted.csv")
wine_df = wine_df[wine_df.columns[-2:].append(wine_df.columns[:-2])]

In [182]:
wine_df = wine_df[wine_df["quality"]!=9]
wine_df = wine_df[wine_df["quality"]!=3]
wine_df = wine_df[wine_df["Wine"]=="White"]
#wine_df = wine_df[wine_df["quality"]!=4]
#wine_df = wine_df[wine_df["quality"]!=8]

In [183]:
def print_metrics(true_val, pred_val):
    print("The accuracy is ",accuracy_score(true_val, pred_val))
    print(classification_report(true_val, pred_val))
#     print("The precision is ",precision_score(true_val, pred_val, average = "weighted"))
#     print("The recall is ",recall_score(true_val, pred_val, average = "weighted"))
#     print("The f1_score is ",f1_score(true_val, pred_val, average = "weighted"))
    print("The confusion matric is ")
    print(confusion_matrix(true_val, pred_val))
    

## Smote Use

In [184]:
sm = SMOTE()

In [194]:
wine_df.columns

Index(['quality', 'Wine', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [195]:
wine_df_target = wine_df["quality"]
cols =['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']

In [196]:
train_df, test_df = train_test_split(wine_df)

In [197]:
test_df_target, test_df_feature = test_df["quality"], test_df[cols]
train_df_target, train_df_feature = train_df["quality"], train_df[cols]

In [198]:
train_df_feature, train_df_target = sm.fit_resample(train_df_feature,train_df_target)

## SVC

In [191]:
svm_model = SVC(kernel = "linear")
svm_model = svm_model.fit(train_df_feature, train_df_target)

In [192]:
print_metrics(test_df_target, svm_model.predict(test_df_feature))

The accuracy is  0.3338802296964725
              precision    recall  f1-score   support

           4       0.09      0.33      0.14        36
           5       0.47      0.59      0.52       346
           6       0.58      0.22      0.32       569
           7       0.29      0.14      0.19       223
           8       0.10      0.69      0.17        45

    accuracy                           0.33      1219
   macro avg       0.31      0.40      0.27      1219
weighted avg       0.46      0.33      0.34      1219

The confusion matric is 
[[ 12  13   8   0   3]
 [ 53 205  41  18  29]
 [ 52 185 127  58 147]
 [ 12  31  37  32 111]
 [  1   4   6   3  31]]


In [193]:
accuracy_score(test_df_target,svm_model.predict(test_df_feature))

0.3338802296964725

In [129]:
confusion_matrix(test_df_target,svm_model.predict(test_df_feature))

array([[333, 193,   0],
       [213, 494,   0],
       [ 35, 246,   0]], dtype=int64)

## Random Forest

In [199]:
random_forest = RandomForestClassifier()
random_forest = random_forest.fit(train_df_feature, train_df_target)

In [200]:
print_metrics(test_df_target, random_forest.predict(test_df_feature))

The accuracy is  0.6562756357670222
              precision    recall  f1-score   support

           4       0.34      0.51      0.41        41
           5       0.72      0.69      0.70       386
           6       0.73      0.66      0.69       548
           7       0.56      0.64      0.60       190
           8       0.39      0.52      0.45        54

    accuracy                           0.66      1219
   macro avg       0.55      0.61      0.57      1219
weighted avg       0.67      0.66      0.66      1219

The confusion matric is 
[[ 21  15   4   1   0]
 [ 25 267  78  12   4]
 [ 14  87 362  68  17]
 [  0   4  42 122  22]
 [  1   0   9  16  28]]


In [138]:
accuracy_score(test_df_target,random_forest.predict(test_df_feature))

0.6743725231175693

In [139]:
confusion_matrix(test_df_target,random_forest.predict(test_df_feature))

array([[365, 138,  23],
       [136, 464, 107],
       [ 15,  74, 192]], dtype=int64)