## Prepare data

In [40]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split 
from sklearn import metrics

In [41]:
def load_wine_data(path_to_dir, filename, csv_separator):
    """Read one delimited text file from a directory into a DataFrame.

    Parameters
    ----------
    path_to_dir : str
        Directory that contains the file.
    filename : str
        Name of the file inside ``path_to_dir``.
    csv_separator : str
        Field delimiter passed to ``pandas.read_csv`` as ``sep``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(path_to_dir, filename), sep=csv_separator)
# The data folder sits next to the notebook's folder: take the parent of the
# current working directory (os.path.dirname strips the last path component)
# and append the "Data set" sub-directory.
PARENT_DIR = os.path.dirname(os.getcwd())
WINE_DIR = os.path.join(PARENT_DIR, "Data set")
print(PARENT_DIR, WINE_DIR, sep="\n")

D:\Documents\GitHub\WineQuality
D:\Documents\GitHub\WineQuality\Data set


In [42]:
df_red_wine = load_wine_data(WINE_DIR, "winequality-red.csv", ";")
df_x = df_red_wine.drop('quality', axis = 1)
df_y = df_red_wine['quality']

# Splitting the dataset.  
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(df_x,df_y, test_size= 0.2)  
%matplotlib inline

## Decision Tree

A decision tree is a graphical, tree-shaped representation of all the possible solutions to a decision.

Each decision is based on a set of conditions, so the decisions the model makes can be easily explained.
#### Terminology
- Root node 
- Child node
- Branch
- Leaf node
- Pruning

### Scale Data 

In [43]:
# Scale every feature with RobustScaler.
# NOTE(review): the scaler is fitted on ALL rows of df_x, test rows included.
# If the scaled data is ever fed to a model evaluated on df_x_test, fit the
# scaler on df_x_train only to avoid leaking test statistics.
# NOTE(review): the classifier cell below trains on the unscaled
# df_x_train, so this frame appears to be for inspection only - confirm.
transform = RobustScaler().fit(df_x)
df_trans_red = transform.transform(df_x)

# Label the scaled array with the feature frame's own columns and index rather
# than slicing df_red_wine.columns[:-1], which silently assumes that 'quality'
# is the last column of the CSV.
df_scale_red = pd.DataFrame(df_trans_red, columns=df_x.columns, index=df_x.index)
df_scale_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.238095,0.72,-0.787879,-0.428571,-0.15,-0.214286,-0.1,0.469799,1.052632,-0.333333,-0.5
1,-0.047619,1.44,-0.787879,0.571429,0.95,0.785714,0.725,0.022371,-0.578947,0.333333,-0.25
2,-0.047619,0.96,-0.666667,0.142857,0.65,0.071429,0.4,0.111857,-0.263158,0.166667,-0.25
3,1.571429,-0.96,0.909091,-0.428571,-0.2,0.214286,0.55,0.559284,-0.789474,-0.222222,-0.25
4,-0.238095,0.72,-0.787879,-0.428571,-0.15,-0.214286,-0.1,0.469799,1.052632,-0.333333,-0.5


### ML algorithm: classifying red wine quality

In [44]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split and predict the quality class of
# every test row. The seed is fixed so that the fitted tree, and therefore the
# predictions and scores below, are reproducible between runs.
TREE_SEED = 42
df_red_decision = DecisionTreeClassifier(random_state=TREE_SEED)
df_red_decision.fit(df_x_train, df_y_train)
df_red_predict = df_red_decision.predict(df_x_test)


### Evaluation

##### Show the accuracy and the balanced accuracy on the test set

In [45]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Print plain accuracy first, then balanced accuracy, both computed from the
# test labels and the tree's predictions.
for score_fn in (accuracy_score, balanced_accuracy_score):
    print(score_fn(df_y_test, df_red_predict))


0.653125
0.4104815326558209


#### Show the precision, f1-score, recall-score
See the [scikit-learn model evaluation guide](https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules) for how these metrics are defined.

In [46]:
from sklearn.metrics import classification_report

# Text table of per-class precision, recall, F1 and support for the test
# predictions.
quality_report = classification_report(df_y_test, df_red_predict)
print(quality_report)

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.08      0.10      0.09        10
           5       0.78      0.65      0.71       143
           6       0.62      0.72      0.66       122
           7       0.62      0.66      0.64        38
           8       0.40      0.33      0.36         6

    accuracy                           0.65       320
   macro avg       0.42      0.41      0.41       320
weighted avg       0.67      0.65      0.66       320



#### Visualize the ROC curve and calculate the AUC score

In [59]:
# True quality labels of the test split as a NumPy array, for visual
# comparison with the predictions.
df_y_test.to_numpy()

array([7, 5, 5, 7, 5, 5, 7, 6, 5, 5, 7, 5, 8, 5, 6, 4, 6, 6, 5, 5, 6, 5,
       5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 5, 6, 5, 7, 5, 6, 5, 5, 6, 3, 5, 4,
       4, 5, 5, 6, 5, 6, 7, 7, 5, 4, 5, 6, 6, 6, 6, 5, 5, 5, 7, 7, 6, 5,
       6, 7, 5, 6, 6, 5, 5, 5, 6, 5, 6, 5, 7, 5, 6, 6, 6, 5, 5, 6, 5, 6,
       6, 5, 5, 5, 7, 5, 5, 6, 6, 5, 5, 5, 5, 7, 6, 6, 5, 5, 7, 6, 7, 6,
       5, 5, 6, 6, 7, 6, 6, 7, 7, 5, 5, 6, 5, 7, 6, 6, 5, 6, 5, 6, 7, 5,
       6, 5, 6, 8, 4, 6, 6, 5, 4, 6, 6, 7, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5,
       5, 7, 7, 6, 6, 6, 6, 5, 5, 4, 5, 6, 5, 5, 5, 6, 6, 6, 4, 6, 5, 6,
       5, 6, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 5, 6, 5, 6, 5, 7, 5, 6, 6, 5,
       6, 5, 5, 6, 5, 6, 7, 5, 6, 5, 5, 5, 6, 5, 5, 5, 7, 6, 6, 5, 5, 5,
       6, 5, 5, 8, 5, 5, 8, 5, 6, 5, 6, 5, 6, 6, 6, 5, 7, 6, 6, 6, 5, 5,
       8, 4, 6, 6, 7, 5, 7, 6, 5, 5, 5, 5, 6, 6, 6, 7, 5, 6, 6, 6, 6, 5,
       4, 6, 6, 6, 5, 8, 5, 5, 5, 5, 6, 6, 7, 6, 6, 5, 5, 6, 7, 5, 5, 6,
       6, 6, 5, 6, 6, 5, 7, 7, 5, 5, 5, 6, 6, 5, 5,

In [58]:
# Quality classes predicted by the decision tree for the test rows.
df_red_predict

array([7, 5, 6, 8, 5, 5, 7, 7, 5, 6, 6, 5, 7, 6, 5, 6, 6, 7, 5, 6, 6, 5,
       5, 6, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 5, 5, 5, 6, 6, 5, 5, 4, 5, 6,
       4, 5, 5, 5, 6, 5, 7, 7, 6, 5, 6, 6, 6, 6, 5, 6, 6, 5, 7, 7, 6, 5,
       6, 7, 6, 6, 6, 5, 5, 5, 6, 4, 6, 5, 7, 4, 7, 6, 6, 5, 6, 6, 6, 6,
       6, 5, 5, 5, 7, 6, 5, 5, 6, 5, 5, 5, 6, 7, 6, 6, 6, 6, 7, 6, 7, 6,
       7, 5, 6, 6, 6, 6, 7, 7, 6, 6, 5, 6, 5, 7, 6, 6, 7, 5, 5, 5, 7, 5,
       6, 5, 6, 6, 6, 6, 5, 5, 5, 5, 6, 7, 6, 5, 7, 5, 5, 5, 5, 5, 6, 5,
       6, 6, 7, 6, 6, 5, 6, 5, 5, 6, 5, 6, 6, 5, 5, 7, 6, 5, 5, 4, 5, 6,
       5, 6, 5, 5, 5, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 5, 6, 6, 5,
       6, 5, 5, 6, 5, 5, 6, 5, 6, 4, 6, 5, 7, 4, 5, 5, 7, 6, 6, 6, 5, 8,
       6, 5, 5, 8, 5, 5, 7, 6, 6, 5, 6, 5, 6, 6, 6, 5, 7, 6, 6, 6, 5, 6,
       6, 5, 6, 6, 5, 5, 6, 6, 5, 4, 5, 4, 6, 6, 6, 7, 5, 6, 7, 6, 5, 6,
       3, 7, 5, 6, 5, 8, 6, 5, 5, 6, 6, 7, 7, 6, 5, 6, 8, 6, 6, 5, 6, 6,
       4, 6, 5, 6, 6, 6, 6, 7, 6, 5, 5, 6, 6, 4, 6,

In [1]:
# NOTE(review): this cell is disabled. `df_y_test` holds six quality classes
# (3-8, see the classification report), and `metrics.roc_curve` only supports
# binary targets, so presumably the call below failed as written - TODO confirm.
# A one-vs-rest ROC built from `predict_proba` scores would be needed instead
# of the hard class predictions in `df_red_predict`.
# import matplotlib.pyplot as plt
# import numpy as np
# from sklearn import metrics
# fpr, tpr, thresholds = metrics.roc_curve(df_y_test.values,df_red_predict)
# roc_auc = metrics.auc(fpr, tpr)
# display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
# display.plot()
# plt.show()