### Analysis of Red wine data

In [94]:
from path import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [95]:
red_data = Path('all_wines.csv')
red_df = pd.read_csv(red_data)
red_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph,quality,id,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,2,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,3,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,4,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,5,red


#### Filter to red wines only

In [96]:
red_df = red_df[red_df["color"] == "red"]
red_df.shape

(1599, 14)

In [97]:
# Missing Values
print(red_df.isna().sum())

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
sulphates               0
alcohol                 0
ph                      0
quality                 0
id                      0
color                   0
dtype: int64


In [98]:
red_df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph,quality,id
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023,800.0
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569,461.735855
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0,1.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0,400.5
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0,800.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0,1199.5
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0,1599.0


In [99]:
corr = red_df.corr()
corr

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph,quality,id
fixed_acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052,-0.268484
volatile_acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558,-0.008815
citric_acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373,-0.153551
residual_sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732,-0.031261
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907,-0.119869
free_sulfur_dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656,0.09048
total_sulfur_dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851,-0.11785
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919,-0.368372
sulphates,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731,0.136005
alcohol,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397,-0.125307


#### Categorize quality to >=7 as Good and < 7 and Not good

In [100]:
red_df['quality_2'] = (red_df['quality'] >=  7).astype(int)

In [101]:
red_df.head(10)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph,quality,id,color,quality_2
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,red,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,2,red,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,3,red,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,4,red,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,5,red,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,6,red,0
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5,7,red,0
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7,8,red,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7,9,red,1
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,10,red,0


#### Create Target and Features
- quality_2 is the binary outcome (1-Good, 0-Not good)
- quality, id, color are dropped since they aren't needed in the model

In [102]:
y = red_df["quality_2"]
X = red_df.drop(columns=["quality","quality_2","id","color"])

In [103]:
X.head(10)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,sulphates,alcohol,ph
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5


In [104]:
y.value_counts()

0    1382
1     217
Name: quality_2, dtype: int64

#### Scale the data
- Scale the data since there is such a wide range of values between features. Scaling will make all the model results comparable

In [105]:
data_scaler = StandardScaler()

In [106]:
X_scaled = data_scaler.fit_transform(X)
# X_scaled

#### Split the data into test and train

In [107]:
# YOUR CODE HERE
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=7)

In [108]:
y_train.value_counts()

0    1033
1     166
Name: quality_2, dtype: int64

In [109]:
X_scaled.shape

(1599, 11)

## Logistic model
_Benefits_ - Simple to implement and effective; it works with feature scaling but does not require it.  
_Limitations_ - Poor performance on non-linear data or where features are not highly correlated.

In [110]:
model1 = LogisticRegression(solver='lbfgs',random_state=7)

In [111]:
model1.fit(X_train, y_train)

LogisticRegression(random_state=7)

#### Model predictions

In [112]:
y_pred1 = model1.predict(X_test)

In [113]:
matrix1 = confusion_matrix(y_test, y_pred1)
print(matrix1)

[[329  20]
 [ 29  22]]


In [114]:
report1 = classification_report(y_test, y_pred1)
print(report1)

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       349
           1       0.52      0.43      0.47        51

    accuracy                           0.88       400
   macro avg       0.72      0.69      0.70       400
weighted avg       0.87      0.88      0.87       400



In [115]:
importance = model1.coef_[0]
columns = X.columns
import_df = pd.DataFrame({'feature':columns, 'importance':importance})
import_df = import_df.sort_values(["importance"], ascending=False)
import_df

Unnamed: 0,feature,importance
10,ph,0.842461
0,fixed_acidity,0.618401
9,alcohol,0.536259
3,residual_sugar,0.35641
5,free_sulfur_dioxide,0.046747
2,citric_acid,0.002807
8,sulphates,-0.019941
4,chlorides,-0.370528
6,total_sulfur_dioxide,-0.392125
1,volatile_acidity,-0.465505


## SVM
_Benefits_ - Does better at handling a higher number of features.  Useful when classes can be easily separated.  Outliers have less impact.  
_Limitations_ - For larger datasets, it requires a large amount of time to process.  In the case of overlapped classes it does nt perform well.

In [116]:
# Linear kernel did not work
model2 = SVC(kernel='rbf',random_state=7)

In [117]:
model2.fit(X_train, y_train)

SVC(random_state=7)

In [118]:
y_pred2 = model2.predict(X_test)

In [119]:
matrix2 = confusion_matrix(y_test, y_pred2)
print(matrix2)

[[336  13]
 [ 34  17]]


In [120]:
report2 = classification_report(y_test, y_pred2)
print(report2)

              precision    recall  f1-score   support

           0       0.91      0.96      0.93       349
           1       0.57      0.33      0.42        51

    accuracy                           0.88       400
   macro avg       0.74      0.65      0.68       400
weighted avg       0.86      0.88      0.87       400



In [121]:
accuracy_score(y_test, y_pred2)

0.8825

## Decision tree
_Benefits_ - Scaling or normalization of data is not required.  Can handle missing values.  
_Limitations_ Prone to overfitting.  Higher time required to train decision trees.

In [122]:
model3 = tree.DecisionTreeClassifier(random_state=7)

In [123]:
model3 = model3.fit(X_train, y_train)

In [124]:
y_pred3 = model3.predict(X_test)

In [125]:
matrix3 = confusion_matrix(y_test, y_pred3)
print(matrix3)

[[305  44]
 [ 24  27]]


In [126]:
report3 = classification_report(y_test, y_pred3)
print(report3)

              precision    recall  f1-score   support

           0       0.93      0.87      0.90       349
           1       0.38      0.53      0.44        51

    accuracy                           0.83       400
   macro avg       0.65      0.70      0.67       400
weighted avg       0.86      0.83      0.84       400



In [127]:
accuracy_score(y_test, y_pred3)

0.83

In [128]:
importance = model3.feature_importances_
columns = X.columns
import_df = pd.DataFrame({'feature':columns, 'importance':importance})
import_df = import_df.sort_values(["importance"], ascending=False)
import_df

Unnamed: 0,feature,importance
10,ph,0.24958
8,sulphates,0.13608
9,alcohol,0.114116
0,fixed_acidity,0.087731
1,volatile_acidity,0.081758
5,free_sulfur_dioxide,0.081022
2,citric_acid,0.078951
3,residual_sugar,0.045753
7,density,0.045384
6,total_sulfur_dioxide,0.040393


## Random Forest Classifier
_Benefits_ - Less prone to overfitting.  
_Limitations_ - Features need to have some predictive power or else they wont work.

In [129]:
model4 = RandomForestClassifier(n_estimators=128,random_state=7) 

In [130]:
model4 = model4.fit(X_train, y_train)

In [131]:
y_pred4 = model4.predict(X_test)

In [132]:
matrix4 = confusion_matrix(y_test, y_pred4)
print(matrix4)

[[333  16]
 [ 25  26]]


In [133]:
report4 = classification_report(y_test, y_pred4)
print(report4)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       349
           1       0.62      0.51      0.56        51

    accuracy                           0.90       400
   macro avg       0.77      0.73      0.75       400
weighted avg       0.89      0.90      0.89       400



In [134]:
accuracy_score(y_test, y_pred4)

0.8975

In [135]:
importance = model4.feature_importances_
columns = X.columns
import_df = pd.DataFrame({'feature':columns, 'importance':importance})
import_df = import_df.sort_values(["importance"], ascending=False)
import_df

Unnamed: 0,feature,importance
10,ph,0.169798
1,volatile_acidity,0.120064
9,alcohol,0.112396
7,density,0.092389
6,total_sulfur_dioxide,0.083003
2,citric_acid,0.079479
0,fixed_acidity,0.075408
4,chlorides,0.07215
3,residual_sugar,0.071893
8,sulphates,0.062759


## Gradient Boosted Tree
_Benefits_ - Often provides more predictive accuracy. Lots of flexibility. No data pre-processing required. 

_Limitations_ - Training generally takes longer. It will continue improving to minimize all errors. This can overemphasize outliers and cause overfitting.

In [136]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.887
Accuracy score (validation): 0.880

Learning rate:  0.1
Accuracy score (training): 0.914
Accuracy score (validation): 0.882

Learning rate:  0.25
Accuracy score (training): 0.944
Accuracy score (validation): 0.882

Learning rate:  0.5
Accuracy score (training): 0.959
Accuracy score (validation): 0.892

Learning rate:  0.75
Accuracy score (training): 0.973
Accuracy score (validation): 0.865

Learning rate:  1
Accuracy score (training): 0.973
Accuracy score (validation): 0.858



#### Choose the learning_rate based on highest Accuracy score = 0.5

In [137]:
model5 = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.5, max_features=10, max_depth=3, random_state=7)

In [138]:
model5 = model5.fit(X_train, y_train)

In [139]:
y_pred5 = model5.predict(X_test)

In [140]:
matrix5 = confusion_matrix(y_test, y_pred5)
print(matrix5)

[[332  17]
 [ 29  22]]


In [141]:
report5 = classification_report(y_test, y_pred5)
print(report5)

              precision    recall  f1-score   support

           0       0.92      0.95      0.94       349
           1       0.56      0.43      0.49        51

    accuracy                           0.89       400
   macro avg       0.74      0.69      0.71       400
weighted avg       0.87      0.89      0.88       400



In [142]:
accuracy_score(y_test, y_pred5)

0.885

In [143]:
importance = model5.feature_importances_
columns = X.columns
import_df = pd.DataFrame({'feature':columns, 'importance':importance})
import_df = import_df.sort_values(["importance"], ascending=False)
import_df

Unnamed: 0,feature,importance
10,ph,0.306515
6,total_sulfur_dioxide,0.131908
9,alcohol,0.12608
1,volatile_acidity,0.122331
5,free_sulfur_dioxide,0.097373
3,residual_sugar,0.054741
4,chlorides,0.04501
2,citric_acid,0.040504
7,density,0.038264
8,sulphates,0.018796


### The next 3 models use oversampling and undersampling because of imbalanced sample sizes.

## Random Over Sampling

_Benefits_ - Randomly duplicate examples in the minority class. This makes it simple to implement and fast to execute. 

_Limitations_ - Increase in training time for the classifier and can lead to model overfitting.

In [144]:
ros = RandomOverSampler(random_state=7)

In [145]:
Counter(y_train)

Counter({0: 1033, 1: 166})

In [146]:
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

#### New balanced sample sizes

In [147]:
Counter(y_resampled)

Counter({0: 1033, 1: 1033})

In [148]:
# Train the Logistic Regression model using the resampled data
model6 = LogisticRegression(solver='lbfgs', random_state=7)

In [149]:
model6.fit(X_resampled, y_resampled)

LogisticRegression(random_state=7)

In [150]:
y_pred6 = model6.predict(X_test)

In [151]:
print(balanced_accuracy_score(y_test, y_pred6))

0.8306365526153154


In [152]:
confusion_matrix(y_test, y_pred6)

array([[265,  84],
       [  5,  46]], dtype=int64)

In [153]:
report6 = classification_report_imbalanced(y_test, y_pred6)
print(report6)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.76      0.90      0.86      0.83      0.68       349
          1       0.35      0.90      0.76      0.51      0.83      0.69        51

avg / total       0.90      0.78      0.88      0.81      0.83      0.68       400



In [154]:
importance = model6.coef_[0]
columns = X.columns
import_df = pd.DataFrame({'feature':columns, 'importance':importance})
import_df = import_df.sort_values(["importance"], ascending=False)
import_df

Unnamed: 0,feature,importance
10,ph,0.942781
0,fixed_acidity,0.898591
9,alcohol,0.602835
3,residual_sugar,0.427579
5,free_sulfur_dioxide,0.091059
8,sulphates,0.035186
4,chlorides,-0.188786
2,citric_acid,-0.202176
6,total_sulfur_dioxide,-0.469851
1,volatile_acidity,-0.499269


## SMOTE (Synthetic Minority Over-sampling Technique) oversampling

_Benefits_ -  Less likely to overfit. Seems to work well with low dimensional data. 

_Limitations_ - SMOTE does not take into consideration neighboring examples can be from other classes. This can increase the overlapping of classes and can introduce additional noise.

In [155]:
X_resampled, y_resampled = SMOTE(random_state=7,sampling_strategy='auto').fit_resample(
   X_train, y_train)

#### New balanced sample sizes

In [156]:
Counter(y_resampled)

Counter({0: 1033, 1: 1033})

In [157]:
model7 = LogisticRegression(solver='lbfgs', random_state=7)
model7.fit(X_resampled, y_resampled)

LogisticRegression(random_state=7)

In [158]:
y_pred7 = model7.predict(X_test)
balanced_accuracy_score(y_test, y_pred7)

0.8447384684532839

In [159]:
confusion_matrix(y_test, y_pred7)

array([[268,  81],
       [  4,  47]], dtype=int64)

In [160]:
report7 = classification_report_imbalanced(y_test, y_pred7)
print(report7)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.77      0.92      0.86      0.84      0.70       349
          1       0.37      0.92      0.77      0.53      0.84      0.72        51

avg / total       0.91      0.79      0.90      0.82      0.84      0.70       400



In [161]:
importance = model7.coef_[0]
columns = X.columns
import_df = pd.DataFrame({'feature':columns, 'importance':importance})
import_df = import_df.sort_values(["importance"], ascending=False)
import_df

Unnamed: 0,feature,importance
10,ph,1.072563
0,fixed_acidity,0.808048
9,alcohol,0.615908
3,residual_sugar,0.376058
5,free_sulfur_dioxide,0.178504
8,sulphates,-0.049215
2,citric_acid,-0.287617
4,chlorides,-0.499886
7,density,-0.555367
1,volatile_acidity,-0.572576


## Cluster Centroid Undersampling

_Benefits_ - Less likely to overfit. 

_Limitations_ - The result of the analysis may be less accurate.

In [162]:
cc = ClusterCentroids(random_state=7)

In [163]:
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [164]:
Counter(y_resampled)

Counter({0: 166, 1: 166})

In [165]:
model8 = LogisticRegression(solver='lbfgs', random_state=7, max_iter=1000)
model8.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=1000, random_state=7)

In [166]:
y_pred8 = model8.predict(X_test)

In [167]:
balanced_accuracy_score(y_test, y_pred8)

0.8165346367773471

In [168]:
confusion_matrix(y_test, y_pred8)

array([[262,  87],
       [  6,  45]], dtype=int64)

In [169]:
report8 = classification_report_imbalanced(y_test, y_pred8)
print(report8)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.75      0.88      0.85      0.81      0.65       349
          1       0.34      0.88      0.75      0.49      0.81      0.67        51

avg / total       0.90      0.77      0.87      0.80      0.81      0.66       400



In [170]:
importance = model8.coef_[0]
columns = X.columns
import_df = pd.DataFrame({'feature':columns, 'importance':importance})
import_df = import_df.sort_values(["importance"], ascending=False)
import_df

Unnamed: 0,feature,importance
10,ph,0.983138
0,fixed_acidity,0.47087
9,alcohol,0.28689
5,free_sulfur_dioxide,0.021236
3,residual_sugar,0.00418
8,sulphates,-0.208731
7,density,-0.248779
2,citric_acid,-0.332573
6,total_sulfur_dioxide,-0.39666
4,chlorides,-0.491394


##  Balanced Random Forest Classifier
_Benefits_ - It reduces overfitting problem in decision trees and also reduces the variance and therefore improves the accuracy. 

_Limitations_ -  Requires much more computational power and resources. Longer Training Period.

In [171]:
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=7) 

In [172]:
model9 = brf_model.fit(X_train, y_train)

In [173]:
y_pred9 = model9.predict(X_test)

In [174]:
balanced_accuracy_score(y_test, y_pred9)

0.8433058036968368

In [175]:
confusion_matrix(y_test, y_pred9)

array([[267,  82],
       [  4,  47]], dtype=int64)

In [176]:
report9 = classification_report_imbalanced(y_test, y_pred9)
print(report9)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.77      0.92      0.86      0.84      0.69       349
          1       0.36      0.92      0.77      0.52      0.84      0.72        51

avg / total       0.91      0.79      0.90      0.82      0.84      0.70       400



In [177]:
importance = model9.feature_importances_
columns = X.columns
import_df = pd.DataFrame({'feature':columns, 'importance':importance})
import_df = import_df.sort_values(["importance"], ascending=False)
import_df

Unnamed: 0,feature,importance
10,ph,0.233499
1,volatile_acidity,0.128257
9,alcohol,0.116748
2,citric_acid,0.083079
7,density,0.076444
6,total_sulfur_dioxide,0.074452
0,fixed_acidity,0.070034
4,chlorides,0.067913
8,sulphates,0.053655
3,residual_sugar,0.049013


## Easy Ensemble AdaBoost 
_Benefits_ - Fast, simple, and easy to program. Also, it has the flexibility to be combined with any machine learning algorithm. 

_Limitations_ - Weak classifiers can lead to overfitting.

In [178]:
eec_model = EasyEnsembleClassifier(n_estimators=100, random_state=7)

In [179]:
model10 = eec_model.fit(X_train, y_train)

In [180]:
y_pred10 = model10.predict(X_test)

In [181]:
balanced_accuracy_score(y_test, y_pred10)

0.8177425698072925

In [182]:
confusion_matrix(y_test, y_pred10)

array([[256,  93],
       [  5,  46]], dtype=int64)

In [183]:
report10 = classification_report_imbalanced(y_test, y_pred10)
print(report10)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.73      0.90      0.84      0.81      0.65       349
          1       0.33      0.90      0.73      0.48      0.81      0.67        51

avg / total       0.90      0.76      0.88      0.79      0.81      0.65       400



#### Attempt a cool graphic of the decision tree model but a little too complicated

In [184]:
# import matplotlib.pyplot as plt
# fn=['ph', 'sulphates', 'alcohol', 'fixed_acidity', 'volatile_acidity', 'free_sulfur_dioxide', 'citric_acid', 'residual_sugar', 'density', 
# 'total_sulfur_dioxide', 'chlorides']
# cn=['Good','Not good']
# fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (6,6), dpi=900)

# tree.plot_tree(model3,
#                feature_names = fn, 
#                class_names=cn,
#                filled = True);
# fig.savefig('Redmodel3Tree.png')

In [185]:
# reportout1 = classification_report(y_test, y_pred1, output_dict=True)
# print(reportout1)

#### Combine all the model outputs for visualization into Tableau

In [187]:
# df_out1 = pd.DataFrame(reportout1).transpose()
# df_out1

In [188]:
reportout1  = classification_report(y_test, y_pred1 , output_dict=True)
reportout2  = classification_report(y_test, y_pred2 , output_dict=True)
reportout3  = classification_report(y_test, y_pred3 , output_dict=True)
reportout4  = classification_report(y_test, y_pred4 , output_dict=True)
reportout5  = classification_report(y_test, y_pred5 , output_dict=True)
reportout6  = classification_report(y_test, y_pred6 , output_dict=True)
reportout7  = classification_report(y_test, y_pred7 , output_dict=True)
reportout8  = classification_report(y_test, y_pred8 , output_dict=True)
reportout9  = classification_report(y_test, y_pred9 , output_dict=True)
reportout10 = classification_report(y_test, y_pred10, output_dict=True)

In [189]:
df_out1  = pd.DataFrame(reportout1).transpose()
df_out2  = pd.DataFrame(reportout2).transpose()
df_out3  = pd.DataFrame(reportout3).transpose()
df_out4  = pd.DataFrame(reportout4).transpose()
df_out5  = pd.DataFrame(reportout5).transpose()
df_out6  = pd.DataFrame(reportout6).transpose()
df_out7  = pd.DataFrame(reportout7).transpose()
df_out8  = pd.DataFrame(reportout8).transpose()
df_out9  = pd.DataFrame(reportout9).transpose()
df_out10 = pd.DataFrame(reportout10).transpose()

In [190]:
df_out1= df_out1[1:3]
df_out1

Unnamed: 0,precision,recall,f1-score,support
1,0.52381,0.431373,0.473118,51.0
accuracy,0.8775,0.8775,0.8775,0.8775


In [191]:
# df_out2

In [192]:
df_out2 = df_out2[1:3]
df_out3 = df_out3[1:3]
df_out4 = df_out4[1:3]
df_out5 = df_out5[1:3]
df_out6 = df_out6[1:3]
df_out7 = df_out7[1:3]
df_out8 = df_out8[1:3]
df_out9 = df_out9[1:3]
df_out10= df_out10[1:3]

In [193]:
# df_out9

In [214]:
output_df = df_out1.append([df_out2, df_out3, df_out4, df_out5, df_out6, df_out7, df_out8, df_out9, df_out10])
output_df.drop(columns="support", axis=1, inplace=True)
output_df

Unnamed: 0,precision,recall,f1-score
1,0.52381,0.431373,0.473118
accuracy,0.8775,0.8775,0.8775
1,0.566667,0.333333,0.419753
accuracy,0.8825,0.8825,0.8825
1,0.380282,0.529412,0.442623
accuracy,0.83,0.83,0.83
1,0.619048,0.509804,0.55914
accuracy,0.8975,0.8975,0.8975
1,0.564103,0.431373,0.488889
accuracy,0.885,0.885,0.885


#### columns
- 0 - Precision
- 1 - Recall
- 2 - F1 score
- 3-5 - Accuracy

In [215]:
output_df = pd.DataFrame([y.values.ravel() for x , y in output_df.groupby(np.arange(len(output_df))//2)])
output_df

Unnamed: 0,0,1,2,3,4,5
0,0.52381,0.431373,0.473118,0.8775,0.8775,0.8775
1,0.566667,0.333333,0.419753,0.8825,0.8825,0.8825
2,0.380282,0.529412,0.442623,0.83,0.83,0.83
3,0.619048,0.509804,0.55914,0.8975,0.8975,0.8975
4,0.564103,0.431373,0.488889,0.885,0.885,0.885
5,0.353846,0.901961,0.508287,0.7775,0.7775,0.7775
6,0.367188,0.921569,0.52514,0.7875,0.7875,0.7875
7,0.340909,0.882353,0.491803,0.7675,0.7675,0.7675
8,0.364341,0.921569,0.522222,0.785,0.785,0.785
9,0.330935,0.901961,0.484211,0.755,0.755,0.755


In [216]:
shorter_df = output_df.drop(output_df.columns[4:6], axis=1)
shorter_df

Unnamed: 0,0,1,2,3
0,0.52381,0.431373,0.473118,0.8775
1,0.566667,0.333333,0.419753,0.8825
2,0.380282,0.529412,0.442623,0.83
3,0.619048,0.509804,0.55914,0.8975
4,0.564103,0.431373,0.488889,0.885
5,0.353846,0.901961,0.508287,0.7775
6,0.367188,0.921569,0.52514,0.7875
7,0.340909,0.882353,0.491803,0.7675
8,0.364341,0.921569,0.522222,0.785
9,0.330935,0.901961,0.484211,0.755


In [217]:
shorter_df = shorter_df.rename({0: 'Precision', 1: 'Recall', 2: 'F1', 3: 'Accuracy'}, axis=1)
shorter_df

# axis 1-columns 0-rows

Unnamed: 0,Precision,Recall,F1,Accuracy
0,0.52381,0.431373,0.473118,0.8775
1,0.566667,0.333333,0.419753,0.8825
2,0.380282,0.529412,0.442623,0.83
3,0.619048,0.509804,0.55914,0.8975
4,0.564103,0.431373,0.488889,0.885
5,0.353846,0.901961,0.508287,0.7775
6,0.367188,0.921569,0.52514,0.7875
7,0.340909,0.882353,0.491803,0.7675
8,0.364341,0.921569,0.522222,0.785
9,0.330935,0.901961,0.484211,0.755


In [218]:
shorter_df = shorter_df.rename({0: 'Model 1: Logistic', 1: 'Model 2: SVM', 2: 'Model 3: Decision Tree', 
                                3: 'Model 4: Random Forest', 4: 'Model 5: Gradient Boosted Tree', 
                                5: 'Model 6: Logisitic-ROS', 6: 'Model 7: Logistic-SMOTE', 
                                7: 'Model 8: Logistic-Cluster centroid US', 8: 'Model 9: Balanced Random Forest Classifier', 
                                9: 'Model 10: Easy Ensemble Ada Boost Classifier', 
                               }
                               , axis=0)
shorter_df


Unnamed: 0,Precision,Recall,F1,Accuracy
Model 1: Logistic,0.52381,0.431373,0.473118,0.8775
Model 2: SVM,0.566667,0.333333,0.419753,0.8825
Model 3: Decision Tree,0.380282,0.529412,0.442623,0.83
Model 4: Random Forest,0.619048,0.509804,0.55914,0.8975
Model 5: Gradient Boosted Tree,0.564103,0.431373,0.488889,0.885
Model 6: Logisitic-ROS,0.353846,0.901961,0.508287,0.7775
Model 7: Logistic-SMOTE,0.367188,0.921569,0.52514,0.7875
Model 8: Logistic-Cluster centroid US,0.340909,0.882353,0.491803,0.7675
Model 9: Balanced Random Forest Classifier,0.364341,0.921569,0.522222,0.785
Model 10: Easy Ensemble Ada Boost Classifier,0.330935,0.901961,0.484211,0.755


In [221]:
shorter_df.to_csv('df_test.csv')