In [1]:
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Ignore harmless warnings

import warnings
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", 50)

# Import psql to run queries

import pandasql as psql

In [2]:
wine = pd.read_csv(r"C:\Users\Anil\Desktop\data_science\62 Session 24-Aug-2021-20210824\wine.csv",header=0)
wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,WineType
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,0
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,0
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,0
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,0
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,0


In [3]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    int64  
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    int64  
 13  WineT

In [4]:
wine.nunique()

alcohol                         126
malic_acid                      133
ash                              79
alcalinity_of_ash                63
magnesium                        53
total_phenols                    97
flavanoids                      132
nonflavanoid_phenols             39
proanthocyanins                 101
color_intensity                 132
hue                              78
od280/od315_of_diluted_wines    122
proline                         121
WineType                          3
dtype: int64

In [5]:
# Identify the independent and Target (dependent) variables

IndepVar = []
for col in wine.columns:
    if col != 'WineType':
        IndepVar.append(col)

TargetVar = 'WineType'

x = wine[IndepVar]
y = wine[TargetVar]

In [7]:
# Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, stratify=y, random_state=6)
x_test_F1 = x_test.copy()
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((142, 13), (36, 13), (142,), (36,))

In [8]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

## XGBoost Classifier

In [10]:
# XGBClassifier is used for classification problems

from xgboost import XGBClassifier

modelXGB = XGBClassifier()

# Fit the model with train data

modelXGB.fit(x_train, y_train)

# Predict model with test data

y_pred = modelXGB.predict(x_test)

# Evaluation Metrics for MultiClassification Analysis

actual = y_test
predict = y_pred

# Giving number of classes to the labe_classes
label_classes = [0,1,2]

# printing the multiclass confusion matrix
from sklearn.metrics import confusion_matrix,classification_report,multilabel_confusion_matrix
print("Confusion_Matrix:\n",confusion_matrix(actual,predict,labels = label_classes))

matrix = multilabel_confusion_matrix(actual,predict,labels = label_classes) 
# creating a lists to append the metric values by each class for to calcualte overall evaluation metris
avg_accuracy = []
avg_precision = []
avg_recall = []
avg_f1score = []
avg_balanced_accuracy = []
avg_mcc = []

print("\n")
for i in label_classes:
    print("Class: {0}".format(i))
    # separating the true_positive and true_negative values from multilabel confusion_matrix
    matrix_data = matrix[i]
    tp = matrix_data[1][1]
    fn = matrix_data[1][0]
    fp = matrix_data[0][1]
    tn = matrix_data[0][0]
    
    # printing the class wise confusion_matrix
    print("Confusion_Matrix: \n",matrix_data)
    
    # printing the outcome values
    print("Outcome_values:\n",tp,fn,fp,tn)
    
    # calculating the evaluation metrics
    accuracy = round((tp+tn)/(tp+fn+fp+tn),3)
    avg_accuracy.append(accuracy)
    
    precision = round(tp/(tp+fp),3)
    avg_precision.append(precision)
    
    recall = round(tp/(tp+fn),3)
    avg_recall.append(recall)
    
    f1_score = round((2*tp/(2*tp+fp+fn)),3)
    avg_f1score.append(f1_score)
    
    specificity = round(tn/(tn+fp),3)
    
    balanced_accuracy = round((recall + specificity)/2,3)
    avg_balanced_accuracy.append(balanced_accuracy)
    
    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1.
    # A model with a score of +1 is a perfect model and -1 is a poor model

    # importing square root
    from math import sqrt
    n = (tp*tn)-(fp*fn)
    d = sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    mcc = round(n/d,3)
    avg_mcc.append(mcc)
    
    # importing metrics from sklearn
    from sklearn import metrics
    
    print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn));
    print("Accuracy:",round(accuracy*100,2),"%")
    print("Precision:",round(precision*100,2),"%")
    print("Recall:",round(recall*100,3),"%")
    print("F1_score:",f1_score)
    print("Balanced_Accuracy:",round(balanced_accuracy*100,2),"%")
    print("MCC:",mcc)
    print("\n")
# printing the overall_evaluation_metrics
# importing mean from statistics
from statistics import mean
print("-------------------------------------------------------------------\n")
print("Overall_evaluation_Metrics:\n")
print("Accuracy:",round(mean(avg_accuracy)*100,2),"%")
print("Precision:",round(mean(avg_precision)*100,2),"%")
print("Recall:",round(mean(avg_recall)*100,3),"%")
print("F1_score:",round(mean(avg_f1score),3))
print("Balanced_Accuracy:",round(mean(avg_balanced_accuracy)*100,2),"%")
print("MCC:",mean(avg_mcc))


Confusion_Matrix:
 [[12  0  0]
 [ 0 13  1]
 [ 0  0 10]]


Class: 0
Confusion_Matrix: 
 [[24  0]
 [ 0 12]]
Outcome_values:
 12 0 0 24
TP=12, FP=0, TN=24, FN=0
Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1_score: 1.0
Balanced_Accuracy: 100.0 %
MCC: 1.0


Class: 1
Confusion_Matrix: 
 [[22  0]
 [ 1 13]]
Outcome_values:
 13 1 0 22
TP=13, FP=0, TN=22, FN=1
Accuracy: 97.2 %
Precision: 100.0 %
Recall: 92.9 %
F1_score: 0.963
Balanced_Accuracy: 96.4 %
MCC: 0.942


Class: 2
Confusion_Matrix: 
 [[25  1]
 [ 0 10]]
Outcome_values:
 10 0 1 25
TP=10, FP=1, TN=25, FN=0
Accuracy: 97.2 %
Precision: 90.9 %
Recall: 100.0 %
F1_score: 0.952
Balanced_Accuracy: 98.1 %
MCC: 0.935


-------------------------------------------------------------------

Overall_evaluation_Metrics:

Accuracy: 98.13 %
Precision: 96.97 %
Recall: 97.633 %
F1_score: 0.972
Balanced_Accuracy: 98.17 %
MCC: 0.959


## GBClassifier

In [11]:
# GradientBoosting Classifier is used for classification problems

from sklearn.ensemble import GradientBoostingClassifier

modelGBC = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0,
criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
min_impurity_split=None, init=None, random_state=None, max_features=None,
verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1,
n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

# Fit the model with train data

modelGBC.fit(x_train, y_train)

# Predict the model with test data

# Predict model with test data

y_pred = modelXGB.predict(x_test)

# Evaluation Metrics for MultiClassification Analysis

actual = y_test
predict = y_pred

# Giving number of classes to the labe_classes
label_classes = [0,1,2]

# printing the multiclass confusion matrix
from sklearn.metrics import confusion_matrix,classification_report,multilabel_confusion_matrix
print("Confusion_Matrix:\n",confusion_matrix(actual,predict,labels = label_classes))

matrix = multilabel_confusion_matrix(actual,predict,labels = label_classes) 
# creating a lists to append the metric values by each class for to calcualte overall evaluation metris
avg_accuracy = []
avg_precision = []
avg_recall = []
avg_f1score = []
avg_balanced_accuracy = []
avg_mcc = []

print("\n")
for i in label_classes:
    print("Class: {0}".format(i))
    # separating the true_positive and true_negative values from multilabel confusion_matrix
    matrix_data = matrix[i]
    tp = matrix_data[1][1]
    fn = matrix_data[1][0]
    fp = matrix_data[0][1]
    tn = matrix_data[0][0]
    
    # printing the class wise confusion_matrix
    print("Confusion_Matrix: \n",matrix_data)
    
    # printing the outcome values
    print("Outcome_values:\n",tp,fn,fp,tn)
    
    # calculating the evaluation metrics
    accuracy = round((tp+tn)/(tp+fn+fp+tn),3)
    avg_accuracy.append(accuracy)
    
    precision = round(tp/(tp+fp),3)
    avg_precision.append(precision)
    
    recall = round(tp/(tp+fn),3)
    avg_recall.append(recall)
    
    f1_score = round((2*tp/(2*tp+fp+fn)),3)
    avg_f1score.append(f1_score)
    
    specificity = round(tn/(tn+fp),3)
    
    balanced_accuracy = round((recall + specificity)/2,3)
    avg_balanced_accuracy.append(balanced_accuracy)
    
    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1.
    # A model with a score of +1 is a perfect model and -1 is a poor model

    # importing square root
    from math import sqrt
    n = (tp*tn)-(fp*fn)
    d = sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    mcc = round(n/d,3)
    avg_mcc.append(mcc)
    
    # importing metrics from sklearn
    from sklearn import metrics
    
    print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn));
    print("Accuracy:",round(accuracy*100,2),"%")
    print("Precision:",round(precision*100,2),"%")
    print("Recall:",round(recall*100,3),"%")
    print("F1_score:",f1_score)
    print("Balanced_Accuracy:",round(balanced_accuracy*100,2),"%")
    print("MCC:",mcc)
    print("\n")
# printing the overall_evaluation_metrics
# importing mean from statistics
from statistics import mean
print("-------------------------------------------------------------------\n")
print("Overall_evaluation_Metrics:\n")
print("Accuracy:",round(mean(avg_accuracy)*100,2),"%")
print("Precision:",round(mean(avg_precision)*100,2),"%")
print("Recall:",round(mean(avg_recall)*100,3),"%")
print("F1_score:",round(mean(avg_f1score),3))
print("Balanced_Accuracy:",round(mean(avg_balanced_accuracy)*100,2),"%")
print("MCC:",mean(avg_mcc))



Confusion_Matrix:
 [[12  0  0]
 [ 0 13  1]
 [ 0  0 10]]


Class: 0
Confusion_Matrix: 
 [[24  0]
 [ 0 12]]
Outcome_values:
 12 0 0 24
TP=12, FP=0, TN=24, FN=0
Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1_score: 1.0
Balanced_Accuracy: 100.0 %
MCC: 1.0


Class: 1
Confusion_Matrix: 
 [[22  0]
 [ 1 13]]
Outcome_values:
 13 1 0 22
TP=13, FP=0, TN=22, FN=1
Accuracy: 97.2 %
Precision: 100.0 %
Recall: 92.9 %
F1_score: 0.963
Balanced_Accuracy: 96.4 %
MCC: 0.942


Class: 2
Confusion_Matrix: 
 [[25  1]
 [ 0 10]]
Outcome_values:
 10 0 1 25
TP=10, FP=1, TN=25, FN=0
Accuracy: 97.2 %
Precision: 90.9 %
Recall: 100.0 %
F1_score: 0.952
Balanced_Accuracy: 98.1 %
MCC: 0.935


-------------------------------------------------------------------

Overall_evaluation_Metrics:

Accuracy: 98.13 %
Precision: 96.97 %
Recall: 97.633 %
F1_score: 0.972
Balanced_Accuracy: 98.17 %
MCC: 0.959


##  Light GBM

In [12]:
# Light GBM Classifier is used for classification problems

from lightgbm import LGBMClassifier

modelLGBM = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=- 1, learning_rate=0.1, n_estimators=100,
subsample_for_bin=200000, objective=None, class_weight=None, min_split_gain=0.0,
min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0,
colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=- 1,
silent=True, importance_type='split')

# Fit the model with train data

modelLGBM.fit(x_train, y_train)

# Predict model with test data

y_pred = modelXGB.predict(x_test)

# Evaluation Metrics for MultiClassification Analysis

actual = y_test
predict = y_pred

# Giving number of classes to the labe_classes
label_classes = [0,1,2]

# printing the multiclass confusion matrix
from sklearn.metrics import confusion_matrix,classification_report,multilabel_confusion_matrix
print("Confusion_Matrix:\n",confusion_matrix(actual,predict,labels = label_classes))

matrix = multilabel_confusion_matrix(actual,predict,labels = label_classes) 
# creating a lists to append the metric values by each class for to calcualte overall evaluation metris
avg_accuracy = []
avg_precision = []
avg_recall = []
avg_f1score = []
avg_balanced_accuracy = []
avg_mcc = []

print("\n")
for i in label_classes:
    print("Class: {0}".format(i))
    # separating the true_positive and true_negative values from multilabel confusion_matrix
    matrix_data = matrix[i]
    tp = matrix_data[1][1]
    fn = matrix_data[1][0]
    fp = matrix_data[0][1]
    tn = matrix_data[0][0]
    
    # printing the class wise confusion_matrix
    print("Confusion_Matrix: \n",matrix_data)
    
    # printing the outcome values
    print("Outcome_values:\n",tp,fn,fp,tn)
    
    # calculating the evaluation metrics
    accuracy = round((tp+tn)/(tp+fn+fp+tn),3)
    avg_accuracy.append(accuracy)
    
    precision = round(tp/(tp+fp),3)
    avg_precision.append(precision)
    
    recall = round(tp/(tp+fn),3)
    avg_recall.append(recall)
    
    f1_score = round((2*tp/(2*tp+fp+fn)),3)
    avg_f1score.append(f1_score)
    
    specificity = round(tn/(tn+fp),3)
    
    balanced_accuracy = round((recall + specificity)/2,3)
    avg_balanced_accuracy.append(balanced_accuracy)
    
    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1.
    # A model with a score of +1 is a perfect model and -1 is a poor model

    # importing square root
    from math import sqrt
    n = (tp*tn)-(fp*fn)
    d = sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    mcc = round(n/d,3)
    avg_mcc.append(mcc)
    
    # importing metrics from sklearn
    from sklearn import metrics
    
    print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn));
    print("Accuracy:",round(accuracy*100,2),"%")
    print("Precision:",round(precision*100,2),"%")
    print("Recall:",round(recall*100,3),"%")
    print("F1_score:",f1_score)
    print("Balanced_Accuracy:",round(balanced_accuracy*100,2),"%")
    print("MCC:",mcc)
    print("\n")
# printing the overall_evaluation_metrics
# importing mean from statistics
from statistics import mean
print("-------------------------------------------------------------------\n")
print("Overall_evaluation_Metrics:\n")
print("Accuracy:",round(mean(avg_accuracy)*100,2),"%")
print("Precision:",round(mean(avg_precision)*100,2),"%")
print("Recall:",round(mean(avg_recall)*100,3),"%")
print("F1_score:",round(mean(avg_f1score),3))
print("Balanced_Accuracy:",round(mean(avg_balanced_accuracy)*100,2),"%")
print("MCC:",mean(avg_mcc))


Confusion_Matrix:
 [[12  0  0]
 [ 0 13  1]
 [ 0  0 10]]


Class: 0
Confusion_Matrix: 
 [[24  0]
 [ 0 12]]
Outcome_values:
 12 0 0 24
TP=12, FP=0, TN=24, FN=0
Accuracy: 100.0 %
Precision: 100.0 %
Recall: 100.0 %
F1_score: 1.0
Balanced_Accuracy: 100.0 %
MCC: 1.0


Class: 1
Confusion_Matrix: 
 [[22  0]
 [ 1 13]]
Outcome_values:
 13 1 0 22
TP=13, FP=0, TN=22, FN=1
Accuracy: 97.2 %
Precision: 100.0 %
Recall: 92.9 %
F1_score: 0.963
Balanced_Accuracy: 96.4 %
MCC: 0.942


Class: 2
Confusion_Matrix: 
 [[25  1]
 [ 0 10]]
Outcome_values:
 10 0 1 25
TP=10, FP=1, TN=25, FN=0
Accuracy: 97.2 %
Precision: 90.9 %
Recall: 100.0 %
F1_score: 0.952
Balanced_Accuracy: 98.1 %
MCC: 0.935


-------------------------------------------------------------------

Overall_evaluation_Metrics:

Accuracy: 98.13 %
Precision: 96.97 %
Recall: 97.633 %
F1_score: 0.972
Balanced_Accuracy: 98.17 %
MCC: 0.959


## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
modelLR = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, 
                             intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', 
                             max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
modelLR.fit(x_train,y_train)

y_pred = modelLR.predict(x_test)

# Evaluation Metrics for MultiClassification Analysis

actual = y_test
predict = y_pred

# Giving number of classes to the labe_classes
label_classes = [0,1,2]

# printing the multiclass confusion matrix
from sklearn.metrics import confusion_matrix,classification_report,multilabel_confusion_matrix
print("Confusion_Matrix:\n",confusion_matrix(actual,predict,labels = label_classes))

matrix = multilabel_confusion_matrix(actual,predict,labels = label_classes) 
# creating a lists to append the metric values by each class for to calcualte overall evaluation metris
avg_accuracy = []
avg_precision = []
avg_recall = []
avg_f1score = []
avg_balanced_accuracy = []
avg_mcc = []

print("\n")
for i in label_classes:
    print("Class: {0}".format(i))
    # separating the true_positive and true_negative values from multilabel confusion_matrix
    matrix_data = matrix[i]
    tp = matrix_data[1][1]
    fn = matrix_data[1][0]
    fp = matrix_data[0][1]
    tn = matrix_data[0][0]
    
    # printing the class wise confusion_matrix
    print("Confusion_Matrix: \n",matrix_data)
    
    # printing the outcome values
    print("Outcome_values:\n",tp,fn,fp,tn)
    
    # calculating the evaluation metrics
    accuracy = round((tp+tn)/(tp+fn+fp+tn),3)
    avg_accuracy.append(accuracy)
    
    precision = round(tp/(tp+fp),3)
    avg_precision.append(precision)
    
    recall = round(tp/(tp+fn),3)
    avg_recall.append(recall)
    
    f1_score = round((2*tp/(2*tp+fp+fn)),3)
    avg_f1score.append(f1_score)
    
    specificity = round(tn/(tn+fp),3)
    
    balanced_accuracy = round((recall + specificity)/2,3)
    avg_balanced_accuracy.append(balanced_accuracy)
    
    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1.
    # A model with a score of +1 is a perfect model and -1 is a poor model

    # importing square root
    from math import sqrt
    n = (tp*tn)-(fp*fn)
    d = sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    mcc = round(n/d,3)
    avg_mcc.append(mcc)
    
    # importing metrics from sklearn
    from sklearn import metrics
    
    print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn));
    print("Accuracy:",round(accuracy*100,2),"%")
    print("Precision:",round(precision*100,2),"%")
    print("Recall:",round(recall*100,3),"%")
    print("F1_score:",f1_score)
    print("Balanced_Accuracy:",round(balanced_accuracy*100,2),"%")
    print("MCC:",mcc)
    print("\n")
# printing the overall_evaluation_metrics
# importing mean from statistics
from statistics import mean
print("-------------------------------------------------------------------\n")
print("Overall_evaluation_Metrics:\n")
print("Accuracy:",round(mean(avg_accuracy)*100,2),"%")
print("Precision:",round(mean(avg_precision)*100,2),"%")
print("Recall:",round(mean(avg_recall)*100,3),"%")
print("F1_score:",round(mean(avg_f1score),3))
print("Balanced_Accuracy:",round(mean(avg_balanced_accuracy)*100,2),"%")
print("MCC:",mean(avg_mcc))


Confusion_Matrix:
 [[12  0  0]
 [ 1 12  1]
 [ 0  0 10]]


Class: 0
Confusion_Matrix: 
 [[23  1]
 [ 0 12]]
Outcome_values:
 12 0 1 23
TP=12, FP=1, TN=23, FN=0
Accuracy: 97.2 %
Precision: 92.3 %
Recall: 100.0 %
F1_score: 0.96
Balanced_Accuracy: 97.9 %
MCC: 0.941


Class: 1
Confusion_Matrix: 
 [[22  0]
 [ 2 12]]
Outcome_values:
 12 2 0 22
TP=12, FP=0, TN=22, FN=2
Accuracy: 94.4 %
Precision: 100.0 %
Recall: 85.7 %
F1_score: 0.923
Balanced_Accuracy: 92.8 %
MCC: 0.886


Class: 2
Confusion_Matrix: 
 [[25  1]
 [ 0 10]]
Outcome_values:
 10 0 1 25
TP=10, FP=1, TN=25, FN=0
Accuracy: 97.2 %
Precision: 90.9 %
Recall: 100.0 %
F1_score: 0.952
Balanced_Accuracy: 98.1 %
MCC: 0.935


-------------------------------------------------------------------

Overall_evaluation_Metrics:

Accuracy: 96.27 %
Precision: 94.4 %
Recall: 95.233 %
F1_score: 0.945
Balanced_Accuracy: 96.27 %
MCC: 0.9206666666666666
