## Implementation of bagging classifier with Logistic Regression as the base learner and 10 estimators. 

# Drink Quality Dataset
Drink Quality Dataset contains of 700 samples, 11 features and 1 column for 4 quality class labels. https://archive.ics.uci.edu/ml/datasets/wine+quality

In [430]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from collections import Counter
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd

In [449]:
df= pd.read_csv("F:\\Data Science MSC\\Winter 2020\\Assignment3\\drink_quality.csv")
df.shape
df.head()
df.shape


(700, 12)

In [447]:
df.quality.unique()
df.isnull().any()

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [432]:
def prepare_train_test_data(input_dataframe):
    train_data, test_data = train_test_split(input_dataframe, test_size=0.2)    
    return train_data, test_data

In [433]:
def prepare_list_data(unformatted_data):
    data_list = []
    for i in range(len(unformatted_data)):
        data_list.append(list(unformatted_data[i]))
    return data_list

## Sample Data prepare for Bagging Classifier

In [445]:
def bootstrap_sample(train_data):
    max_samples = train_data.shape[0]
    sampled_train_data = train_data.sample(n=max_samples,replace=True)
    #sampled_train_data.to_csv(r'F:\\Data Science MSC\\Winter 2020\\Assignment3\\sampled_data.csv',encoding='utf-8', header='true')
        
    sampled_data_x_train = sampled_train_data.iloc[:, :-1]
    sampled_data_y_train = sampled_train_data.iloc[:,-1]
    
    
    return sampled_data_x_train, sampled_data_y_train

## MyHardVoting used for MyBaggingClassifier 

In [359]:
#MyHardVoting used for both MyBaggingClassifier and myVotingClassifier
def myHardVoting(prediction_data_list):
    n_estimators = len(prediction_data_list)
    num_test_sample = len(prediction_data_list[0])
    
    y_pred = []
    for i in range(num_test_sample):
        temp_y_list = []
        for j in range(n_estimators):
            estimator_pred_list = prediction_data_list[j]
            #print(estimator_pred_list)
            temp_y_list.append(estimator_pred_list[i])
        #print(temp_y_list) #all posible outcome        
        major_vote = Counter(temp_y_list).most_common(1)[0][0]
        y_pred.append(major_vote)
    return y_pred

In [360]:
def myBaggingClassifer(base_classifier, num_estimators, use_bootstrap, train_data, test_data):   
    x_test = test_data.iloc[:, :-1]
    y_test = test_data.iloc[:,-1]

    prediction_list = []
    if use_bootstrap:
        for i in range(num_estimators):
            sampled_data_x_train, sampled_data_y_train = bootstrap_sample(train_data)  
            #print(sampled_data_x_train)
            #print(sampled_data_y_train)
            my_model= base_classifier.fit(sampled_data_x_train, sampled_data_y_train)
            #X_test_stan = scaler.transform(x_test) #standard scaler
            y_pred = my_model.predict(x_test)
            #print(list(y_pred))
            prediction_list.append(list(y_pred))
        #print(prediction_list)
        y_pred = myHardVoting(prediction_list)        
        return y_pred       
    else:
        print("Bootstrap must be true")

In [361]:
def calculate_accuracy(y_test, y_pred):
    return  (accuracy_score(y_test, y_pred))*100.0

In [366]:
###Question 2(a) : Bagging Classifier
train_data, test_data = prepare_train_test_data(df)

x_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:,-1]
x_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:,-1]

scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train) #array
scaled_x_test = scaler.transform(x_test) #array

scaled_x_train_list = prepare_list_data(scaled_x_train) #list
scaled_x_test_list = prepare_list_data(scaled_x_test)   #list

y_train_list = list(y_train) #list
y_test_list = list(y_test)  #list

scaled_train_data_df = pd.DataFrame(list(zip(scaled_x_train_list, y_train_list))) #df
scaled_test_data_df = pd.DataFrame(list(zip(scaled_x_test_list, y_test_list))) #df

num_estimators = 10
use_bootstrap = True

print("Actual result : ", y_test_list)
print('\n')

#Using myBagging Classifier
y_pred = myBaggingClassifer(LogisticRegression(solver='liblinear'), num_estimators, use_bootstrap, train_data, test_data)
print("Predicted result using myBagging Classifer with Plurality vote : ", y_pred)
myBaggingClassifier_accuracy = calculate_accuracy(y_test, y_pred)
print("My BaggingClassifier Accuracy :", myBaggingClassifier_accuracy)
print('\n')

Actual result :  [4, 5, 5, 4, 5, 5, 4, 5, 4, 4, 5, 5, 4, 4, 7, 7, 4, 7, 4, 8, 8, 8, 5, 7, 7, 5, 8, 8, 8, 4, 4, 5, 5, 4, 5, 5, 5, 5, 4, 4, 4, 8, 7, 5, 4, 4, 7, 5, 4, 8, 5, 5, 5, 4, 7, 4, 4, 8, 5, 7, 7, 8, 4, 5, 8, 7, 8, 8, 4, 5, 8, 7, 8, 7, 5, 5, 4, 7, 5, 4, 5, 8, 7, 8, 8, 7, 8, 7, 4, 8, 8, 7, 7, 4, 8, 5, 7, 5, 4, 8, 7, 5, 4, 4, 7, 8, 4, 8, 5, 8, 8, 8, 8, 5, 4, 5, 5, 8, 7, 8, 5, 8, 4, 4, 7, 7, 7, 8, 5, 5, 7, 8, 4, 8, 5, 7, 4, 5, 4, 8]


Predicted result using myBagging Classifer with Plurality vote :  [4, 5, 8, 8, 5, 5, 4, 5, 4, 4, 5, 7, 4, 4, 8, 5, 5, 7, 4, 7, 8, 7, 5, 7, 8, 5, 8, 5, 8, 7, 4, 4, 7, 4, 5, 7, 4, 5, 4, 4, 4, 8, 4, 5, 4, 4, 7, 5, 4, 8, 5, 5, 5, 4, 8, 8, 5, 8, 5, 7, 8, 8, 4, 5, 8, 5, 8, 8, 5, 7, 4, 7, 7, 8, 5, 5, 4, 8, 5, 4, 5, 7, 7, 5, 8, 4, 8, 5, 4, 8, 7, 8, 5, 4, 8, 5, 7, 5, 8, 8, 5, 5, 4, 4, 7, 8, 4, 8, 5, 8, 5, 7, 8, 5, 4, 4, 4, 7, 8, 7, 5, 7, 5, 4, 7, 7, 7, 7, 5, 5, 7, 8, 4, 5, 5, 4, 4, 4, 4, 8]
My BaggingClassifier Accuracy : 65.0




## Compare Accuracy by Using sklearn library function BaggingClassifier

In [374]:
#Using Library
baglog = BaggingClassifier(LogisticRegression(solver='liblinear'), n_estimators = num_estimators, bootstrap=use_bootstrap)
baglog.fit(scaled_x_train, y_train)
y_pred = baglog.predict(scaled_x_test)
print("Predicted result using library Bagging Classifer: ", y_pred)
library_accuracy = calculate_accuracy(y_test, y_pred)
print("\n")
print("Library BaggingClassifier Accuracy:", library_accuracy)

Predicted result using library Bagging Classifer:  [4 5 8 8 5 5 4 5 4 4 5 7 4 4 8 5 4 4 4 7 8 7 5 7 8 5 8 7 8 4 4 4 5 4 5 7 5
 7 4 4 4 8 7 5 4 4 7 5 4 8 5 5 5 4 8 8 5 8 5 7 7 8 4 5 8 5 8 8 8 5 4 7 7 8
 5 5 4 8 5 4 5 7 7 5 8 4 8 5 8 8 7 7 5 4 4 5 7 5 8 8 5 5 4 4 4 8 4 8 5 8 5
 7 8 5 4 4 4 7 8 7 5 7 5 4 7 7 5 7 5 5 7 8 4 5 5 7 4 4 4 8]


Library BaggingClassifier Accuracy: 67.14285714285714
