In [19]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import itertools as it
from sklearn import metrics
from sklearn.base import clone
from sklearn.datasets import load_wine
from sklearn.model_selection import StratifiedKFold,cross_validate, train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,mean_absolute_error,mean_squared_error,r2_score,make_scorer
from abc import ABC, abstractmethod
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB

In [20]:
# Load data: wine features as columns, plus the class *name* of each sample as 'target'
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target_names[data.target]
print(df)

# Train/test split: reproducible 70% random sample for training, remaining rows for testing
train = df.sample(frac = 0.7, random_state = 40)
test = df.drop(train.index)

y_train = train["target"]
x_train = train.drop("target", axis = 1)

y_test = test["target"]
x_test = test.drop("target", axis = 1)

# Training – estimate the per-class Gaussian parameters and the class priors
means = train.groupby("target").mean() # Mean of every feature within each class
var = train.groupby("target").var() # Variance of every feature within each class (pandas default: sample variance, ddof=1)
# Prior = fraction of training rows per class. size() counts rows directly, so the
# result does not depend on which feature column is picked or on missing values in it.
prior = train.groupby("target").size() / len(train)
classes = np.unique(y_train.tolist()) # Sorted array of all class labels seen in training


     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0      14.23        1.71  2.43               15.6      127.0           2.80   
1      13.20        1.78  2.14               11.2      100.0           2.65   
2      13.16        2.36  2.67               18.6      101.0           2.80   
3      14.37        1.95  2.50               16.8      113.0           3.85   
4      13.24        2.59  2.87               21.0      118.0           2.80   
..       ...         ...   ...                ...        ...            ...   
173    13.71        5.65  2.45               20.5       95.0           1.68   
174    13.40        3.91  2.48               23.0      102.0           1.80   
175    13.27        4.28  2.26               20.0      120.0           1.59   
176    13.17        2.59  2.37               20.0      120.0           1.65   
177    14.13        4.10  2.74               24.5       96.0           2.05   

     flavanoids  nonflavanoid_phenols  proanthocyan

In [21]:
#Classification
def Normal(n, mu, var):
    """Return the probability density of Normal(mu, var) evaluated at ``n``.

    Parameters
    ----------
    n : value (or array of values) at which the density is evaluated
    mu : mean of the distribution
    var : variance of the distribution (its square root is taken as the standard deviation)
    """
    sd = np.sqrt(var)
    z = (n - mu) / sd                      # standardised distance from the mean
    normaliser = sd * np.sqrt(2 * np.pi)   # 1 / normaliser scales the curve to unit area
    return (np.e ** (-0.5 * z ** 2)) / normaliser


In [22]:
def Predict(X):
    """Predict a class label for every row of ``X`` with Gaussian Naive Bayes.

    For each row and each class the score is
    ``log(prior[cls]) + sum over features of log N(x | mean, variance)``,
    and the class with the largest score is chosen (first one on ties).

    The Gaussian log-density is computed analytically,
    ``-0.5 * log(2*pi*variance) - (x - mean)**2 / (2*variance)``,
    instead of taking ``log`` of the density. The density underflows to 0 for
    values far from the mean; the analytic form stays finite, so a very
    unlikely class is always penalised rather than given a substitute value.

    Reads the module-level fitted state: ``classes``, ``prior``, ``means``,
    ``var`` and ``x_train`` (only its column names are used).

    Parameters
    ----------
    X : DataFrame containing at least the columns of ``x_train``.

    Returns
    -------
    list with one predicted class label per row of ``X``, in row order.
    """
    features = x_train.columns
    Predictions = []

    for i in X.index: # Loop through each instance
        values = X.loc[i][features]
        ClassScores = []
        for cls in classes: # Loop through each class
            mean = means.loc[cls, features]    # Per-feature mean within class 'cls'
            variance = var.loc[cls, features]  # Per-feature variance within class 'cls'
            # Per-feature Gaussian log-likelihood, evaluated for all features at once
            LogLikelihoods = -0.5 * np.log(2 * np.pi * variance) - (values - mean) ** 2 / (2 * variance)
            # Unnormalised log posterior = log prior + summed log-likelihoods
            ClassScores.append(np.log(prior[cls]) + LogLikelihoods.sum())

        MaxIndex = int(np.argmax(ClassScores)) # Position of the largest log posterior
        Predictions.append(classes[MaxIndex])
    return Predictions


In [23]:
def Accuracy(y, prediction):
    """Return the fraction of positions where ``prediction`` equals ``y``.

    Parameters
    ----------
    y : iterable of true labels
    prediction : iterable of predicted labels, same length as ``y``

    Returns
    -------
    float in [0, 1]: number of matching positions divided by ``len(y)``.

    Raises
    ------
    ValueError
        If the two inputs have different lengths (pairing them with ``zip``
        would otherwise silently drop the unmatched tail and give a misleading score).
    ZeroDivisionError
        If ``y`` is empty.
    """
    y = list(y)
    prediction = list(prediction)
    if len(y) != len(prediction):
        raise ValueError(
            "y and prediction must have the same length, got %d and %d"
            % (len(y), len(prediction))
        )
    score = sum(1 for actual, predicted in zip(y, prediction) if actual == predicted)
    return score / len(y)


In [26]:
# Predict on both splits with the hand-written Gaussian Naive Bayes classifier
PredictTrain = Predict(x_train)
PredictTest = Predict(x_test)

# '%.4f' already rounds to four decimals; rounding to five first would round twice
# and can shift the last printed digit.
print('Training Accuracy: %.4f' % Accuracy(y_train, PredictTrain))
print('Testing Accuracy: %.4f' % Accuracy(y_test, PredictTest))


Training Accuracy: 0.9920
Testing Accuracy: 0.9434


In [27]:
# Reference model: scikit-learn's GaussianNB fitted on the same training split,
# used to sanity-check the hand-written classifier
clf = GaussianNB()
clf.fit(x_train, y_train)
SkTrain = clf.predict(x_train) # Predicting on the train set
SkTest = clf.predict(x_test) # Predicting on the test set

# '%.4f' already rounds to four decimals; rounding to five first would round twice
# and can shift the last printed digit.
print('Training Accuracy: %.4f' % Accuracy(y_train, SkTrain))
print('Testing Accuracy: %.4f' % Accuracy(y_test, SkTest))


Training Accuracy: 0.9920
Testing Accuracy: 0.9434
