In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics 

# Path of the fruit measurements file, read with pandas' read_table defaults.
fruit_data_path = "C:/Users/vedan/ML/fruit_data_with_colors.txt"
fruit = pd.read_table(fruit_data_path)

# Show the first rows so the loaded columns can be checked by eye.
fruit.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [2]:
# Columns carried forward; the frame's first column ("Unnamed: 0") is left out.
selected_columns = ["fruit_label", "fruit_name", "fruit_subtype", "mass", "width", "height", "color_score"]
fruit_vector = fruit[selected_columns]

# Features are the four measurements at positions 3-6 of fruit_vector;
# the target is the numeric fruit label at position 0.
feature_columns = selected_columns[3:7]
X = fruit_vector[feature_columns].values
y = fruit_vector[selected_columns[0]].values

# Peek at the first five entries of each array.
print(X[:5])
print(y[:5])

[[192.     8.4    7.3    0.55]
 [180.     8.     6.8    0.59]
 [176.     7.4    7.2    0.6 ]
 [ 86.     6.2    4.7    0.8 ]
 [ 84.     6.     4.6    0.79]]
[1 1 1 2 2]


In [3]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4321,
)

# Fit the scaler on the training rows only, then apply that same scaling to
# both splits so no test-set statistics leak into the transform.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print(X_train[:5])

[[-1.82753952 -1.56031839 -2.94486615  0.54956872]
 [ 0.89715576  0.36161885  2.22491959 -0.68696091]
 [-1.01013093 -1.42303716 -0.27658964 -0.68696091]
 [ 0.07974718  0.22433762 -0.69350784  0.41217654]
 [ 0.21598194  0.77346255  0.05694493  1.51131399]]


In [4]:
# Evaluate k-nearest-neighbours on the held-out split for every neighbour
# count k from 1 to 8 inclusive, printing the confusion matrix and accuracy.
for k in range(1, 9):
    classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
    print(confusion_matrix(y_test, y_pred))
    print("kNN %:", accuracy_pct)

[[7 0 0 0]
 [0 3 0 0]
 [0 0 6 0]
 [0 0 0 2]]
kNN %: 100.0
[[7 0 0 0]
 [0 3 0 0]
 [0 0 6 0]
 [0 0 0 2]]
kNN %: 100.0
[[7 0 0 0]
 [0 3 0 0]
 [0 0 6 0]
 [0 0 0 2]]
kNN %: 100.0
[[7 0 0 0]
 [0 3 0 0]
 [0 0 6 0]
 [0 0 0 2]]
kNN %: 100.0
[[4 0 3 0]
 [0 3 0 0]
 [0 0 6 0]
 [0 0 0 2]]
kNN %: 83.33333333333334
[[4 0 3 0]
 [0 3 0 0]
 [0 0 6 0]
 [0 0 0 2]]
kNN %: 83.33333333333334
[[2 0 5 0]
 [0 0 1 2]
 [0 0 6 0]
 [0 0 0 2]]
kNN %: 55.55555555555556
[[2 0 5 0]
 [0 0 1 2]
 [0 0 6 0]
 [0 0 0 2]]
kNN %: 55.55555555555556


In [5]:
# Train scikit-learn's Gaussian Naive Bayes on the current training split
# and score its predictions on the held-out split.
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print(confusion_matrix(y_test, y_pred))
print("Gaussian NB %:", accuracy_pct)

[[7 0 0 0]
 [0 3 0 0]
 [2 0 4 0]
 [0 0 0 2]]
Gaussian NB %: 88.88888888888889


In [6]:
from math import sqrt
from math import exp
from math import pi
import math

# Rebuild the data for the hand-written classifier: the four measurement
# columns followed by fruit_label as the last column, so every row carries
# its own class. This overwrites the earlier X_train/X_test; the same
# test_size and seed are reused for the split.
label_last_positions = [3, 4, 5, 6, 0]
X = fruit_vector.iloc[:, label_last_positions].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4321
)

def calculate_label_probability(statistics, row):
    """Score ``row`` against every class using the per-class feature summaries.

    Args:
        statistics: dict mapping each class label to a list of
            ``(mean, stdev, count)`` tuples, one tuple per feature.
        row: Indexable sequence whose first ``len(summaries)`` entries are the
            feature values, in the same order as the summaries.

    Returns:
        dict mapping each class label to the class prior (class count divided
        by the total count) multiplied by the Gaussian density of every
        feature value. The scores are not normalised to sum to one.
    """
    # Every feature summary of a class stores that class's row count in slot 2,
    # so the first feature's entry is enough to read the class size.
    class_counts = {label: summaries[0][2] for label, summaries in statistics.items()}
    total = sum(class_counts.values())

    probabilities = dict()
    for class_value, class_summaries in statistics.items():
        # Start from the prior, then fold in one density per feature.
        score = class_counts[class_value] / float(total)
        for index, (mu, sigma, _count) in enumerate(class_summaries):
            score *= calculate_probability(row[index], mu, sigma)
        probabilities[class_value] = score
    return probabilities

def accuracy_score(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Indexable sequence of true labels.
        predicted: Indexable sequence of predicted labels, at least as long
            as ``actual``.

    Returns:
        Accuracy as a float between 0.0 and 100.0.
    """
    total = len(actual)
    hits = sum(1 for position in range(total) if actual[position] == predicted[position])
    return hits / float(total) * 100.0

def calculate_probability(x, mean, stddev, min_stddev=1e-9):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the distribution.
        stddev: Standard deviation of the distribution.
        min_stddev: Floor applied to ``stddev``. A feature whose values are
            all identical has zero spread, which would otherwise divide by
            zero; with the floor the density becomes a sharp spike at
            ``mean`` and 0.0 elsewhere.

    Returns:
        The density as a float.
    """
    # For any stddev above the floor this is exactly the textbook formula.
    spread = max(stddev, min_stddev)
    exponent = -((x - mean) ** 2 / (2 * spread ** 2))
    return (1 / (sqrt(2 * pi) * spread)) * exp(exponent)

def statistics_by_class(data):
    """Compute feature summaries separately for every class found in ``data``.

    Args:
        data: Rows grouped by ``separate_by_label`` and then summarised per
            group by ``get_statistics``.

    Returns:
        dict mapping each class label to the value ``get_statistics`` returns
        for that class's rows.
    """
    return {
        class_value: get_statistics(rows)
        for class_value, rows in separate_by_label(data).items()
    }

def mean(data):
    """Return the arithmetic mean of the values in ``data`` as a float."""
    total = sum(data)
    count = len(data)
    return total / float(count)

def stdev(data):
    """Return the sample standard deviation (``n - 1`` denominator) of ``data``.

    Args:
        data: Sized iterable of numbers containing at least two values.

    Returns:
        The standard deviation as a float.

    Raises:
        ValueError: If ``data`` holds fewer than two values. The ``n - 1``
            denominator is zero in that case: plain floats would raise an
            unhelpful ZeroDivisionError, and NumPy floats would silently
            produce NaN.
    """
    n = len(data)
    if n < 2:
        raise ValueError("stdev requires at least two data points")
    average = sum(data) / float(n)
    squared_deviations = sum((value - average) ** 2 for value in data)
    return sqrt(squared_deviations / (n - 1))

def test(statistics, test_row):
    """Predict the class label for one row.

    Args:
        statistics: Per-class summaries, passed through to
            ``calculate_label_probability``.
        test_row: The row to classify.

    Returns:
        The label with the highest score; the first such label wins a tie.
        ``None`` when there are no classes to score.
    """
    probabilities = calculate_label_probability(statistics, test_row)
    winner = None
    winner_score = -1000000
    for label, score in probabilities.items():
        # Skip any candidate that does not strictly beat the current winner.
        if winner is not None and not (score > winner_score):
            continue
        winner, winner_score = label, score
    return winner

def get_statistics(data):
    """Summarise every column of ``data`` except the last one.

    Args:
        data: Sequence of equal-length rows.

    Returns:
        List of ``(mean, stdev, count)`` tuples, one per column, with the
        entry for the final column removed.
    """
    summaries = []
    for column in zip(*data):
        summaries.append((mean(column), stdev(column), len(column)))
    # The final column (the one separate_by_label uses as the class value)
    # is summarised too and then discarded.
    summaries.pop()
    return summaries

def separate_by_label(data):
    """Group rows by their last element.

    Args:
        data: Sequence of rows; the final element of each row is used as its
            class value.

    Returns:
        dict mapping each class value to the list of rows carrying it, with
        rows kept in their input order.
    """
    groups = {}
    for row in data:
        # setdefault creates the bucket the first time a class value is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups


In [7]:
# Fit the hand-written Gaussian Naive Bayes on the training rows, classify
# every held-out row, and report the share of predictions matching y_test.
stats = statistics_by_class(X_train)
predictions = [test(stats, row) for row in X_test]
counter = sum(1 for predicted, actual in zip(predictions, y_test) if predicted == actual)
print("Manual Gaussian NB %:", counter/len(y_test) * 100)

Manual Gaussian NB %: 88.88888888888889
