In [10]:
#Import the packages:

import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the fruit measurements table and preview its first rows.
DATA_FILE = 'Datasets used/fruit_data_with_colors.txt'

fruit = pd.read_table(DATA_FILE)
fruit.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [4]:
# Build the model inputs: the four measurement columns are the features,
# and the numeric fruit label is the prediction target.

feature_names = ['mass', 'width', 'height', 'color_score']
X, y = fruit[feature_names], fruit['fruit_label']

In [33]:
# Map each numeric fruit label to its title-cased fruit name.
# Built in a single pass over the data itself, so it covers whichever labels
# are actually present instead of assuming they are exactly 1..4.
label_name = {
    label: name.title()
    for label, name in zip(fruit['fruit_label'], fruit['fruit_name'])
}
label_name

{1: 'Apple', 2: 'Mandarin', 3: 'Orange', 4: 'Lemon'}

In [52]:
# Hold out part of the data for testing; random_state is pinned so the split is repeatable.

split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts
print(f'Splitting done...\nSize of dataset: {len(fruit)}')
print(f'Size of training dataset: {len(X_train)}\nSize of test dataset: {len(X_test)}')

Splitting done...
Size of dataset: 59
Size of training dataset: 44
Size of test dataset: 15


In [53]:
# Rescale the features with a MinMaxScaler.

# Fit on the training split only, then apply that same fitted scaler to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)  # training data: scaled with its own fitted statistics
X_test_scaled = scaler.transform(X_test)  # test data: transformed only, never fitted

print(f'Before transformation the training dataset looked like: \n\n{X_train[:3]}')
print(f'\nAfter transformation the training dataset looked like: \n\n{X_train_scaled[:3]}')
print(f'\nBefore transformation the test dataset looked like: \n\n{X_test[:3]}')
print(f'\nAfter transformation the test dataset looked like: \n\n{X_test_scaled[:3]}')

Before transformation the training dataset looked like: 

    mass  width  height  color_score
42   154    7.2     7.2         0.82
48   174    7.3    10.1         0.72
7     76    5.8     4.0         0.81

After transformation the training dataset looked like: 

[[0.27857143 0.41176471 0.49230769 0.72972973]
 [0.35       0.44117647 0.93846154 0.45945946]
 [0.         0.         0.         0.7027027 ]]

Before transformation the test dataset looked like: 

    mass  width  height  color_score
26   362    9.6     9.2         0.74
35   150    7.1     7.9         0.75
43   194    7.2    10.3         0.70

After transformation the test dataset looked like: 

[[1.02142857 1.11764706 0.8        0.51351351]
 [0.26428571 0.38235294 0.6        0.54054054]
 [0.42142857 0.41176471 0.96923077 0.40540541]]


  return self.partial_fit(X, y)


Note: 
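* fit_transform() first learns statistics from the dataset it is given (measures of central tendency/dispersion etc.; for MinMaxScaler, each feature's minimum and maximum) and then uses them to do the transformation. We want those statistics to come from the training dataset.
* That should not be the case with the testing dataset. We want it to be as independent as possible, so it must not influence the fitted statistics. So, we use transform() for the transformation, which re-uses the statistics already learned from the training dataset. No fitting is needed. This is also why a scaled test value can fall slightly outside the range [0, 1], as in the first test row above.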

In [54]:
# Train a k-nearest-neighbours classifier on the scaled training data.

N_NEIGHBORS = 5
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
mod = knn.fit(X_train_scaled, y_train)  # keep what fit() returns as the model handle

In [55]:
# Score the fitted model on both splits and report the results to two decimals.

train_accuracy = mod.score(X_train_scaled, y_train)
test_accuracy = mod.score(X_test_scaled, y_test)
print(f'Accuracy of the model on training set is: {train_accuracy:.2f}')
print(f'Accuracy of the model on test set is: {test_accuracy:.2f}')

Accuracy of the model on training set is: 0.95
Accuracy of the model on test set is: 1.00


In [59]:
# Predict an unknown case.

# Values are listed in feature_names order: mass, width, height, color_score.
# NOTE(review): a mass of 5.5 is far below the masses shown for this dataset --
# confirm the values were entered in the intended order.
example_fruit = [[5.5, 2.2, 10, 0.7]]

# Label the columns so the sample lines up with the columns the scaler was fitted on,
# rather than relying on positional order alone.
example_fruit_scaled = scaler.transform(pd.DataFrame(example_fruit, columns=feature_names))

# Predict once and reuse the result for both printouts.
predicted_labels = mod.predict(example_fruit_scaled)

print(predicted_labels)
print(f'The fruit, {example_fruit}, as per the model prediction is: {label_name[predicted_labels[0]]}')

[4]
The fruit, [[5.5, 2.2, 10, 0.7]], as per the model prediction is: Lemon


In [77]:
# Validate: average measurements of the lemons in the dataset, for comparison
# with the example fruit that was classified.
lemons = fruit[fruit['fruit_name'] == 'lemon']  # filter once, reuse for every feature

avg_color_score = lemons['color_score'].mean()
avg_mass = lemons['mass'].mean()
avg_width = lemons['width'].mean()
avg_height = lemons['height'].mean()

print(f'The average mass for lemons is : {avg_mass}')
print(f'The average width for lemons is : {avg_width}')
print(f'The average height for lemons is : {avg_height}')
print(f'The average colour score for lemons is : {avg_color_score}')

The average mass for lemons is : 150.0
The average width for lemons is : 6.512499999999999
The average height for lemons is : 8.856250000000001
The average colour score for lemons is : 0.718125
