In [85]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

fruits = pd.read_table('fruit_data_with_colors.txt')

In [86]:
fruits.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [87]:
# create a mapping from fruit label value to fruit name to make results easier to interpret
lookup_fruit_name = dict(zip(fruits.fruit_label.unique(), fruits.fruit_name.unique()))   
lookup_fruit_name

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

In [88]:
X = fruits[['mass', 'width', 'height']]
y = fruits['fruit_label']

In [89]:
# Data Standardization
scaler = StandardScaler()

In [90]:
scaler.fit(X)
standardized_data = scaler.transform(X)
print(standardized_data)

[[ 0.52944179  1.59869024 -0.29139659]
 [ 0.30946246  1.10485399 -0.66192242]
 [ 0.23613601  0.36409961 -0.36550175]
 [-1.41370901 -1.11740915 -2.21813091]
 [-1.45037224 -1.36432727 -2.29223608]
 [-1.52369868 -1.6112454  -2.51455158]
 [-1.52369868 -1.48778634 -2.51455158]
 [-1.59702513 -1.6112454  -2.73686708]
 [ 0.27279923 -0.00627758  0.07912925]
 [ 0.16280957  0.36409961 -0.51371209]
 [ 0.0528199  -0.25319571 -0.29139659]
 [ 0.16280957 -0.00627758 -0.06908109]
 [-0.16715944 -0.12973664 -0.43960692]
 [ 0.01615667  0.24064055  0.00502408]
 [-0.20382266  0.61101774 -0.29139659]
 [-0.13049622  0.7344768  -0.43960692]
 [-0.13049622  0.61101774 -0.14318625]
 [ 0.08948312  0.48755867 -0.06908109]
 [-0.02050655  0.48755867 -0.43960692]
 [-0.02050655  0.36409961 -0.36550175]
 [-0.05716977  0.48755867 -0.14318625]
 [-0.13049622  0.36409961 -0.21729142]
 [-0.423802    0.24064055 -0.43960692]
 [ 0.12614634  0.61101774  0.15323441]
 [ 3.2791835   2.33944462  1.26481191]
 [ 3.53582606  2.58636275

In [91]:
X = standardized_data
y = fruits['fruit_label']

In [92]:

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [93]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 10)

In [94]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [95]:
knn.score(X_test, y_test)

0.6666666666666666

In [96]:
# Model Evaluation
# Accuracy Score

# accuracy on training data
X_train_prediction = knn.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [97]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8409090909090909


In [98]:
# accuracy on test data
X_test_prediction = knn.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.6666666666666666


In [99]:
# first example: a small fruit with mass 20g, width 4.3 cm, height 5.5 cm
fruit_prediction = knn.predict([[20, 4.3, 5.5]])
lookup_fruit_name[fruit_prediction[0]]

'orange'

In [100]:
input_data = (100, 6.3, 8.5)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

# first example: a small fruit with mass 20g, width 4.3 cm, height 5.5 cm
fruit_prediction = knn.predict([[20, 4.3, 5.5]])
lookup_fruit_name[fruit_prediction[0]]

[[-1.15706645 -0.99395009  0.59786541]]


'orange'