## Preprocessing and Visualizing the Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the fruit measurements; read_table's default separator is a tab,
# which is spelled out explicitly here.
fruits_df = pd.read_csv("fruit_data_with_colors.txt", sep="\t")

In [3]:
# Preview the first five rows to see the available columns.
fruits_df.head(n=5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
fruits_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,fruit_label,mass,width,height,color_score
count,59.0,59.0,59.0,59.0,59.0
mean,2.542373,163.118644,7.105085,7.69322,0.762881
std,1.208048,55.018832,0.816938,1.361017,0.076857
min,1.0,76.0,5.8,4.0,0.55
25%,1.0,140.0,6.6,7.2,0.72
50%,3.0,158.0,7.2,7.6,0.75
75%,4.0,177.0,7.5,8.2,0.81
max,4.0,362.0,9.6,10.5,0.93


In [5]:
# Pairwise Pearson correlation between the numeric columns.
# At this point the frame still contains the text columns `fruit_name` and
# `fruit_subtype`. Older pandas silently skipped them, but pandas >= 2.0 raises
# on non-numeric data, so restrict the frame to numeric columns explicitly.
correlation_matrix = fruits_df.select_dtypes(include="number").corr()
correlation_matrix

Unnamed: 0,fruit_label,mass,width,height,color_score
fruit_label,1.0,0.032738,-0.29809,0.508766,-0.310521
mass,0.032738,1.0,0.877687,0.609571,-0.079794
width,-0.29809,0.877687,1.0,0.396848,-0.076576
height,0.508766,0.609571,0.396848,1.0,-0.247047
color_score,-0.310521,-0.079794,-0.076576,-0.247047,1.0


In [6]:
# Count the missing values in each column (all zeros means nothing to impute).
fruits_df.isna().sum(axis=0)

fruit_label      0
fruit_name       0
fruit_subtype    0
mass             0
width            0
height           0
color_score      0
dtype: int64

In [7]:
# Remove the two text columns; only the numeric columns are used from here on.
# This cell overwrites `fruits_df`, so `errors="ignore"` keeps it safe to
# re-run: on a second execution the columns are already gone and a plain
# drop() would raise KeyError.
fruits_df = fruits_df.drop(columns=["fruit_name", "fruit_subtype"], errors="ignore")
fruits_df.head()

Unnamed: 0,fruit_label,mass,width,height,color_score
0,1,192,8.4,7.3,0.55
1,1,180,8.0,6.8,0.59
2,1,176,7.4,7.2,0.6
3,2,86,6.2,4.7,0.8
4,2,84,6.0,4.6,0.79


In [8]:
# Print the column dtypes and non-null counts of the frame.
fruits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   fruit_label  59 non-null     int64  
 1   mass         59 non-null     int64  
 2   width        59 non-null     float64
 3   height       59 non-null     float64
 4   color_score  59 non-null     float64
dtypes: float64(3), int64(2)
memory usage: 2.4 KB


In [9]:
# Number of samples per class label.
fruits_df.fruit_label.value_counts()

1    19
3    19
4    16
2     5
Name: fruit_label, dtype: int64

## Scaling the Data (Standardization or Normalization)

In [10]:
# Feature matrix: every column except the target label.
X = fruits_df.drop(columns=["fruit_label"])
X.head(n=5)

Unnamed: 0,mass,width,height,color_score
0,192,8.4,7.3,0.55
1,180,8.0,6.8,0.59
2,176,7.4,7.2,0.6
3,86,6.2,4.7,0.8
4,84,6.0,4.6,0.79


In [11]:
# Target vector: the class label of each sample.
y = fruits_df.loc[:, "fruit_label"]
y.head(n=5)

0    1
1    1
2    1
3    2
4    2
Name: fruit_label, dtype: int64

In [12]:
# Report both shapes so a row-count mismatch between X and y is easy to spot.
shape_report = f'Shape of X: {X.shape}\nShape of y is {y.shape} '
print(shape_report)

Shape of X: (59, 4)
Shape of y is (59,) 


In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Single scaler instance shared by the training, test and prediction cells.
# The defaults are written out: centre each feature and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

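* Use `fit_transform()` on the training data only: it learns the scaling statistics (mean and standard deviation) from the training set and applies them.
* Use only `transform()` on the test data, so it is scaled with the statistics learned from the training set and no information from the test set leaks into the model.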

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified although the classes are
# imbalanced -- confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Fit the scaler on the training split only, then apply those same statistics
# to both splits so nothing from the test set influences the scaling.
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The scaler returns a plain array; wrap it in a DataFrame for a readable preview.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_train_scaled_df.head(n=5)

Unnamed: 0,mass,width,height,color_score
0,-0.060739,-0.034022,-0.068731,-0.129491
1,0.930481,0.898749,0.246425,0.664343
2,0.296101,-0.034022,0.088847,1.9874
3,0.097857,0.49899,-0.068731,-0.526408
4,-1.64669,-1.633059,-2.668768,0.532038


## Selecting, training and testing the model

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
# k-nearest-neighbours classifier that votes over the 4 closest training samples.
# fit() returns the estimator itself, so construction and training are chained.
clf = KNeighborsClassifier(n_neighbors=4).fit(X_train_scaled, y_train)
clf

KNeighborsClassifier(n_neighbors=4)

In [27]:
# Mean accuracy of the classifier on the scaled hold-out split.
accuracy = clf.score(X=X_test_scaled, y=y_test)
accuracy

0.9166666666666666

### The model is trained on scaled data, so any input taken from the user must be scaled with the same fitted scaler before prediction

In [40]:
# Classify one hand-entered sample, given in the training column order
# (mass, width, height, color_score). These values match a mandarin row of the
# dataset (label 2); [192, 8.4, 7.3, 0.55] would be the first apple row (label 1).
#
# The scaler was fitted on a DataFrame, so the sample is built as a DataFrame
# with the same column names. Passing a bare NumPy array would make
# scikit-learn warn that "X does not have valid feature names" and would
# depend silently on the column order being right.
value = pd.DataFrame([[86, 6.2, 4.7, 0.80]], columns=X.columns)
value_scaled = scaler.transform(value)  # same scaling as the training data
pred = clf.predict(value_scaled)
pred



array([2], dtype=int64)

## Saving the Model

In [41]:
from joblib import dump, load

# Persist the trained classifier (file name and contents unchanged).
dump(clf, "Fruits_classifier.joblib")
# The classifier was trained on scaled features, so it is only usable together
# with the fitted scaler. Save the scaler as well, so that code loading the
# model later can apply the identical transformation before calling predict().
dump(scaler, "Fruits_scaler.joblib")

['Fruits_classifier.joblib']