In [133]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [135]:
import warnings
warnings.filterwarnings('ignore')

# Zoo Data

In [137]:
data = pd.read_csv('Zoo.csv')

In [139]:
data.head()

Unnamed: 0,animal name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [141]:
data.shape

(101, 18)

In [143]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal name  101 non-null    object
 1   hair         101 non-null    int64 
 2   feathers     101 non-null    int64 
 3   eggs         101 non-null    int64 
 4   milk         101 non-null    int64 
 5   airborne     101 non-null    int64 
 6   aquatic      101 non-null    int64 
 7   predator     101 non-null    int64 
 8   toothed      101 non-null    int64 
 9   backbone     101 non-null    int64 
 10  breathes     101 non-null    int64 
 11  venomous     101 non-null    int64 
 12  fins         101 non-null    int64 
 13  legs         101 non-null    int64 
 14  tail         101 non-null    int64 
 15  domestic     101 non-null    int64 
 16  catsize      101 non-null    int64 
 17  type         101 non-null    int64 
dtypes: int64(17), object(1)
memory usage: 14.3+ KB


Initially, let us split data into inputs and label and then spli the training and testing data

In [161]:
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

Let us use hypeparameter tuning to get best parameters of k for KNN classification

In [163]:
cv = StratifiedKFold(n_splits = 2)
knn = KNeighborsClassifier()
params = {
    'n_neighbors': range(2, 21),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]   

}
gridsearch = GridSearchCV(knn, params,cv = cv)
gridsearch.fit(x_train, y_train)

In [165]:
print("The best k value is, ", gridsearch.best_params_)

The best k value is,  {'n_neighbors': 2, 'p': 1, 'weights': 'distance'}


Now, let us make KNN moel using the given parameter

In [167]:
model = KNeighborsClassifier(n_neighbors=2, p=1, weights='distance')
model.fit(x_train, y_train)

In [169]:
train_data_pred = model.predict(x_train)

In [171]:
accuracy = accuracy_score(y_train, train_data_pred)
print('The accuracy of the model on test data is, ', accuracy)

The accuracy of the model on test data is,  0.65


In [173]:
report = classification_report(y_train, train_data_pred)
print(report)

              precision    recall  f1-score   support

    antelope       0.33      1.00      0.50         1
        bass       0.00      0.00      0.00         1
        bear       1.00      1.00      1.00         1
        calf       0.33      1.00      0.50         1
        carp       1.00      1.00      1.00         1
        cavy       1.00      1.00      1.00         1
     cheetah       0.00      0.00      0.00         1
     chicken       0.00      0.00      0.00         1
        clam       1.00      1.00      1.00         1
        crab       1.00      1.00      1.00         1
    crayfish       0.50      1.00      0.67         1
        crow       1.00      1.00      1.00         1
        deer       0.00      0.00      0.00         1
     dolphin       0.50      1.00      0.67         1
        dove       0.33      1.00      0.50         1
        duck       1.00      1.00      1.00         1
    elephant       0.00      0.00      0.00         1
    flamingo       1.00    

Let us see accuracy on the testing data

In [175]:
test_data_pred = model.predict(x_test)

In [177]:
accuracy = accuracy_score(y_test, test_data_pred)
print('The accuracy of the model on test data is, ', accuracy)

The accuracy of the model on test data is,  0.0


Now, as we can see that the model is not able to predict classes of testing data. This is because testing data has unqiue class values for which model hasn't been trained. So, we will first train the model for testing data values and then predict the class

In [179]:
model.fit(x_test, y_test)

In [181]:
test_y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, test_y_pred)
print('The accuracy of the model on test data is, ', accuracy)

The accuracy of the model on test data is,  0.7619047619047619


In [183]:
report = classification_report(y_test, test_y_pred)
print(report)

              precision    recall  f1-score   support

    aardvark       1.00      1.00      1.00         1
        boar       0.50      1.00      0.67         1
     buffalo       0.00      0.00      0.00         1
     catfish       0.50      1.00      0.67         1
        chub       0.00      0.00      0.00         1
     dogfish       1.00      1.00      1.00         1
        flea       0.50      1.00      0.67         1
     giraffe       0.33      1.00      0.50         1
     gorilla       1.00      1.00      1.00         1
        hawk       1.00      1.00      1.00         1
        mole       1.00      1.00      1.00         1
        oryx       0.00      0.00      0.00         1
     penguin       1.00      1.00      1.00         1
        puma       0.00      0.00      0.00         1
    pussycat       1.00      1.00      1.00         1
    reindeer       1.00      1.00      1.00         1
        rhea       1.00      1.00      1.00         1
     termite       0.00    

# Glass data

In [207]:
data = pd.read_csv('glass.csv')

In [209]:
data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [211]:
x = data.iloc[:, 0:-1]
y = data.iloc[:, -1]

In [213]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 41)

In [215]:
cv = StratifiedKFold(n_splits = 5)
knn = KNeighborsClassifier()
params = {
    'n_neighbors': range(2, 21),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]   

}
gridsearch = GridSearchCV(knn, params,cv = cv)
gridsearch.fit(x_train, y_train)

In [217]:
print("The best k value is, ", gridsearch.best_params_)

The best k value is,  {'n_neighbors': 2, 'p': 1, 'weights': 'uniform'}


In [219]:
model = KNeighborsClassifier(n_neighbors = 2, p = 1, weights = 'distance')

In [221]:
model.fit(x_train, y_train)

In [223]:
train_data_pred = model.predict(x_train)

In [225]:
accuracy = accuracy_score(y_train, train_data_pred)
print("The accuracy of the model on training data is,", accuracy)

The accuracy of the model on training data is, 1.0


In [227]:
test_data_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, test_data_pred)
print("The accuracy of the model on training data is,", accuracy)

The accuracy of the model on training data is, 0.7209302325581395
