In [26]:
pip install scikit-learn -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Importing Libraries

In [27]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from scipy.spatial import distance

Importing the dataset and spliting to train and test

In [3]:
train = pd.read_csv("trainKNN.txt")
train.columns = ['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']
train = train.drop('ID', axis=1) 

test = pd.read_csv('testKNN.txt')
test.columns=['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']
test = test.drop('ID', axis=1)

In [4]:
train.shape

(195, 10)

In [5]:
test.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52152,13.12,3.58,0.9,72.2,0.23,9.82,0.0,0.16,1
1,1.523,13.31,3.58,0.82,71.99,0.12,10.17,0.0,0.03,1
2,1.51709,13.0,3.47,1.79,72.72,0.66,8.18,0.0,0.0,2
3,1.5166,12.99,3.18,1.23,72.97,0.58,8.81,0.0,0.24,2
4,1.51839,12.85,3.67,1.24,72.57,0.62,8.68,0.0,0.35,2


Discribing to find errors

In [6]:
train.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,1.518281,13.373846,2.750103,1.456154,72.639795,0.521744,8.911538,0.165077,0.050513,2.676923
std,0.003057,0.784929,1.390655,0.492293,0.763064,0.673624,1.425102,0.486304,0.086506,2.064234
min,1.51131,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516455,12.875,2.335,1.19,72.33,0.145,8.22,0.0,0.0,1.0
50%,1.51763,13.27,3.48,1.36,72.81,0.56,8.57,0.0,0.0,2.0
75%,1.51895,13.795,3.605,1.625,73.08,0.61,9.105,0.0,0.09,3.0
max,1.53393,15.79,3.98,3.5,75.18,6.21,16.19,3.15,0.34,7.0


In [7]:
test.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,1.518991,13.805294,1.769412,1.37,72.849412,0.257647,9.437647,0.31,0.128235,4.176471
std,0.00281,1.101659,1.744772,0.58676,0.910415,0.272799,1.419386,0.631951,0.169567,2.157409
min,1.51115,12.85,0.0,0.34,71.36,0.0,6.65,0.0,0.0,1.0
25%,1.51711,13.0,0.0,0.9,72.2,0.0,8.62,0.0,0.0,2.0
50%,1.51934,13.38,1.61,1.4,72.72,0.16,8.99,0.0,0.0,5.0
75%,1.52065,14.23,3.54,1.94,73.39,0.56,10.17,0.15,0.24,6.0
max,1.523,17.38,3.78,2.17,75.41,0.76,12.5,1.67,0.51,7.0


Normalizing

In [8]:
def normalize (df):
    for col in df.columns:
        if col != "Type of glass": 
            df[col] = (df[col] - df[col].mean())/df[col].std()
    return df

In [9]:
train = normalize(train)
test = normalize(test)

In [10]:
train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,-0.219548,0.65758,0.611149,-0.195318,0.118214,-0.061969,-0.75892,-0.339452,-0.583925,1
1,-0.687312,0.19894,0.575195,0.170317,0.458946,-0.195574,-0.794005,-0.339452,-0.583925,1
2,-0.203193,-0.20874,0.675867,-0.33751,-0.039046,0.071637,-0.485255,-0.339452,-0.583925,1
3,-0.281699,-0.1323,0.625531,-0.439075,0.576892,0.041947,-0.590511,-0.339452,-0.583925,1
4,-0.759276,-0.74382,0.61834,0.332822,0.432736,0.175553,-0.590511,-0.339452,2.421657,1


In [11]:
test.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,0.900154,-0.622057,1.037722,-0.801009,-0.713314,-0.101346,0.269379,-0.490544,0.187328,1
1,1.426849,-0.44959,1.037722,-0.937351,-0.943978,-0.504574,0.515965,-0.490544,-0.579331,1
2,-0.676372,-0.730983,0.974676,0.715795,-0.142146,1.474908,-0.88605,-0.490544,-0.756252,2
3,-0.85075,-0.740061,0.808465,-0.238598,0.132454,1.181651,-0.442196,-0.490544,0.659119,2
4,-0.213734,-0.867142,1.089304,-0.221556,-0.306906,1.32828,-0.533785,-0.490544,1.30783,2


In [20]:
for i in [1,2,3,4,5,6,7,8,9,10]:
  euclid_model = KNeighborsClassifier(n_neighbors=i, metric=distance.sqeuclidean)
  manhattan_model = KNeighborsClassifier(n_neighbors=i, metric=distance.cityblock) 

  x_train = train.drop(["Type of glass"], axis=1)
  y_train = train["Type of glass"]

  euclid_model.fit(x_train,y_train)
  manhattan_model.fit(x_train, y_train)

  x_test = test.drop("Type of glass", axis=1) 
  y_test = test["Type of glass"]

  print("K value for Euclid  : " , i, " score : ", np.mean(cross_val_score(euclid_model, x_train, y_train, cv=6)))
  print("K value for manhattan : " , i, " score : ", np.mean(cross_val_score(manhattan_model, x_train, y_train, cv=6)))
  print()

K value for Euclid  :  1  score :  0.6253156565656566
K value for manhattan :  1  score :  0.6657196969696969

K value for Euclid  :  2  score :  0.69760101010101
K value for manhattan :  2  score :  0.6717171717171717

K value for Euclid  :  3  score :  0.66635101010101
K value for manhattan :  3  score :  0.6766098484848485

K value for Euclid  :  4  score :  0.6870265151515151
K value for manhattan :  4  score :  0.6816603535353535

K value for Euclid  :  5  score :  0.6871843434343434
K value for manhattan :  5  score :  0.6661931818181818

K value for Euclid  :  6  score :  0.6923926767676768
K value for manhattan :  6  score :  0.6816603535353535

K value for Euclid  :  7  score :  0.6614583333333334
K value for manhattan :  7  score :  0.6666666666666666

K value for Euclid  :  8  score :  0.6611426767676768
K value for manhattan :  8  score :  0.7024936868686869

K value for Euclid  :  9  score :  0.6515151515151515
K value for manhattan :  9  score :  0.6822916666666666

K val

K value of 6 has the better score among the other values

In [29]:
euclid_model = KNeighborsClassifier(n_neighbors=6, metric=distance.sqeuclidean) 
manhattan_model = KNeighborsClassifier(n_neighbors=6, metric=distance.cityblock) 
x_train = train.drop(["Type of glass"], axis=1)
y_train = train["Type of glass"]
euclid_model.fit(x_train,y_train) 
manhattan_model.fit(x_train, y_train)
x_test = test.drop("Type of glass", axis=1) 
y_test = test["Type of glass"]

Pridicting the values.

In [30]:
manhattan_predictions = manhattan_model.predict(x_test)
euclid_predictions = euclid_model.predict(x_test) 
df = pd.DataFrame({'actual': y_test, 'manhattan': manhattan_predictions, 'euclid': euclid_predictions})
df.head()


Unnamed: 0,actual,manhattan,euclid
0,1,1,1
1,1,1,1
2,2,2,2
3,2,1,1
4,2,1,2


Evaluating the Perfomance.

In [34]:
manhattan_count = len(df.loc[df['manhattan'] == df['actual']])
euclid_count = len(df.loc[df['euclid'] == df['actual']])

print('Manhattan Accuracy: {}%'.format(round(100*manhattan_count/len(df), 2)))
print(classification_report(y_test, manhattan_predictions, target_names=df['actual'].astype(str).unique()))
print ()

print('Square Euclidean Accuracy: {}%'.format(round(100*euclid_count/len(df), 2)))
print(classification_report(y_test, euclid_predictions, target_names=df['actual'].astype(str).unique()))

Manhattan Accuracy: 64.71%
              precision    recall  f1-score   support

           1       0.40      1.00      0.57         2
           2       0.25      0.33      0.29         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         3

    accuracy                           0.65        17
   macro avg       0.61      0.67      0.61        17
weighted avg       0.62      0.65      0.61        17


Square Euclidean Accuracy: 58.82%
              precision    recall  f1-score   support

           1       0.50      1.00      0.67         2
           2       0.29      0.67      0.40         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      0.33      0.50         3
           7       1.00      1.00      1.00         3

    accuracy   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
