In [52]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from scipy.spatial import distance

In [53]:
# Read train and test sets
train = pd.read_csv("trainKNN.txt", header=None)
train.columns = ['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']
train = train.drop('ID', axis=1) # Drop ID since irrelevant to predictions
test = pd.read_csv('testKNN.txt', header=None)
test.columns=['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']
test = test.drop('ID', axis=1)

In [54]:
train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [55]:
train.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
count,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0
mean,1.518295,13.375204,2.75898,1.454337,72.635408,0.519388,8.910714,0.164235,0.050255,2.668367
std,0.003055,0.783145,1.392641,0.491688,0.763578,0.672703,1.42149,0.485198,0.086359,2.062416
min,1.51131,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516458,12.8775,2.3625,1.19,72.3175,0.14,8.22,0.0,0.0,1.0
50%,1.51763,13.28,3.48,1.36,72.81,0.56,8.575,0.0,0.0,2.0
75%,1.518985,13.7925,3.61,1.6225,73.08,0.61,9.0925,0.0,0.09,3.0
max,1.53393,15.79,4.49,3.5,75.18,6.21,16.19,3.15,0.34,7.0


In [56]:
def standardize (df):
    for col in df.columns:
        if col != "Type of glass": # Don't standardize the categories
            df[col] = (df[col] - df[col].mean())/df[col].std()
    return df

In [57]:
# Standardize data
train = standardize(train)
test = standardize(test)
train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,0.888537,0.338119,1.242977,-0.720654,-1.120263,-0.682898,-0.11306,-0.33849,-0.581932,1
1,-0.224221,0.657345,0.603903,-0.191863,0.12388,-0.058551,-0.760269,-0.33849,-0.581932,1
2,-0.692234,0.197659,0.568,0.174223,0.464382,-0.19234,-0.795443,-0.33849,-0.581932,1
3,-0.207857,-0.21095,0.668529,-0.33423,-0.033275,0.075237,-0.485909,-0.33849,-0.581932,1
4,-0.286405,-0.134335,0.618265,-0.43592,0.582248,0.045506,-0.591432,-0.33849,-0.581932,1


In [61]:
# k = 10 neighbors
euclid_model = KNeighborsClassifier(n_neighbors=10, metric=distance.sqeuclidean) # Square Euclidean distance model
manhattan_model = KNeighborsClassifier(n_neighbors=10, metric=distance.cityblock) # Manhattan distance model
x_train = train.drop(["Type of glass"], axis=1)
y_train = train["Type of glass"]
euclid_model.fit(x_train,y_train) # Train models
manhattan_model.fit(x_train, y_train)
x_test = test.drop("Type of glass", axis=1) 
y_test = test["Type of glass"]

In [62]:
# predictions
manhattan_predictions = manhattan_model.predict(x_test)
euclid_predictions = euclid_model.predict(x_test) 
df = pd.DataFrame({'actual': y_test, 'manhattan': manhattan_predictions, 'euclid': euclid_predictions})
df.head()

Unnamed: 0,actual,manhattan,euclid
0,1,1,1
1,1,1,1
2,1,1,1
3,2,2,2
4,2,2,1


In [63]:
# performance Evaluation
manhattan_count = len(df.loc[df['manhattan'] == df['actual']])
euclid_count = len(df.loc[df['euclid'] == df['actual']])
print('Manhattan Accuracy: {}%'.format(round(100*manhattan_count/len(df), 2)))
print(classification_report(y_test, manhattan_predictions, target_names=df['actual'].astype(str).unique()))
print ('\n')
print('Square Euclidean Accuracy: {}%'.format(round(100*euclid_count/len(df), 2)))
print(classification_report(y_test, euclid_predictions, target_names=df['actual'].astype(str).unique()))

Manhattan Accuracy: 72.22%
              precision    recall  f1-score   support

           1       0.75      1.00      0.86         3
           2       0.43      1.00      0.60         3
           3       0.00      0.00      0.00         3
           5       1.00      0.33      0.50         3
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         3

    accuracy                           0.72        18
   macro avg       0.70      0.72      0.66        18
weighted avg       0.70      0.72      0.66        18



Square Euclidean Accuracy: 66.67%
              precision    recall  f1-score   support

           1       0.60      1.00      0.75         3
           2       0.40      0.67      0.50         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      0.67      0.80         3
           7       0.75      1.00      0.86         3

    accuracy  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
