In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the labelled training set. The path is machine-specific (a local
# Windows Downloads folder) and must be adjusted to run elsewhere.
train_csv_path = "C:\\Users\\AK\\Downloads\\multi_classification_train.csv"
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,ID,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_12,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Class
0,1,-90.532634,-66.953472,79.261856,-111.800554,126.403549,111.509348,207.728953,-26.600556,-106.229324,...,148.391734,77.835738,-10.728207,1.100625,1.035362,-87.010809,-16.242297,-32.074925,-17.666026,1
1,2,-103.15672,-16.0704,87.819228,12.614599,51.34778,67.483725,40.269172,51.442254,-151.486693,...,29.009475,3.995786,-10.86163,-142.605726,-25.924592,-86.755351,-36.479749,-130.246619,-44.143652,2
2,3,24.326153,-92.098078,82.238354,-56.795879,85.203996,127.916504,-90.080307,-128.124071,18.03602,...,111.810098,65.826018,-101.271203,-44.127749,-7.131464,-105.049759,-130.948256,-43.113523,-37.330448,4
3,4,-64.631737,-83.703583,84.135072,-5.516152,74.338494,112.630556,181.576798,-1.054023,60.469865,...,50.047108,100.439101,-117.842955,150.239788,-144.635542,-144.306209,-69.272905,-79.629675,-51.334456,2
4,5,-55.47383,-78.853237,88.129107,75.200543,76.99152,60.224711,-13.106559,-146.773016,-33.490566,...,85.988282,23.38196,11.876102,-188.296503,-80.323929,-56.757987,-20.314172,-42.62517,-24.102753,2


In [3]:
# Positional split of the frame: column 0 is left out (presumably the ID
# column, going by the preview above), the last column is the target, and
# everything in between becomes the feature matrix.
feature_frame = data.iloc[:, 1:-1]
target_series = data.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

In [4]:
# Hold out the trailing 20% of the rows as a development set. The split is
# purely positional (file order); nothing is shuffled.
split_length = int(len(y) * 0.8)
train_rows = slice(None, split_length)
dev_rows = slice(split_length, None)
X_train, y_train = X[train_rows], y[train_rows]
X_dev, y_dev = X[dev_rows], y[dev_rows]

In [5]:
# Standardise features with statistics taken from the training rows only, so
# nothing about the dev/test rows influences the scaling.
#
# The raw rows are re-sliced from X instead of being read from X_train: this
# cell rebinds X_train, so reading it would make a second run of the cell
# compute the mean/std of already-standardised data and silently change
# X_train_mean / X_train_std, which are reused later to scale the test set.
X_train_raw = X[:split_length]
X_dev_raw = X[split_length:]

X_train_mean = np.mean(X_train_raw, axis=0)
X_train_std = np.std(X_train_raw, axis=0)
# A constant feature has zero standard deviation; divide by 1 instead so the
# column becomes all zeros rather than NaN/inf.
X_train_std = np.where(X_train_std == 0, 1.0, X_train_std)

X_train = (X_train_raw - X_train_mean) / X_train_std
X_dev = (X_dev_raw - X_train_mean) / X_train_std

In [6]:
def knn(X, y, X_, k=3):
    """Classify each row of ``X_`` by majority vote among its ``k`` nearest rows of ``X``.

    Distances are Euclidean (``np.linalg.norm`` over the row differences).
    When several labels share the highest vote count, the smallest label
    wins: ``np.unique`` returns the labels sorted and ``np.argmax`` picks the
    first maximum.

    Parameters
    ----------
    X : np.ndarray
        Reference points, one per row.
    y : np.ndarray
        Labels, indexed like the rows of ``X``.
    X_ : iterable of np.ndarray
        Query points; each is subtracted from ``X`` row-wise.
    k : int, default 3
        Number of nearest neighbours that vote.

    Returns
    -------
    np.ndarray
        One predicted label per query point, in query order.
    """
    def vote(query):
        # Distance from this query to every reference row.
        dist_to_refs = np.linalg.norm(X - query, axis=1)
        closest = np.argsort(dist_to_refs)[:k]
        labels, tallies = np.unique(y[closest], return_counts=True)
        return labels[np.argmax(tallies)]

    return np.array([vote(query) for query in X_])

In [7]:
def accuracy(y_true, y_pred):
    """Return the percentage (0-100) of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted with ``np.asarray`` first, so plain Python
    lists are compared element-wise. Comparing two lists directly with ``==``
    yields a single bool, which would make the score either 0 or 100.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Share of matching positions times 100. For empty inputs this is
        ``nan`` (mean of an empty array).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. Without this check,
        broadcast-compatible shapes would be scored silently.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred) * 100

In [8]:
def calculate_precision(y_true, y_pred):
    """Return the unweighted mean of the per-class precision values.

    Only the labels that occur in ``y_true`` are considered. For each label,
    precision is TP / (TP + FP); a label that is never predicted contributes 0.

    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth labels.
    y_pred : np.ndarray
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        Mean of the per-class precisions (a fraction, not a percentage).
    """
    per_class = []
    for label in np.unique(y_true):
        predicted_as_label = y_pred == label
        true_pos = np.sum(predicted_as_label & (y_true == label))
        false_pos = np.sum(predicted_as_label & (y_true != label))
        n_predicted = true_pos + false_pos
        if n_predicted != 0:
            per_class.append(true_pos / n_predicted)
        else:
            # Label never predicted: precision is undefined, scored as 0.
            per_class.append(0)
    return np.mean(per_class)

In [9]:
def calculate_recall(y_true, y_pred):
    """Return the unweighted mean of the per-class recall values.

    The labels are those occurring in ``y_true``. For each label, recall is
    TP / (TP + FN); a zero denominator is scored as 0.

    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth labels.
    y_pred : np.ndarray
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        Mean of the per-class recalls (a fraction, not a percentage).
    """
    per_class = []
    for label in np.unique(y_true):
        is_label = y_true == label
        hits = np.sum(is_label & (y_pred == label))
        misses = np.sum(is_label & (y_pred != label))
        support = hits + misses
        per_class.append(hits / support if support != 0 else 0)
    return np.mean(per_class)

In [10]:
def f1_score(precision, recall):
    """Return the harmonic mean of ``precision`` and ``recall``.

    Both arguments are scalars. When their sum is zero the result is 0, which
    avoids a division by zero.
    """
    total = precision + recall
    if total == 0:
        return 0
    return (2 * precision * recall) / total

In [11]:
# Number of neighbours passed to knn() as `k` in the prediction cells below.
# How 6 was selected is not shown in this notebook.
# With an even k, two labels can tie in the vote; knn() then returns the
# smallest tied label (np.unique sorts labels, np.argmax takes the first max).
best_k = 6

In [None]:
# Predict labels for the training rows and for the held-out dev rows.
# For the training predictions the queries are the reference set itself, so
# every training point is among its own nearest neighbours (distance 0).
y_train_pred = knn(X=X_train, y=y_train, X_=X_train, k=best_k)
y_dev_pred = knn(X=X_train, y=y_train, X_=X_dev, k=best_k)

In [27]:
# Percentage of correctly classified rows on each split.
accuracy_train = accuracy(y_true=y_train, y_pred=y_train_pred)
accuracy_dev = accuracy(y_true=y_dev, y_pred=y_dev_pred)
print(
    f"TRAINING ACCURACY IS: {accuracy_train}%",
    f"DEVELOPMENT ACCURACY IS: {accuracy_dev}%",
    sep="\n",
)

TRAINING ACCURACY IS: 96.13541666666666%
DEVELOPMENT ACCURACY IS: 95.38541666666667%


In [28]:
# Class-averaged precision on each split.
train_precision = calculate_precision(y_true=y_train, y_pred=y_train_pred)
dev_precision = calculate_precision(y_true=y_dev, y_pred=y_dev_pred)
print(
    f"TRAINING PRECISION IS: {train_precision}",
    f"DEVELOPMENT PRECISION IS: {dev_precision}",
    sep="\n",
)

TRAINING PRECISION IS: 0.9597916366689978
DEVELOPMENT PRECISION IS: 0.9516713648968661


In [29]:
# Class-averaged recall on each split.
train_recall = calculate_recall(y_true=y_train, y_pred=y_train_pred)
dev_recall = calculate_recall(y_true=y_dev, y_pred=y_dev_pred)
print(
    f"TRAINING RECALL IS: {train_recall}",
    f"DEVELOPMENT RECALL IS: {dev_recall}",
    sep="\n",
)

TRAINING RECALL IS: 0.9445326055315062
DEVELOPMENT RECALL IS: 0.9356595910260769


In [30]:
# F1 here is the harmonic mean of the class-averaged precision and recall
# computed above, not an average of per-class F1 values.
train_f1score = f1_score(precision=train_precision, recall=train_recall)
dev_f1score = f1_score(precision=dev_precision, recall=dev_recall)
print(
    f"TRAINING F1 SCORE IS: {train_f1score}",
    f"DEVELOPMENT F1 SCORE IS: {dev_f1score}",
    sep="\n",
)

TRAINING F1 SCORE IS: 0.9521009870701075
DEVELOPMENT F1 SCORE IS: 0.9435975574672717


In [32]:
# Load the unlabelled test set. The path is machine-specific (a local
# Windows Downloads folder) and must be adjusted to run elsewhere.
test_csv_path = "C:\\Users\\AK\\Downloads\\multi_classification_test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,ID,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_11,Feature_12,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20
0,1,-53.83735,-32.645889,88.656704,68.460038,77.27898,170.29944,-40.452999,157.171382,81.01974,...,45.21222,79.05223,23.209321,-300.397141,-26.927269,-7.724388,-47.897697,-31.844784,-80.670098,-68.25705
1,2,-72.698043,137.186526,85.122064,-135.777642,72.743374,37.159495,1.219229,86.447392,-29.006295,...,41.602345,68.546653,103.288778,0.74286,31.342856,37.431942,-92.965704,-22.113709,-57.311455,-74.076167
2,3,-87.376758,52.292029,82.285036,-103.899819,76.276041,22.141478,131.322204,69.009157,-121.703035,...,50.667553,94.928521,60.908583,-30.925963,-119.456062,170.92059,-8.940891,-49.424624,-76.538002,-35.694483
3,4,-114.053308,-82.064113,87.3523,311.645082,80.806755,158.822556,-74.591773,254.85675,-279.480145,...,63.089374,131.078916,13.590238,3.596146,-220.548262,-77.689231,21.423893,-64.062843,-69.41693,-12.109111
4,5,-129.305845,-70.794135,87.318648,-39.995258,91.126015,96.4968,0.578289,-101.344881,17.968593,...,-3.900131,-63.876158,27.123655,43.230208,-76.974733,-91.008909,-91.917865,1.852507,-88.130343,-103.909397


In [37]:
# Build the test feature matrix and classify it with the training set as
# the reference points.
#
# A later cell adds a 'Predictions' column to test_data. It is dropped here
# (when present) so that re-running this cell does not feed earlier
# predictions back in as an extra feature, which would also break the
# broadcast against the training statistics.
test_features = test_data.drop(columns=["Predictions"], errors="ignore")
# Column 0 is skipped, mirroring how X was built from the training frame.
X_test_raw = test_features.iloc[:, 1:].to_numpy()
assert X_test_raw.shape[1] == X_train.shape[1], "test/train feature count mismatch"
# Scale with the training-set statistics, exactly as was done for X_dev.
X_test = (X_test_raw - X_train_mean) / X_train_std
y_test_pred = knn(X_train, y_train, X_test, best_k)

In [40]:
# Wrap the predicted test labels in a DataFrame (default integer index and column name).
predictions = pd.DataFrame(data=y_test_pred)

In [41]:
# Attach the predicted labels to the test frame as a 'Predictions' column.
# This mutates test_data in place; `predictions` is the DataFrame built from
# y_test_pred in the preceding cell.
test_data['Predictions'] = predictions

In [43]:
# Write the test frame (with whatever columns it currently holds) to a CSV
# file; the relative name resolves against the current working directory.
output_csv_name = "MULTI CLASSIFICATION USING KNN.csv"
test_data.to_csv(output_csv_name)