In [1]:
import pandas as pd
import numpy as np
# Data Preparation
df = pd.read_csv('16p.csv', engine='python', encoding='cp1252')
df.drop(['Response Id'], axis = 1 , inplace = True)
df['Personality']=df['Personality'].replace({'ESTJ':0,'ENTJ': 1,'ESFJ': 2,'ENFJ' :3,'ISTJ' :4,'ISFJ' :5,'INTJ' :6,'INFJ' :7,'ESTP': 8,'ESFP': 9,'ENTP' :10,'ENFP': 11,'ISTP': 12,'ISFP': 13,'INTP' :14, 'INFP' :15})

#I split dependent and independent varibles
df_i = df.drop(['Personality'], axis=1) # Independent varibles
df_d = df['Personality'] # Dependent varibles

# Transform dataframe to numpy array
df_i_np = df_i.to_numpy()
df_d_np = df_d.to_numpy()

# I normalize Independent Data and assign this array to array named 'dfi'
dfi = (df_i_np+3)/6
print(dfi)

[[0.5        0.5        0.5        ... 0.5        0.5        0.5       ]
 [0.5        0.5        0.16666667 ... 0.33333333 0.33333333 1.        ]
 [0.5        0.5        0.83333333 ... 0.66666667 0.83333333 0.66666667]
 ...
 [0.5        0.5        0.66666667 ... 0.33333333 0.5        0.33333333]
 [0.5        0.5        0.66666667 ... 0.5        0.66666667 0.5       ]
 [0.5        0.5        0.83333333 ... 0.66666667 0.5        0.33333333]]


In [2]:
# I split the Dataset to train and test by ratio %80.
#train,test = np.split(dfi,[int(0.8 * len(dfi))]
train = dfi[:47999]                         
test= dfi[47999:]                     

In [3]:

train_class = df_d_np[:47999]                         
test_class = df_d_np[47999:] 


In [4]:
print(len(train_class)) 

47999


In [5]:
print(len(test_class))

12000


In [6]:
def prediction(neighbors_list):
    prediction = max(set(neighbors_list), key=neighbors_list.count)
    return prediction

In [7]:

def KNN(train,test,k,train_class,test_class):
    # Calculating euclidian distance = sqrt(x-y)^2 = sqrt(x^2 -2xy + y^2)^2
    x_2 = np.sum(test**2, axis=1) 
    y_2 = np.sum(train**2, axis=1)
    xy = np.matmul(test, train.T)
    x_2 = x_2.reshape(-1, 1)
    distances = np.sqrt(x_2 - 2*xy + y_2)
    sorted_indices = np.argsort(distances)
    
    # Slicing array, total k indices 
    k_indices= sorted_indices[:,:k] 
    # Creating a confusion matrix
    con_mat=np.zeros((16,16))
    
    for i in range(len(test)):
        labels = train_class[k_indices[i]] # Take labels of k nearest neighbors.
        pred = prediction(list(labels)) # Making prediciton
        if pred != test_class[i]:
            con_mat[test_class[i]][pred]+=1
            print('Wrong Prediction',i+47999,'=> prediction:',pred,' real class:',test_class[i])
        else:
            con_mat[pred][pred]+=1 # I add to 1 value for every correct prediction to TP categhory.
            
    
    # ACCURACY
    sum_TP = 0
    sum_all = np.sum(con_mat)
    for x in range(16):
        sum_TP += con_mat[x][x] 
    accuracy = sum_TP/sum_all
    print('For K = ',k)
    print('accuracy:', accuracy)
    
    # PRECISSION AND RECALL
    precision, recall = 0,0
    # I calculate precision and recall for each class(0-15)
    for j in range(16):
        TP = con_mat[j][j]
        FP = np.sum(con_mat,axis=0)[j] - con_mat[j][j] # sum of all column - TP
        FN = np.sum(con_mat,axis=1)[j] - con_mat[j][j] # sum of all row - TP
    
        precision += TP / (TP + FP)
        recall += TP / (TP + FN )
        
    precision = precision/ 16 # For average precision
    recall = recall/ 16 # For average recall
    print('precision:', precision)
    print('recall:', recall)

In [8]:
KNN(train,test,5,train_class,test_class)

Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49036 => prediction: 14  real class: 7
Wrong Prediction 49244 => prediction: 14  real class: 9
Wrong Prediction 49366 => prediction: 3  real class: 11
Wrong Prediction 49416 => prediction: 7  real class: 3
Wrong Prediction 49465 => prediction: 9  real class: 3
Wrong Prediction 49486 => prediction: 2  real class: 7
Wrong Prediction 49581 => prediction: 15  real class: 6
Wrong Prediction 49615 => prediction: 3  real class: 2
Wrong Prediction 49690 => prediction: 3  real class: 15
Wrong Prediction 49960 => prediction: 4  real class: 15
Wrong Prediction 49974 => prediction: 13  real class: 4
Wrong Prediction 50044 => prediction: 15  real class: 4
Wrong Prediction 50045 => prediction: 0  real class: 5
Wrong Prediction 50170 => prediction: 5  real class:

In [9]:
KNN(train,test,1,train_class,test_class)

Wrong Prediction 48076 => prediction: 13  real class: 14
Wrong Prediction 48082 => prediction: 9  real class: 11
Wrong Prediction 48189 => prediction: 1  real class: 13
Wrong Prediction 48207 => prediction: 8  real class: 12
Wrong Prediction 48332 => prediction: 6  real class: 15
Wrong Prediction 48391 => prediction: 2  real class: 6
Wrong Prediction 48413 => prediction: 13  real class: 7
Wrong Prediction 48439 => prediction: 3  real class: 13
Wrong Prediction 48473 => prediction: 4  real class: 15
Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48588 => prediction: 5  real class: 8
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48812 => prediction: 15  real class: 0
Wrong Prediction 48905 => prediction: 2  real class: 6
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49007 => prediction: 6  real class: 12
Wrong Prediction 49009 => prediction: 5  real cla

In [10]:
KNN(train,test,3,train_class,test_class)

Wrong Prediction 48391 => prediction: 2  real class: 6
Wrong Prediction 48413 => prediction: 13  real class: 7
Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49036 => prediction: 14  real class: 7
Wrong Prediction 49244 => prediction: 14  real class: 9
Wrong Prediction 49366 => prediction: 3  real class: 11
Wrong Prediction 49416 => prediction: 7  real class: 3
Wrong Prediction 49465 => prediction: 9  real class: 3
Wrong Prediction 49486 => prediction: 2  real class: 7
Wrong Prediction 49581 => prediction: 15  real class: 6
Wrong Prediction 49615 => prediction: 3  real class: 2
Wrong Prediction 49690 => prediction: 3  real class: 15
Wrong Prediction 49960 => prediction: 4  real class: 15
Wrong Prediction 49974 => prediction: 13  real class: 4
Wrong Prediction 50044 => prediction: 15  real class

In [11]:
KNN(train,test,7,train_class,test_class)

Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49036 => prediction: 14  real class: 7
Wrong Prediction 49244 => prediction: 14  real class: 9
Wrong Prediction 49366 => prediction: 3  real class: 11
Wrong Prediction 49416 => prediction: 7  real class: 3
Wrong Prediction 49465 => prediction: 9  real class: 3
Wrong Prediction 49486 => prediction: 2  real class: 7
Wrong Prediction 49581 => prediction: 15  real class: 6
Wrong Prediction 49615 => prediction: 3  real class: 2
Wrong Prediction 49690 => prediction: 3  real class: 15
Wrong Prediction 49960 => prediction: 4  real class: 15
Wrong Prediction 49974 => prediction: 13  real class: 4
Wrong Prediction 50044 => prediction: 15  real class: 4
Wrong Prediction 50170 => prediction: 5  real class: 14
Wrong Prediction 50409 => prediction: 2  real class

In [12]:
KNN(train,test,9,train_class,test_class)

Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49036 => prediction: 14  real class: 7
Wrong Prediction 49244 => prediction: 14  real class: 9
Wrong Prediction 49366 => prediction: 3  real class: 11
Wrong Prediction 49416 => prediction: 7  real class: 3
Wrong Prediction 49465 => prediction: 9  real class: 3
Wrong Prediction 49486 => prediction: 2  real class: 7
Wrong Prediction 49581 => prediction: 15  real class: 6
Wrong Prediction 49615 => prediction: 3  real class: 2
Wrong Prediction 49690 => prediction: 3  real class: 15
Wrong Prediction 49960 => prediction: 4  real class: 15
Wrong Prediction 49974 => prediction: 13  real class: 4
Wrong Prediction 50044 => prediction: 15  real class: 4
Wrong Prediction 50170 => prediction: 5  real class: 14
Wrong Prediction 50409 => prediction: 2  real class

In [14]:
# WITHOUT NORMALIZATION 
df_i_np = df_i.to_numpy()
df_d_np = df_d.to_numpy()

train_wn = df_i_np[:47999]  # Train_wn stands for "train without normalization".                       
test_wn = df_i_np[47999:]  # Test_wn stands for "test without normalization".

In [15]:
KNN(train_wn,test_wn,1,train_class,test_class)

Wrong Prediction 48076 => prediction: 13  real class: 14
Wrong Prediction 48082 => prediction: 9  real class: 11
Wrong Prediction 48189 => prediction: 1  real class: 13
Wrong Prediction 48207 => prediction: 8  real class: 12
Wrong Prediction 48332 => prediction: 6  real class: 15
Wrong Prediction 48391 => prediction: 2  real class: 6
Wrong Prediction 48413 => prediction: 13  real class: 7
Wrong Prediction 48439 => prediction: 3  real class: 13
Wrong Prediction 48538 => prediction: 1  real class: 13
Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48588 => prediction: 5  real class: 8
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48812 => prediction: 15  real class: 0
Wrong Prediction 48905 => prediction: 2  real class: 6
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49007 => prediction: 6  real class: 12
Wrong Prediction 49009 => prediction: 5  real cla

Wrong Prediction 54849 => prediction: 1  real class: 2
Wrong Prediction 54863 => prediction: 13  real class: 3
Wrong Prediction 54986 => prediction: 13  real class: 3
Wrong Prediction 55084 => prediction: 15  real class: 13
Wrong Prediction 55272 => prediction: 4  real class: 15
Wrong Prediction 55366 => prediction: 15  real class: 10
Wrong Prediction 55390 => prediction: 14  real class: 5
Wrong Prediction 55399 => prediction: 7  real class: 6
Wrong Prediction 55403 => prediction: 11  real class: 15
Wrong Prediction 55430 => prediction: 12  real class: 2
Wrong Prediction 55448 => prediction: 10  real class: 6
Wrong Prediction 55489 => prediction: 7  real class: 1
Wrong Prediction 55509 => prediction: 6  real class: 5
Wrong Prediction 55512 => prediction: 10  real class: 11
Wrong Prediction 55517 => prediction: 14  real class: 13
Wrong Prediction 55526 => prediction: 15  real class: 5
Wrong Prediction 55535 => prediction: 4  real class: 14
Wrong Prediction 55536 => prediction: 1  real c

In [16]:
KNN(train_wn,test_wn,3,train_class,test_class)

Wrong Prediction 48413 => prediction: 13  real class: 7
Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49036 => prediction: 14  real class: 7
Wrong Prediction 49244 => prediction: 14  real class: 9
Wrong Prediction 49366 => prediction: 3  real class: 11
Wrong Prediction 49416 => prediction: 7  real class: 3
Wrong Prediction 49465 => prediction: 9  real class: 3
Wrong Prediction 49486 => prediction: 2  real class: 7
Wrong Prediction 49581 => prediction: 15  real class: 6
Wrong Prediction 49615 => prediction: 3  real class: 2
Wrong Prediction 49690 => prediction: 3  real class: 15
Wrong Prediction 49960 => prediction: 4  real class: 15
Wrong Prediction 49974 => prediction: 13  real class: 4
Wrong Prediction 50044 => prediction: 15  real class: 4
Wrong Prediction 50045 => prediction: 0  real class

In [17]:
KNN(train_wn,test_wn,5,train_class,test_class)


Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49036 => prediction: 14  real class: 7
Wrong Prediction 49244 => prediction: 14  real class: 9
Wrong Prediction 49366 => prediction: 3  real class: 11
Wrong Prediction 49416 => prediction: 7  real class: 3
Wrong Prediction 49465 => prediction: 9  real class: 3
Wrong Prediction 49486 => prediction: 2  real class: 7
Wrong Prediction 49581 => prediction: 15  real class: 6
Wrong Prediction 49615 => prediction: 3  real class: 2
Wrong Prediction 49690 => prediction: 3  real class: 15
Wrong Prediction 49960 => prediction: 4  real class: 15
Wrong Prediction 49974 => prediction: 13  real class: 4
Wrong Prediction 50044 => prediction: 15  real class: 4
Wrong Prediction 50045 => prediction: 0  real class: 5
Wrong Prediction 50170 => prediction: 5  real class:

In [18]:
KNN(train_wn,test_wn,7,train_class,test_class)

Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49036 => prediction: 14  real class: 7
Wrong Prediction 49244 => prediction: 14  real class: 9
Wrong Prediction 49366 => prediction: 3  real class: 11
Wrong Prediction 49416 => prediction: 7  real class: 3
Wrong Prediction 49465 => prediction: 9  real class: 3
Wrong Prediction 49486 => prediction: 2  real class: 7
Wrong Prediction 49581 => prediction: 15  real class: 6
Wrong Prediction 49615 => prediction: 3  real class: 2
Wrong Prediction 49690 => prediction: 3  real class: 15
Wrong Prediction 49960 => prediction: 4  real class: 15
Wrong Prediction 49974 => prediction: 13  real class: 4
Wrong Prediction 50044 => prediction: 15  real class: 4
Wrong Prediction 50045 => prediction: 0  real class: 5
Wrong Prediction 50170 => prediction: 5  real class:

In [19]:
KNN(train_wn,test_wn,9,train_class,test_class)

Wrong Prediction 48561 => prediction: 3  real class: 0
Wrong Prediction 48727 => prediction: 13  real class: 14
Wrong Prediction 48741 => prediction: 15  real class: 13
Wrong Prediction 48957 => prediction: 14  real class: 9
Wrong Prediction 49036 => prediction: 14  real class: 7
Wrong Prediction 49244 => prediction: 14  real class: 9
Wrong Prediction 49366 => prediction: 3  real class: 11
Wrong Prediction 49416 => prediction: 7  real class: 3
Wrong Prediction 49465 => prediction: 9  real class: 3
Wrong Prediction 49486 => prediction: 2  real class: 7
Wrong Prediction 49581 => prediction: 15  real class: 6
Wrong Prediction 49615 => prediction: 3  real class: 2
Wrong Prediction 49690 => prediction: 3  real class: 15
Wrong Prediction 49960 => prediction: 4  real class: 15
Wrong Prediction 49974 => prediction: 13  real class: 4
Wrong Prediction 50044 => prediction: 15  real class: 4
Wrong Prediction 50045 => prediction: 0  real class: 5
Wrong Prediction 50170 => prediction: 5  real class:

Gülvera Yazılıtaş b2210356111

                                                 REPORT
 Analyses
 
 1-) When I normalized Data I get these results;
 
 K = 1
accuracy: 0.9783333333333334
precision: 0.9783042430177052
recall: 0.9782880233852064
 
 K = 3
accuracy: 0.9881666666666666
precision: 0.9881548465514688
recall: 0.9881313372472769

 K = 5
accuracy: 0.9886666666666667
precision: 0.988657383232201
recall: 0.9886482340539449

 K = 7
accuracy: 0.989
precision: 0.9889925343654058
recall: 0.988983661350532

 K = 9
accuracy: 0.9890833333333333
precision: 0.9890800036202679
recall: 0.9890676535495522
 
In Conclusion when K values increases accuracy, precision and recall also increases.

2-) When I used non normalized Data ;

 K = 1
accuracy: 0.97775
precision: 0.977716560440014
recall: 0.9776958364107582

 K = 3
accuracy: 0.98825
precision: 0.9882404225423297
recall: 0.988217225253303

 K = 5
accuracy: 0.98875
precision: 0.9887398079282843
recall: 0.9887313457560725

 K = 7
accuracy: 0.9888333333333333
precision: 0.9888283142603189
recall: 0.9888124583311716

 K = 9
accuracy: 0.989
precision: 0.9889955472083461
recall: 0.9889839855441974

Non normalization data has similar pattern in terms of increasment of K values.

However, it can be seen that feature normalized values has bigger accuracy , precision and recall. It is more efficient that use normalized data.

Error Analyse

I have printed errors in every calculations.
Reasons of these errors might be value of K. Because, when model check nearest K values majority of classes might be equal.
For example a test case has 5 nearest class(14,14,12,13,12) it has 50% chance to predict true value. Disadvantages like these might be reason of errors.