Reading data and reshaping of the data.

In [1]:
#libraries
import pandas as pd 
import numpy as np
import matplotlib as plt


mapping_of_characters = {"ESTJ" :  0, "ENTJ" :  1, "ESFJ" :  2, "ENFJ" :  3, # filter for reformatting of personality types
                         "ISTJ" :  4, "ISFJ" :  5, "INTJ" :  6, "INFJ" :  7,
                         "ESTP" :  8, "ESFP" :  9, "ENTP" : 10, "ENFP" : 11,
                         "ISTP" : 12, "ISFP" : 13, "INTP" : 14, "INFP" : 15,}

# formatting of pandas dataframe and turning into numpy array
subjects_df = pd.read_csv("16P.csv", encoding='cp1252')
subjects_df = subjects_df.drop("Response Id", axis= 1)
subjects_df["Personality"] = subjects_df["Personality"].map(mapping_of_characters)
subjects = subjects_df.values

# removal of pandas data frame
del subjects_df

# target/predictors split
subjects_targets = subjects[:,60]
subjects_predictors = subjects[:,:-1]

Knn model:

In [2]:
class KNN_Model: #knn model object this makes it easier to make a lot of models

    # initialization
    def __init__(self, k = 0, trainset_targets = np.array([]), trainset_predictors = np.array([]), weighted = False):
        self.k = k # k is the number defining how many clossest neighbours are we going to check
        self.trainset_targets = trainset_targets # labels for our train set
        self.trainset_predictors = trainset_predictors # features for our train set
        self.weighted = weighted # a boolean variable determining wether our model will be weighted or not
        self.char_info_indexed = [[0, 0, 0, 12000] for r in range(16)] # 2D array holding performance metrics for each character type
        """ character info indexed index information:
                >First Index (Character Numbers): basically numbers in mapping_of_characters
                >Second Index (Info Type): 0 = tp, 1 = fp, 2 = fn, 3 = tn"""
        

    def predict(self, testset_targets = np.array([]), testset_predictors = np.array([])): # function for calling prediction metrics

        # block for non weighted knn
        if not self.weighted:
            
            # for loop running through testset
            for i in range(testset_predictors.shape[0]):
                
                #prediction portion.
                prediction_subject = testset_predictors[i] # variable for improving readability
                trainset_distances = np.linalg.norm((self.trainset_predictors - prediction_subject), axis=1) # calculation of distances to every neighbour
                prediction = self.trainset_targets[np.argpartition(trainset_distances, self.k)[:self.k]] # calling clossest k neihbours

                #narrowing the prediction into one character type
                pred_repeat_counts = np.bincount(prediction)
                most_repeats = np.argmax(pred_repeat_counts)
                
                #adjusting performance metrics
                if most_repeats != testset_targets[i]: # prediction is wrong
                    self.char_info_indexed[most_repeats][3] -= 1
                    self.char_info_indexed[testset_targets[i]][3] -= 1
                    self.char_info_indexed[most_repeats][1] += 1
                    self.char_info_indexed[testset_targets[i]][2] +=1
                    wrong_pred = [prediction, testset_targets[i]]
                
                else: #prediction is true
                    self.char_info_indexed[most_repeats][0] += 1
                    self.char_info_indexed[most_repeats][3] -= 1

        # block for weighted knn        
        else:
            weights = [(2*u-1) for u in range(self.k,0,-1)] # defining weights this way makes it easier to change in the future
            
            # for loop running through testset
            for i in range(testset_predictors.shape[0]):
                
                #prediction portion.
                prediction_subject = testset_predictors[i] # variable for improving readability
                trainset_distances = np.linalg.norm((self.trainset_predictors - prediction_subject), axis=1) # calculation of distances to every neighbour
                prediction = self.trainset_targets[np.argpartition(trainset_distances, self.k)[:self.k]] # calling clossest k neihbours
                prediction_with_weights = np.array([prediction, weights])

                # weighing predictions
                unique_preds = np.unique(prediction_with_weights[0])
                unique_preds_weights = []
                for char in unique_preds:
                    weight = np.sum(prediction_with_weights[1,np.where(prediction_with_weights[0] == char)])
                    unique_preds_weights.append(weight)

                #narrowing the prediction into one character type
                pred_repeat_counts = np.bincount(prediction)
                heaviest_pred = unique_preds[unique_preds_weights.index(max(unique_preds_weights))]
                
                #adjusting performance metrics
                if heaviest_pred != testset_targets[i]: # prediction is wrong
                    self.char_info_indexed[heaviest_pred][3] -= 1
                    self.char_info_indexed[testset_targets[i]][3] -= 1
                    self.char_info_indexed[heaviest_pred][1] += 1
                    self.char_info_indexed[testset_targets[i]][2] +=1
                    wrong_pred = [[prediction, weights], testset_targets[i]]
                
                else: #prediction is true
                    self.char_info_indexed[heaviest_pred][0] += 1
                    self.char_info_indexed[heaviest_pred][3] -= 1

        print("Wrong Prediction Sample:", wrong_pred)

        return np.array(self.char_info_indexed)

As you can see at "adjusting performance metrics" portion, this model doesn't return predictions. Instead It compares predictions to true labels and returns performance metrics. I designed it this way because it is faster to calculate performance metrics this way and we don't need predictions in this assignment. However code could be easily changed to return predictions.







Defining a function to calculate accuracy, precision and recall.

In [3]:
def calculate_metrics(perf_metrics): # we are going to use this calculations a lot so it makes sense to make it into a function.

    # some variables for making code more readable
    tp_array = perf_metrics[:,0]
    fp_array = perf_metrics[:,1]
    fn_array = perf_metrics[:,2]
    tn_array = perf_metrics[:,3]

    # calculating and printing performance metrics
    accuracy = np.mean((tp_array+tn_array)/(np.sum(perf_metrics, axis=1)))*100
    precision = np.mean((tp_array)/(tp_array + fp_array))*100
    recall = np.mean((tp_array)/(tp_array + fn_array))*100

    return accuracy, precision, recall

Standart train/test split for trying knn model.

In [4]:
# train/test split
subjects_predictors_train = subjects_predictors[12000:]
subjects_predictors_test = subjects_predictors[:12000]
subjects_targets_train = subjects_targets[12000:]
subjects_targets_test = subjects_targets[:12000]

Testing knn model with standart k = 3 and 80/20 train/test split parameters.

In [5]:
#model prediction
test_model = KNN_Model(k = 3, trainset_predictors =subjects_predictors_train, trainset_targets= subjects_targets_train)
perf_metrics = test_model.predict(testset_targets= subjects_targets_test, testset_predictors= subjects_predictors_test)

#printing performance metrics
acc, prec, rec = calculate_metrics(perf_metrics)
print("Accuracy = {}, Precision = {}, Recall = {}".format(acc, prec, rec)) 

Wrong Prediction Sample: [array([9, 9, 9], dtype=int64), 7]
Accuracy = 99.85729166666665, Precision = 98.86497902330741, Recall = 98.8537036793035


Testing k values 1, 3, 5, 7, 9 with 80/20 train/test split.

In [6]:
for i in range(1,11,2):
    
    # defining model and predicting with given k values
    k_test_model = KNN_Model(k = i, trainset_predictors = subjects_predictors_train, trainset_targets= subjects_targets_train)
    perf_metrics = k_test_model.predict(testset_targets = subjects_targets_test, testset_predictors= subjects_predictors_test)
    
    # printing of performance metrics
    acc, prec, rec = calculate_metrics(perf_metrics)
    print("k = {}: Accuracy = {}, Precision = {}, Recall = {}".format(i, acc, prec, rec))

Wrong Prediction Sample: [array([9], dtype=int64), 13]
k = 1: Accuracy = 99.73854166666666, Precision = 97.90913999449529, Recall = 97.90335237352792
Wrong Prediction Sample: [array([9, 9, 9], dtype=int64), 7]
k = 3: Accuracy = 99.85729166666665, Precision = 98.86497902330741, Recall = 98.8537036793035
Wrong Prediction Sample: [array([9, 9, 9, 9, 9], dtype=int64), 7]
k = 5: Accuracy = 99.86562500000001, Precision = 98.92911987926229, Recall = 98.9209759741886
Wrong Prediction Sample: [array([9, 9, 9, 9, 9, 9, 9], dtype=int64), 7]
k = 7: Accuracy = 99.86979166666667, Precision = 98.96143845132698, Recall = 98.95443611966999
Wrong Prediction Sample: [array([9, 9, 9, 9, 9, 9, 9, 1, 9], dtype=int64), 7]
k = 9: Accuracy = 99.871875, Precision = 98.97828678529564, Recall = 98.9718599389326


In [7]:
#removal of unnecessary objects
del test_model
del k_test_model

Cross_validdation for every k value.

In [8]:
# lists for printing overall performance metrics
acc_arr = []
prec_arr = []
rec_arr = []

# cross validation
for j in range(5): # loop for folds

    # if and else block below are neccessary due to dataset being 59999 rows instead of 60000
    if j != 4: # using np.delete and slicing to define train/test split
        subj_pred_train_cv = np.delete(subjects_predictors, np.arange((j*12000),((j+1)*12000)), axis = 0)
        subj_tar_train_cv = np.delete(subjects_targets, np.arange((j*12000),((j+1)*12000)), axis = 0)
        subj_pred_test_cv = subjects_predictors[(j*12000):((j+1)*12000)]
        subj_tar_test_cv = subjects_targets[(j*12000):((j+1)*12000)]

    else: # using only slicing to define train/test split
        subj_pred_train_cv = subjects_predictors[:48000]
        subj_tar_train_cv = subjects_targets[:48000]
        subj_pred_test_cv = subjects_predictors[48000:]
        subj_tar_test_cv = subjects_targets[48000:]

    for i in range(1,11,2): #loop for k values
        
        # defining model and predicting with given k values
        k_fold_model = KNN_Model(k = i, trainset_predictors = subj_pred_train_cv, trainset_targets= subj_tar_train_cv)
        perf_metrics = k_fold_model.predict(testset_targets = subj_tar_test_cv, testset_predictors= subj_pred_test_cv)

        # printing and recording of performance metrics
        acc, prec, rec = calculate_metrics(perf_metrics)
        print("fold = {}, k = {}: Accuracy = {}, Precision = {}, Recall = {}".format(j+1, i, acc, prec, rec))
        acc_arr.append(acc)
        prec_arr.append(prec)
        rec_arr.append(rec)

# printing of overall performance metrics
overall_acc = np.mean(np.array(acc_arr))
overall_prec = np.mean(np.array(prec_arr))
overall_rec = np.mean(np.array(rec_arr))
print("Overall Metrics: Accuracy = {}, Macro Precision = {}, Macro Recall = {}".format(overall_acc, overall_prec, overall_rec))

Wrong Prediction Sample: [array([9], dtype=int64), 13]
fold = 1, k = 1: Accuracy = 99.73854166666666, Precision = 97.90913999449529, Recall = 97.90335237352792
Wrong Prediction Sample: [array([9, 9, 9], dtype=int64), 7]
fold = 1, k = 3: Accuracy = 99.85729166666665, Precision = 98.86497902330741, Recall = 98.8537036793035
Wrong Prediction Sample: [array([9, 9, 9, 9, 9], dtype=int64), 7]
fold = 1, k = 5: Accuracy = 99.86562500000001, Precision = 98.92911987926229, Recall = 98.9209759741886
Wrong Prediction Sample: [array([9, 9, 9, 9, 9, 9, 9], dtype=int64), 7]
fold = 1, k = 7: Accuracy = 99.86979166666667, Precision = 98.96143845132698, Recall = 98.95443611966999
Wrong Prediction Sample: [array([9, 9, 9, 9, 9, 9, 9, 1, 9], dtype=int64), 7]
fold = 1, k = 9: Accuracy = 99.871875, Precision = 98.97828678529564, Recall = 98.9718599389326
Wrong Prediction Sample: [array([11], dtype=int64), 15]
fold = 2, k = 1: Accuracy = 99.70416666666667, Precision = 97.63402946520029, Recall = 97.632090695

In [9]:
#removal of unneccessary data
del subj_pred_test_cv
del subj_pred_train_cv
del subj_tar_test_cv
del subj_tar_train_cv
del k_fold_model

In [10]:
# normalisation of the predictors
subjects_predictors_nm = (subjects_predictors + 3) / 6

Doing the same cross_validation with normalized data.

In [11]:
print("NORMALISED")

acc_arr = []
prec_arr = []
rec_arr = []

# cross validation
for j in range(5): # loop for folds

    # if and else block below are neccessary due to dataset being 59999 rows instead of 60000
    if j != 4: # using np.delete and slicing to define train/test split
        subj_pred_train_cv_nm = np.delete(subjects_predictors_nm, np.arange((j*12000),((j+1)*12000)), axis = 0)
        subj_tar_train_cv = np.delete(subjects_targets, np.arange((j*12000),((j+1)*12000)), axis = 0)
        subj_pred_test_cv_nm = subjects_predictors_nm[(j*12000):((j+1)*12000)]
        subj_tar_test_cv = subjects_targets[(j*12000):((j+1)*12000)]

    else: # using only slicing to define train/test split
        subj_pred_train_cv_nm = subjects_predictors_nm[:48000]
        subj_tar_train_cv = subjects_targets[:48000]
        subj_pred_test_cv_nm = subjects_predictors_nm[48000:]
        subj_tar_test_cv = subjects_targets[48000:]

    for i in range(1,11,2): #loop for k values
        
        # defining model and predicting with given k values
        k_fold_nm_model = KNN_Model(k = i, trainset_predictors = subj_pred_train_cv_nm, trainset_targets= subj_tar_train_cv)
        perf_metrics = k_fold_nm_model.predict(testset_targets = subj_tar_test_cv, testset_predictors= subj_pred_test_cv_nm)

        # printing and recording of performance metrics
        acc, prec, rec = calculate_metrics(perf_metrics)
        print("fold = {}, k = {}: Accuracy = {}, Precision = {}, Recall = {}".format(j+1, i, acc, prec, rec))
        acc_arr.append(acc)
        prec_arr.append(prec)
        rec_arr.append(rec)

# printing of overall performance metrics
overall_acc = np.mean(np.array(acc_arr))
overall_prec = np.mean(np.array(prec_arr))
overall_rec = np.mean(np.array(rec_arr))
print("Overall Metrics: Accuracy = {}, Macro Precision = {}, Macro Recall = {}".format(overall_acc, overall_prec, overall_rec))

NORMALISED
Wrong Prediction Sample: [array([9], dtype=int64), 13]
fold = 1, k = 1: Accuracy = 99.73749999999998, Precision = 97.90045833252957, Recall = 97.89487571982583
Wrong Prediction Sample: [array([9, 9, 9], dtype=int64), 7]
fold = 1, k = 3: Accuracy = 99.85729166666667, Precision = 98.86513163223391, Recall = 98.8540974942042
Wrong Prediction Sample: [array([9, 9, 9, 9, 9], dtype=int64), 7]
fold = 1, k = 5: Accuracy = 99.86562500000001, Precision = 98.92911987926229, Recall = 98.9209759741886
Wrong Prediction Sample: [array([9, 9, 9, 9, 9, 9, 1], dtype=int64), 7]
fold = 1, k = 7: Accuracy = 99.871875, Precision = 98.97838668694762, Recall = 98.97171495255941
Wrong Prediction Sample: [array([9, 9, 9, 9, 9, 9, 1, 9, 9], dtype=int64), 7]
fold = 1, k = 9: Accuracy = 99.86979166666667, Precision = 98.96200402709684, Recall = 98.95485301028673
Wrong Prediction Sample: [array([11], dtype=int64), 15]
fold = 2, k = 1: Accuracy = 99.709375, Precision = 97.67438415194138, Recall = 97.67459

As we can see noramlization has no meaningful efecct for this dataset.

In [12]:
# removal of unneccessary data
del subj_pred_test_cv_nm
del subj_pred_train_cv_nm
del subj_tar_test_cv
del subj_tar_train_cv
del k_fold_nm_model


Testing weighted knn model with 80/20 train/test split and k values of 1, 3, 5, 7 and 9.

In [13]:
for i in range(1,11,2):
    
    # defining model and predicting with given k values
    k_weighted_test_model = KNN_Model(k = i, trainset_predictors = subjects_predictors_train, trainset_targets= subjects_targets_train, weighted= True)
    perf_metrics = k_weighted_test_model.predict(testset_targets = subjects_targets_test, testset_predictors= subjects_predictors_test)
    
    # printing of performance metrics
    acc, prec, rec = calculate_metrics(perf_metrics)
    print("k = {}: Accuracy = {}, Precision = {}, Recall = {}".format(i, acc, prec, rec))

Wrong Prediction Sample: [[array([9], dtype=int64), [1]], 13]
k = 1: Accuracy = 99.73854166666666, Precision = 97.90913999449529, Recall = 97.90335237352792
Wrong Prediction Sample: [[array([ 9, 13, 13], dtype=int64), [5, 3, 1]], 13]
k = 3: Accuracy = 99.73854166666666, Precision = 97.90861533401475, Recall = 97.90708973740269
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9], dtype=int64), [9, 7, 5, 3, 1]], 7]
k = 5: Accuracy = 99.85833333333333, Precision = 98.87102756159517, Recall = 98.86170920064752
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9, 9, 9], dtype=int64), [13, 11, 9, 7, 5, 3, 1]], 7]
k = 7: Accuracy = 99.86666666666667, Precision = 98.93719555204217, Recall = 98.92946782201469
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9, 9, 9, 1, 9], dtype=int64), [17, 15, 13, 11, 9, 7, 5, 3, 1]], 7]
k = 9: Accuracy = 99.86979166666666, Precision = 98.96229934579075, Recall = 98.95465448396968


As we can see weighted knn is actually less accurate than non_weighted knn. However the difference is negligible.

In [14]:
# removal of unnecessary data
del subjects_predictors_train
del subjects_predictors_test
del subjects_targets_test
del subjects_targets_train
del k_weighted_test_model

Cross_validation of weighted knn.

In [15]:
print("WEIGHTED")

# lists for printing overall performance metrics
acc_arr = []
prec_arr = []
rec_arr = []

# cross validation
for j in range(5): # loop for folds

    # if and else block below are neccessary due to dataset being 59999 rows instead of 60000
    if j != 4: # using np.delete and slicing to define train/test split
        subj_pred_train_cv = np.delete(subjects_predictors, np.arange((j*12000),((j+1)*12000)), axis = 0)
        subj_tar_train_cv = np.delete(subjects_targets, np.arange((j*12000),((j+1)*12000)), axis = 0)
        subj_pred_test_cv = subjects_predictors[(j*12000):((j+1)*12000)]
        subj_tar_test_cv = subjects_targets[(j*12000):((j+1)*12000)]

    else: # using only slicing to define train/test split
        subj_pred_train_cv = subjects_predictors[:48000]
        subj_tar_train_cv = subjects_targets[:48000]
        subj_pred_test_cv = subjects_predictors[48000:]
        subj_tar_test_cv = subjects_targets[48000:]

    for i in range(1,11,2): #loop for k values
        
        # defining model and predicting with given k values
        k_fold_weighted_model = KNN_Model(k = i, trainset_predictors = subj_pred_train_cv, trainset_targets= subj_tar_train_cv, weighted= True)
        perf_metrics = k_fold_weighted_model.predict(testset_targets = subj_tar_test_cv, testset_predictors= subj_pred_test_cv)

        # printing and recording of performance metrics
        acc, prec, rec = calculate_metrics(perf_metrics)
        print("fold = {}, k = {}: Accuracy = {}, Precision = {}, Recall = {}".format(j+1, i, acc, prec, rec))
        acc_arr.append(acc)
        prec_arr.append(prec)
        rec_arr.append(rec)

# printing of overall performance metrics
overall_acc = np.mean(np.array(acc_arr))
overall_prec = np.mean(np.array(prec_arr))
overall_rec = np.mean(np.array(rec_arr))
print("Overall Metrics: Accuracy = {}, Macro Precision = {}, Macro Recall = {}".format(overall_acc, overall_prec, overall_rec))

WEIGHTED
Wrong Prediction Sample: [[array([9], dtype=int64), [1]], 13]
fold = 1, k = 1: Accuracy = 99.73854166666666, Precision = 97.90913999449529, Recall = 97.90335237352792
Wrong Prediction Sample: [[array([ 9, 13, 13], dtype=int64), [5, 3, 1]], 13]
fold = 1, k = 3: Accuracy = 99.73854166666666, Precision = 97.90861533401475, Recall = 97.90708973740269
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9], dtype=int64), [9, 7, 5, 3, 1]], 7]
fold = 1, k = 5: Accuracy = 99.85833333333333, Precision = 98.87102756159517, Recall = 98.86170920064752
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9, 9, 9], dtype=int64), [13, 11, 9, 7, 5, 3, 1]], 7]
fold = 1, k = 7: Accuracy = 99.86666666666667, Precision = 98.93719555204217, Recall = 98.92946782201469
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9, 9, 9, 1, 9], dtype=int64), [17, 15, 13, 11, 9, 7, 5, 3, 1]], 7]
fold = 1, k = 9: Accuracy = 99.86979166666666, Precision = 98.96229934579075, Recall = 98.95465448396968
Wrong Prediction Sample: [[arr

In [16]:
# removal of unneccessary data
del subj_pred_train_cv
del subj_pred_test_cv
del subj_tar_test_cv
del subj_tar_train_cv
del k_fold_weighted_model

Cross-validation of weighted knn with normalised data.

In [17]:
print("NORMALISED, WEIGHTED")

acc_arr = []
prec_arr = []
rec_arr = []

# cross validation
for j in range(5): # loop for folds

    # if and else block below are neccessary due to dataset being 59999 rows instead of 60000
    if j != 4: # using np.delete and slicing to define train/test split
        subj_pred_train_cv_nm = np.delete(subjects_predictors_nm, np.arange((j*12000),((j+1)*12000)), axis = 0)
        subj_tar_train_cv = np.delete(subjects_targets, np.arange((j*12000),((j+1)*12000)), axis = 0)
        subj_pred_test_cv_nm = subjects_predictors_nm[(j*12000):((j+1)*12000)]
        subj_tar_test_cv = subjects_targets[(j*12000):((j+1)*12000)]

    else: # using only slicing to define train/test split
        subj_pred_train_cv_nm = subjects_predictors_nm[:48000]
        subj_tar_train_cv = subjects_targets[:48000]
        subj_pred_test_cv_nm = subjects_predictors_nm[48000:]
        subj_tar_test_cv = subjects_targets[48000:]

    for i in range(1,11,2): #loop for k values
        
        # defining model and predicting with given k values
        k_fold_nm_weighted_model = KNN_Model(k = i, trainset_predictors = subj_pred_train_cv_nm, trainset_targets= subj_tar_train_cv, weighted = True)
        perf_metrics = k_fold_nm_weighted_model.predict(testset_targets = subj_tar_test_cv, testset_predictors= subj_pred_test_cv_nm)

        # printing and recording of performance metrics
        acc, prec, rec = calculate_metrics(perf_metrics)
        print("fold = {}, k = {}: Accuracy = {}, Precision = {}, Recall = {}".format(j+1, i, acc, prec, rec))
        acc_arr.append(acc)
        prec_arr.append(prec)
        rec_arr.append(rec)

# printing of overall performance metrics
overall_acc = np.mean(np.array(acc_arr))
overall_prec = np.mean(np.array(prec_arr))
overall_rec = np.mean(np.array(rec_arr))
print("Overall Metrics: Accuracy = {}, Macro Precision = {}, Macro Recall = {}".format(overall_acc, overall_prec, overall_rec))

NORMALISED, WEIGHTED
Wrong Prediction Sample: [[array([9], dtype=int64), [1]], 13]
fold = 1, k = 1: Accuracy = 99.73749999999998, Precision = 97.90045833252957, Recall = 97.89487571982583
Wrong Prediction Sample: [[array([ 3, 13, 13], dtype=int64), [5, 3, 1]], 13]
fold = 1, k = 3: Accuracy = 99.71041666666667, Precision = 97.68458991958515, Recall = 97.68096462357457
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9], dtype=int64), [9, 7, 5, 3, 1]], 7]
fold = 1, k = 5: Accuracy = 99.85520833333334, Precision = 98.84678360807754, Recall = 98.836350327738
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9, 9, 1], dtype=int64), [13, 11, 9, 7, 5, 3, 1]], 7]
fold = 1, k = 7: Accuracy = 99.86250000000001, Precision = 98.90371110772718, Recall = 98.8961924903931
Wrong Prediction Sample: [[array([9, 9, 9, 9, 9, 9, 1, 9, 9], dtype=int64), [17, 15, 13, 11, 9, 7, 5, 3, 1]], 7]
fold = 1, k = 9: Accuracy = 99.87083333333334, Precision = 98.97029760787457, Recall = 98.96333333593124
Wrong Prediction Samp

In [18]:
del subj_pred_train_cv_nm
del subj_pred_test_cv_nm
del subj_tar_test_cv
del subj_tar_train_cv
del k_fold_nm_weighted_model

Error Analysis for Classification:

    Effects of Neighbour Number(k):

        Effects of neighbour number is generaly positive. This means that k values 1, 3, 5 and 7 are probably underfitted for this aplication. This doesn't mean that higher k number is always better. If we chose a k value that is too large we might overfit our model and reduce our accuracy.

        Accuracy for k = 1: 99.7223
        Accuracy for k = 9: 99.8655

    Effects of Normalisation:

        Normalisation doesn't have any meaningful effect on this dataset. The reason for this is the fact that our data already came scaled. Since every answer is betwen 3 and -3 scalng every answer between 1 and 0 just changes the range our answer is placed.

        Overall Accuracy for Non-normalised data: 99.8345
        Overall Accuracy for Normalised data:     99.8348

    Effects of K-fold Cross-validation:

        Effects of Cross-validation aren't about increasing performance metrics but making sure that our model uses all of the dataset when we take our performance metrics. By using cross-validation we are using every subject as part of both train split and test split at least once.

    Effects of Adding Weights:
    
        Adding weights affected performance metrics negatively for this dataset. The reason for this probablty is that farthest neighbours can't contribute enough with weighting method I chosed.

        Accuracy for Non-weighted Knn: 99.8345
        Accuracy for Weighted Knn: 99.8051
    
    Performance Metrics:

        Non_normalised and Non_weighted model:

            Accuracy = 99.8345
            Precision = 98.6781
            Recall = 98.6759

        Normalised and Non_weighted model:

            Accuracy = 99.8348
            Precision = 98.6801
            Recall = 98.6779

        Non_normalised and Weighted model:

            Accuracy = 99.8051
            Precision = 98.4431
            Recall = 98.4407

        Normalised and Weighted model:

            Accuracy = 99.8043
            Precision = 98.4369
            Recall = 98.4340

    Examining Missclassified Samples:

        Non-weighted:

            k = 1:
                
                Neighbour: 9   True Label: 13
                Neighbour: 11  True Label: 15
                Neighbour: 9   True Label: 12
                Neighbour: 14  True Label: 0

                There is nothing we can do to fix these mispredictions other than choosing a higher k value.

            k = 3:

                Neighbours: 9, 9, 9      True Label: 7
                Neighbours: 6, 6, 6      True Label: 14
                Neighbours: 9, 9, 9      True Label: 12
                Neighbours: 5, 5, 5      True Label: 12
                Neighbours: 14, 14, 14   True Label: 0

                There is nothing we can do to fix these mispredictions. These subjects are outliers so they can't be fixed by improving knn algorithm.

            k = 5:

                Neighbours: 9, 9, 9, 9, 9       True Label: 7
                Neighbours: 6, 6, 6, 6, 6       True Label: 14
                Neighbours: 9, 9, 9, 9, 9       True Label: 12
                Neighbours: 5, 5, 5, 5, 5       True Label: 12
                Neighbours: 14, 14, 14, 14, 14  True Label: 0

                There is nothing we can do to fix these mispredictions. These subjects are outliers so they can't be fixed by improving knn algorithm.
            
            k = 7:

                Neighbours: 9, 9, 9, 9, 9, 9, 9            True Label: 7
                Neighbours: 9, 9, 9, 9, 9, 9, 9            True Label: 12
                Neighbours: 5, 5, 5, 5, 5, 5, 5            True Label: 12
                Neighbours: 14, 14, 14, 14, 14, 14, 14     True Label: 0

                There is nothing we can do to fix these mispredictions. These subjects are outliers so they can't be fixed by improving knn algorithm.

                
                Neighbours: 14, 14,  6, 14,  6,  6,  6     True Label: 14

                We can use a weighted knn to find true labels.

            k = 9:   

                Neighbours: 9, 9, 9, 9, 9, 9, 9, 1, 9           True Label: 7
                Neighbours: 6, 6, 6, 6, 6, 6, 6, 6, 6           True Label: 14
                Neighbours: 9, 9, 9, 9, 9, 9, 9, 9, 9           True Label: 12
                Neighbours: 5, 5, 5, 5, 5, 5, 5, 5, 5           True Label: 12
                Neighbours: 14, 14, 14, 14, 14, 14, 14, 14, 14  True Label: 0

                There is nothing we can do to fix these mispredictions. These subjects are outliers so they can't be fixed by improving knn algorithm.

        Weighted:

            k = 1:
                
                Neighbour: 9     True Label: 13
                Neighbour: 11    True Label: 15
                Neighbour: 9     True Label: 12
                Neighbour: 14    True Label: 0

                There is nothing we can do to fix these mispredictions other than choosing a higher k value.

            k = 3:

                
                Neighbours: 9, 9, 9      True Label: 12
                Neighbours: 5, 5, 5      True Label: 12
                Neighbours: 14, 14, 14   True Label: 0

                There is nothing we can do to fix these mispredictions. These subjects are outliers so they can't be fixed by improving knn algorithm.


                Neighbours: 10,  1,  1   True Label: 1
                Neighbours: 2, 8, 8      True Label: 8
                Neighbours: 9, 12, 12    True Label: 12
                Neighbours: 9, 13, 13    True Label: 13
                Neighbours: 9, 14, 14    True Label: 14

                We can use a non-weighted knn to find true labels.

            k = 5:

                Neighbours: 9, 9, 9, 9, 9       True Label: 7
                Neighbours: 6, 6, 6, 6, 6       True Label: 14
                Neighbours: 9, 9, 9, 9, 9       True Label: 12
                Neighbours: 5, 5, 5, 5, 5       True Label: 12
                Neighbours: 14, 14, 14, 14, 14  True Label: 0

                There is nothing we can do to fix these mispredictions. These subjects are outliers so they can't be fixed by improving knn algorithm.

            k = 7:

                Neighbours: 9, 9, 9, 9, 9, 9, 9         True Label: 7
                Neighbours: 6, 6, 6, 6, 6, 6, 6         True Label: 14
                Neighbours: 9, 9, 9, 9, 9, 9, 9         True Label: 12
                Neighbours: 5, 5, 5, 5, 5, 5, 5         True Label: 12
                Neighbours: 14, 14, 14, 14, 14, 14, 14  True Label: 0

                There is nothing we can do to fix these mispredictions. These subjects are outliers so they can't be fixed by improving knn algorithm.

            k = 9:

                Neighbours: 9, 9, 9, 9, 9, 9, 9, 1, 9           True Label: 7
                Neighbours: 6, 6, 6, 6, 6, 6, 6, 6, 6           True Label: 14
                Neighbours: 9, 9, 9, 9, 9, 9, 9, 9, 9           True Label: 12
                Neighbours: 5, 5, 5, 5, 5, 5, 5, 5, 5           True Label: 12
                Neighbours: 14, 14, 14, 14, 14, 14, 14, 14, 14  True Label: 0

                There is nothing we can do to fix these mispredictions. These subjects are outliers so they can't be fixed by improving knn algorithm.