In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Raw dataset; the path is relative to the notebook's working directory
DATA_PATH = "European_bank_marketing.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Share of the rows (counted from the top of the file) that is used for training
training_frac = 0.7

# Row position where the training slice stops and the test slice starts
training_end = int(training_frac * len(data))

In [5]:
# Positional split in file order (rows are not shuffled first):
# the first `training_end` rows are for fitting, the remainder is held out.
train_data = data.iloc[:training_end]
test_data = data.iloc[training_end:]

In [6]:
# Target Variable
y_train = train_data['term_deposit']

# Exogenous Variables: every column except the target
X_train = train_data.drop(columns='term_deposit')

In [7]:
# dtypes that are treated as numerical features (reused for the test split)
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Numerical Exogenous Variables
num_X_train = X_train.select_dtypes(include=numerics)

# Categorical Exogenous Variables
str_X_train = X_train.select_dtypes(include='object')

In [8]:
def integer_encoding(df, series_name):
    """Return a new frame in which one column's labels are swapped for integer codes.

    Codes are handed out in order of first appearance within
    ``df[series_name]``: the first distinct label becomes 0, the next 1, and
    so on. The mapping is derived from the frame that is passed in, so two
    frames only receive the same codes if their labels first appear in the
    same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    series_name : str
        Name of the column whose labels are replaced.

    Returns
    -------
    pandas.DataFrame
        Result of ``df.replace`` with the label -> code mapping applied to
        ``series_name`` only.
    """
    # Distinct labels, in the order they are first encountered
    labels = df[series_name].unique()

    # label -> its position in `labels`, i.e. 0, 1, 2, ...
    label_to_code = {label: code for code, label in enumerate(labels)}

    # The nested dict restricts the replacement to the requested column
    return df.replace({series_name: label_to_code})

In [9]:
# Implementing Integer Encoding
# Implementing Integer Encoding
# Snapshot the column names first, because the frame is rebound on every pass.
categorical_columns = list(str_X_train.columns)
for categorical_column in categorical_columns:
    str_X_train = integer_encoding(str_X_train, categorical_column)

In [10]:
def standardise_df(df):
    """Centre every column on its mean and scale it by its standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Numerical columns to standardise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``(df - df.mean()) / df.std()`` computed column by column. A column
        whose standard deviation is exactly 0 (all values identical) is
        divided by 1 instead, so it comes out as all zeros rather than NaN.
    """
    centred = df - df.mean()

    # 0/0 would turn a constant column into NaN, which downstream estimators
    # reject; dividing by 1 leaves the centred zeros untouched.
    scale = df.std().replace(0, 1)

    return centred / scale

In [11]:
# Standardising Numerical Values
# Standardising Numerical Values
# Each training column is rescaled using that column's own mean and std.
num_X_train = num_X_train.pipe(standardise_df)

In [12]:
# Ad-hoc check of a column's raw labels (disabled): X_train['marital'].unique()

In [13]:
# Put the standardised numerical columns and the integer-encoded categorical
# columns side by side again (numerical columns first).
processed_X_train = pd.concat(
    objs=[num_X_train, str_X_train],
    axis="columns",
)

Fitting K-NN with $k = 1, 3, 5, 10$

In [14]:
# One fitted classifier per neighbourhood size, keyed by k
models_by_k = {
    k: KNeighborsClassifier(n_neighbors=k).fit(processed_X_train, y_train)
    for k in (1, 3, 5, 10)
}

**Test Data**

In [15]:
# Target Variable
y_test = test_data['term_deposit']

# Exogenous Variables: every column except the target
X_test = test_data.drop(columns='term_deposit')

In [16]:
# Numerical Exogenous Variables (same dtype list as for the training split)
num_X_test = X_test.select_dtypes(include=numerics)

# Categorical Exogenous Variables
str_X_test = X_test.select_dtypes(include='object')

In [17]:
# Implementing Integer Encoding
for column in str_X_test.columns:
    str_X_test = integer_encoding(str_X_test, column)

In [18]:
# Standardising Numerical Values
num_X_test = standardise_df(num_X_test)

In [19]:
# Reassemble the test features: numerical columns first, then the encoded
# categorical columns.
processed_X_test = pd.concat(
    objs=[num_X_test, str_X_test],
    axis="columns",
)

In [20]:
def confusion_matrix(test_labels, preds):
    """Count true/false positives and negatives for binary 0/1 labels.

    Label 1 is the positive class and label 0 the negative class. A pair in
    which either value is neither 0 nor 1 is not counted in any cell.

    Parameters
    ----------
    test_labels : sequence
        True labels, read in positional order.
    preds : sequence
        Predicted labels, one per entry of ``test_labels``.

    Returns
    -------
    dict
        Counts under the keys ``"TP"``, ``"TN"``, ``"FP"`` and ``"FN"``.

    Raises
    ------
    ValueError
        If ``test_labels`` and ``preds`` differ in length.
    """
    if len(test_labels) != len(preds):
        raise ValueError("test_labels and preds must have the same length")

    model_classifications = {"TP": 0,
                             "TN": 0,
                             "FP": 0,
                             "FN": 0}

    # Walk over the arguments themselves; the loop length must not depend on
    # any variable defined outside this function.
    for actual, predicted in zip(test_labels, preds):
        if predicted == 1 and actual == 1:
            model_classifications['TP'] += 1

        elif predicted == 0 and actual == 0:
            model_classifications['TN'] += 1

        elif predicted == 1 and actual == 0:
            model_classifications['FP'] += 1

        elif predicted == 0 and actual == 1:
            model_classifications['FN'] += 1

    return model_classifications


In [21]:
def accuracy(confusion_matrix):
    """Return the share of correct predictions: (TP + TN) / (TP + FP + TN + FN).

    ``confusion_matrix`` is a dict with the count keys "TP", "TN", "FP" and "FN".
    """
    correct = confusion_matrix["TP"] + confusion_matrix["TN"]
    total = sum(confusion_matrix[cell] for cell in ("TP", "FP", "TN", "FN"))
    return correct / total

In [22]:
def sensitivity(confusion_matrix):
    """Return the true-positive rate: TP / (TP + FN).

    ``confusion_matrix`` is a dict holding at least the counts "TP" and "FN".
    """
    true_positives = confusion_matrix["TP"]
    actual_positives = true_positives + confusion_matrix["FN"]
    return true_positives / actual_positives

In [23]:
def specificity(confusion_matrix):
    """Return the true-negative rate: TN / (TN + FP).

    ``confusion_matrix`` is a dict holding at least the counts "TN" and "FP".
    """
    true_negatives = confusion_matrix["TN"]
    actual_negatives = true_negatives + confusion_matrix["FP"]
    return true_negatives / actual_negatives

In [24]:
# Evaluation results on the test split, each dict keyed by k
confusion_matrices, accuracies, sensitivities, specificities = {}, {}, {}, {}

for k in (1, 3, 5, 10):
    # Predictions of the k-neighbour model on the processed test features
    pred_k = models_by_k[k].predict(processed_X_test)

    # The labels are passed as a plain array so that they are indexed by
    # position rather than by the frame's row labels.
    matrix_k = confusion_matrix(np.array(y_test), pred_k)

    confusion_matrices[k] = matrix_k
    accuracies[k] = accuracy(matrix_k)
    sensitivities[k] = sensitivity(matrix_k)
    specificities[k] = specificity(matrix_k)

In [25]:
accuracies

{1: 0.7615926195678563,
 3: 0.7619972485231044,
 5: 0.7607833616573602,
 10: 0.7607833616573602}

In [26]:
sensitivities

{1: 0.1786420566908372,
 3: 0.11305207646671062,
 5: 0.08305866842452209,
 10: 0.04515491100856955}

In [27]:
specificities

{1: 0.9513032285744932,
 3: 0.9731845972326504,
 5: 0.9813364796739247,
 10: 0.9936715649469055}