## Introduction to K-Nearest Neighbors

The following is the machine learning workflow from the previous lesson.

![Supervised Machine Learning Workflow](../images/1.1-m737.svg)

In [1]:
# import necessary libraries

import pandas as pd

## Data Exploration

In [8]:
# load the subscription dataset into a DataFrame for exploration
banking_df = pd.read_csv("../data/subscription_prediction.csv")

# print the first few observations
print(banking_df.head())

# print how many columns there are of each data type
print(banking_df.dtypes.value_counts())



   age          job   marital    education  default housing loan    contact  \
0   40       admin.   married     basic.6y       no      no   no  telephone   
1   56     services   married  high.school       no      no  yes  telephone   
2   41  blue-collar   married      unknown  unknown      no   no  telephone   
3   57    housemaid  divorced     basic.4y       no     yes   no  telephone   
4   39   management    single     basic.9y  unknown      no   no  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euri

In [12]:
# Report the dataset's dimensions as a (rows, columns) tuple
print(f"Shape of the dataset: {banking_df.shape}")

# Tally the missing entries in every column (isnull is an alias of isna)
print(banking_df.isnull().sum())



Shape of the dataset: (10122, 21)
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


In [13]:

# Show the class balance of the target column "y" (subscribed vs. not subscribed),
# followed by the summary statistics of the dataset, one after the other
print(banking_df["y"].value_counts(), banking_df.describe(), sep="\n")

y
no     5482
yes    4640
Name: count, dtype: int64
                age      duration      campaign         pdays      previous  \
count  10122.000000  10122.000000  10122.000000  10122.000000  10122.000000   
mean      40.313673    373.414049      2.369789    896.476882      0.297471   
std       11.855014    353.277755      2.472392    302.175859      0.680535   
min       17.000000      0.000000      1.000000      0.000000      0.000000   
25%       31.000000    140.000000      1.000000    999.000000      0.000000   
50%       38.000000    252.000000      2.000000    999.000000      0.000000   
75%       48.000000    498.000000      3.000000    999.000000      0.000000   
max       98.000000   4199.000000     42.000000    999.000000      6.000000   

       emp.var.rate  cons.price.idx  cons.conf.idx     euribor3m   nr.employed  
count  10122.000000    10122.000000   10122.000000  10122.000000  10122.000000  
mean      -0.432671       93.492407     -40.250573      3.035134   5138.83

## Data Preparation

In [14]:
# convert the target variable to binary (1 = "yes", 0 = anything else).
# The numeric-dtype guard keeps this cell safe to re-run: once "y" already holds
# 0/1, comparing it against "yes" again would turn every label into 0.
if not pd.api.types.is_numeric_dtype(banking_df["y"]):
    banking_df["y"] = (banking_df["y"] == "yes").astype("int64")

# split the dataset into training (85%) and testing (remaining 15%) sets;
# the fixed random_state makes the sampled split reproducible
train_df = banking_df.sample(frac=0.85, random_state=417)
test_df = banking_df.drop(train_df.index)

# print the proportion of customers who subscribed to the term deposit in the training and testing sets
print(train_df["y"].value_counts(normalize=True))
print(test_df["y"].value_counts(normalize=True))

# separate the features from the target variable in the training set
X_train = train_df.drop("y", axis=1)
y_train = train_df["y"]

# separate the features from the target variable in the testing set
X_test = test_df.drop("y", axis=1)
y_test = test_df["y"]

y
0    0.540098
1    0.459902
Name: proportion, dtype: float64
y
0    0.550066
1    0.449934
Name: proportion, dtype: float64


## KNN One Feature


In [16]:
# KNN Algorithm
def knn(feature, single_test_input, k):
    """Predict the label of one test observation with a single-feature KNN.

    The distance is the absolute difference on ``feature`` between the test
    observation and every row of the module-level ``X_train``. The prediction
    is the most common ``y_train`` label among the ``k`` closest rows.

    Args:
        feature: Name of the numeric column used to measure distance.
        single_test_input: Row-like object (e.g. a Series) indexable by
            ``feature``.
        k: Number of nearest neighbours that vote.

    Returns:
        The most frequent label among the ``k`` nearest training rows. When
        several labels are equally frequent, the first entry of ``mode()`` is
        returned.
    """
    # Keep the distances in a local Series instead of writing a "distance"
    # column into X_train, so the training frame only ever holds features.
    distances = (X_train[feature] - single_test_input[feature]).abs()
    nearest_index = distances.nsmallest(n=k).index
    # .loc makes it explicit that the neighbours are looked up by index label.
    return y_train.loc[nearest_index].mode()[0]

# Classify the test observation at position 417 from its age using 3 neighbours,
# then show the prediction next to the true label
model_prediction = knn("age", X_test.iloc[417], 3)
print(
    f"Predicted label: {model_prediction}",
    f"Actual label: {y_test.iloc[417]}",
    sep="\n",
)

Predicted label: 0
Actual label: 0


## Evaluating The Model

In [17]:
# Evaluate a one-feature KNN model (k=3) separately for each candidate feature
for feature in ["age", "campaign"]:
    # Predicting the label for the test set using the KNN algorithm;
    # the predictions are stored on X_test under "<feature>_predicted_y"
    X_test[f"{feature}_predicted_y"] = X_test.apply(lambda x: knn(feature, x, 3), axis=1)

    # Calculating the accuracy of the model as the share of matching labels.
    # The mean of the boolean match vector also works when no prediction is
    # correct, whereas indexing value_counts() with True raises KeyError then.
    model_accuracy = (X_test[f"{feature}_predicted_y"] == y_test).mean() * 100
    print(f"Accuracy of model trained on the column '{feature}': {model_accuracy:.2f}%")

Accuracy of model trained on the column 'age': 53.89%
Accuracy of model trained on the column 'campaign': 55.14%


## Feature Engineering


In [31]:
# One-hot encode the "marital" column on a copy, leaving banking_df untouched;
# drop_first=True omits the indicator column of the first category
banking_df_copy = pd.get_dummies(banking_df.copy(), columns=["marital"], drop_first=True)

In [46]:

# KNN with different data types
def knn(features, single_test_input, k):
    """Predict the label of one test observation with a multi-feature KNN.

    Each numeric feature contributes its squared difference to the distance;
    every other feature (boolean, string, categorical) contributes 1 when the
    values differ and 0 when they match. The square root of the total is the
    distance to each row of the module-level ``X_train``, and the prediction
    is the most common ``y_train`` label among the ``k`` closest rows.

    Args:
        features: Iterable of column names to include in the distance.
        single_test_input: Row-like object (e.g. a Series) indexable by each
            name in ``features``.
        k: Number of nearest neighbours that vote.

    Returns:
        The most frequent label among the ``k`` nearest training rows. When
        several labels are equally frequent, the first entry of ``mode()`` is
        returned.
    """
    # Start from a zero Series aligned with X_train so that an empty feature
    # list still yields one (zero) distance per training row.
    squared_distance = pd.Series(0.0, index=X_train.index)
    for feature in features:
        column = X_train[feature]
        # Detect numeric columns by kind, not by exact dtype name, so int32,
        # float32, etc. are not mistaken for categorical data. Booleans are
        # excluded explicitly because they are compared by equality.
        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if is_numeric:
            # For numeric features, use subtraction
            squared_distance += (column - single_test_input[feature]) ** 2
        else:
            # For boolean/categorical features, use equality comparison
            squared_distance += (column != single_test_input[feature]).astype(int)

    # Keep the distances local instead of writing a "distance" column into
    # X_train, so the training frame only ever holds features.
    distances = squared_distance ** 0.5
    nearest_index = distances.nsmallest(n=k).index
    # .loc makes it explicit that the neighbours are looked up by index label.
    return y_train.loc[nearest_index].mode()[0]

# NOTE(review): model_accuracy is not recomputed in this cell; this prints the
# value left over from an earlier cell, not an evaluation of the knn defined above.
print(f"Accuracy of the model: {model_accuracy:.2f}%")

Accuracy of the model: 55.14%


In [47]:
def knn(features, test_input, k):
    """Predict the label of one test observation with a Euclidean-distance KNN.

    The distance to every row of the module-level ``X_train`` is the square
    root of the summed squared differences over ``features``. The prediction
    is the most common ``y_train`` label among the ``k`` closest rows. The
    subtraction means every listed feature is expected to be numeric.

    Args:
        features: Iterable of numeric column names to include in the distance.
        test_input: Row-like object (e.g. a Series) indexable by each name in
            ``features``.
        k: Number of nearest neighbours that vote.

    Returns:
        The most frequent label among the ``k`` nearest training rows. When
        several labels are equally frequent, the first entry of ``mode()`` is
        returned.
    """
    # Start from a zero Series aligned with X_train so that an empty feature
    # list still yields one (zero) distance per training row.
    squared_distance = pd.Series(0.0, index=X_train.index)
    # Accumulate the squared difference for each feature
    for feature in features:
        squared_distance += (test_input[feature] - X_train[feature]) ** 2

    # Square root of the sum of squared differences. The result stays in a
    # local Series instead of being written into X_train as a "distance"
    # column, so the training frame only ever holds features.
    distances = squared_distance ** 0.5

    # Get prediction based on k nearest neighbors, looked up by index label
    nearest_index = distances.nsmallest(n=k).index
    return y_train.loc[nearest_index].mode()[0]

# NOTE(review): model_accuracy is not recomputed in this cell; this prints the
# value left over from an earlier cell, not an evaluation of the knn defined above.
print(f"Accuracy of the model: {model_accuracy:.2f}%")

Accuracy of the model: 55.14%
