# K Nearest Neighbors

In [1]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.patches as mpatches
from statistics import mode
from math import sqrt
from fancyimpute import KNN
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix

# Reading in our Dataset

In [2]:
dataset_df = pd.read_csv("knn-training-data.txt")

In [3]:
# let's get rid of customer ID, we have an index
dataset_df = dataset_df.drop('customer_id', axis = 1)

# now let's also get rid of most popular category, a categorical var 
# we don't need in our dataset
dataset_df = dataset_df.drop('most_popular_category', axis = 1)

In [4]:
dataset_df.head()

Unnamed: 0,relationship_days,total_spend,beverage_categories,segment
0,240,230.75,9,explorer
1,218,117.5,3,explorer
2,139,105.5,4,explorer
3,748,18.0,1,cheapo
4,743,404.5,6,loyal core


# Creating Training Dataset

In [5]:
training_df = dataset_df

#create labels
cols = ['explorer', 'cheapo', 'loyal core', 'newbie']

#add labels
training_df.rename(columns = {cols[0]:0, cols[1]:1, cols[2]:2, cols[3]:3 })

#add a distance column
training_df['distance'] = '9999'

#view our training DF
training_df.head()

Unnamed: 0,relationship_days,total_spend,beverage_categories,segment,distance
0,240,230.75,9,explorer,9999
1,218,117.5,3,explorer,9999
2,139,105.5,4,explorer,9999
3,748,18.0,1,cheapo,9999
4,743,404.5,6,loyal core,9999


Now let's create our target for our KNN, this will be the first row of our training_df.

In [6]:
target = pd.Series([36,240,230.75,9])
#target

Now that we have our target we will measure the 'distance' between this target point and all the other points in our dataset. I will be using Euclidean Distance.

In [7]:
training_df['distance'] = (((training_df.loc[:,'relationship_days']-target[0])**2 + (training_df.loc[:,'total_spend']-target[1])**2 + (training_df.loc[:,'beverage_categories']-target[2])**2)** 0.5)

In [8]:
training_df.sort_values('distance', ascending = True).head()

Unnamed: 0,relationship_days,total_spend,beverage_categories,segment,distance
15725,103,231.25,12,explorer,228.947865
7301,109,237.5,13,explorer,229.674362
1342,108,261.25,12,explorer,231.272837
2877,101,219.0,9,explorer,232.03246
12017,116,228.5,12,explorer,233.203372


# Building KNN Model

In [9]:
k = 10
training_df = training_df.sort_values('distance', ascending = True)
knn = list(training_df.head(k).segment)
knn

['explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer']

# Reading in Testing Data

In [10]:
testing_df = pd.read_csv("knn-testing-data.txt")

In [11]:
del testing_df['most_popular_category']

#create a testing prediction column
testing_df['prediction']=''


In [12]:
#choose our target customer to compare the rest too
the_cust = testing_df.iloc[0,:]

In [13]:
#calculate my euclidean distance
training_df['distance'] = (((training_df.loc[:,'relationship_days']-the_cust[1])**2 + (training_df.loc[:,'total_spend']-the_cust[2])**2 + (training_df.loc[:,'beverage_categories']-the_cust[3])**2)** 0.5)


# Results 

In [14]:
training_df.sort_values('distance', ascending=True).head()

Unnamed: 0,relationship_days,total_spend,beverage_categories,segment,distance
321,224,144.0,4,explorer,1.414214
2407,224,146.0,2,explorer,3.162278
1823,223,148.0,5,explorer,3.162278
5667,227,140.75,6,explorer,5.297405
11612,222,149.5,7,explorer,5.315073


In [15]:
k = 10
training_df = training_df.sort_values('distance', ascending = True)
knn = list(training_df.head(k).segment)
knn

['explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer',
 'explorer']

# Testing Prediction on One dataset Row

In [16]:
k = 10

prediction = mode(training_df.sort_values('distance', ascending=True).head(k).segment)

In [17]:
testing_df.iloc[0,4] = prediction

In [18]:
testing_df.head()

Unnamed: 0,customer_id,relationship_days,total_spend,beverage_categories,prediction
0,", 9GT0P908DS5VGJYRYG4CHAPSQG",224,145.0,5,explorer
1,", D74N0TBT7D2K6YPYDA734V56Y4",272,152.5,5,
2,", XHN902RSGS7V6ZBWN7FE5QVGA4",371,55.0,3,
3,", YDZ8kOpLlCLDlmom",609,159.0,6,
4,-uNXHhkiA5JEebdj,802,107.0,8,


# Testing Prediction on all dataset Rows

In [23]:
k = 25

#this allows us to iterate over all rows in our testing_df
for row_idx in  range(testing_df.shape[0]):
    
    #the customer is changed every iteration
    the_cust = testing_df.iloc[row_idx,:]
    #distance calculation
    training_df['distance'] = (((training_df.loc[:,'relationship_days']-the_cust[1])**2 + (training_df.loc[:,'total_spend']-the_cust[2])**2 + (training_df.loc[:,'beverage_categories']-the_cust[3])**2)** 0.5)
    # Prediction line
    prediction = mode(training_df.sort_values('distance', ascending=True).head(k).segment)
    #prediction line locking columns
    testing_df.iloc[row_idx,4] = prediction
    
    
    
    #if row_idx>10:
        #break
    
    

# Results 

In [29]:
testing_df.value_counts('prediction')

prediction
newbie        562
loyal core    516
cheapo        451
explorer      260
dtype: int64

In [26]:
testing_df.head(10)

Unnamed: 0,customer_id,relationship_days,total_spend,beverage_categories,prediction
0,", 9GT0P908DS5VGJYRYG4CHAPSQG",224,145.0,5,explorer
1,", D74N0TBT7D2K6YPYDA734V56Y4",272,152.5,5,explorer
2,", XHN902RSGS7V6ZBWN7FE5QVGA4",371,55.0,3,cheapo
3,", YDZ8kOpLlCLDlmom",609,159.0,6,loyal core
4,-uNXHhkiA5JEebdj,802,107.0,8,loyal core
5,00GN9QQ53H639C9EW0KJRKGN5W,213,92.0,2,newbie
6,00M2PX68JH0E2Y44BSXPB8WPR8,590,123.0,12,loyal core
7,00PYAN31NH58CYD297MVV2JQTC,559,94.5,5,cheapo
8,00R2NCKF4X5SCSTW59HRY5VDG8,314,79.0,6,newbie
9,00ZSCPVYFH2TEMWQTV6M8PC3K0,393,378.25,13,loyal core


# Read in Comparison Data

Finally we will read in our true values file, and compare our KNN prediction results to the true values to test our accuracy.

In [27]:
evaluation_df = pd.read_csv("testing-true-values.txt")

# Joining my evaluation_df to my testing_df 

In [32]:
testing_df = testing_df.merge(evaluation_df, on = 'customer_id')

In [36]:
testing_df.head()

Unnamed: 0,customer_id,relationship_days,total_spend,beverage_categories,prediction,segment
0,", 9GT0P908DS5VGJYRYG4CHAPSQG",224,145.0,5,explorer,explorer
1,", D74N0TBT7D2K6YPYDA734V56Y4",272,152.5,5,explorer,explorer
2,", XHN902RSGS7V6ZBWN7FE5QVGA4",371,55.0,3,cheapo,cheapo
3,", YDZ8kOpLlCLDlmom",609,159.0,6,loyal core,loyal core
4,-uNXHhkiA5JEebdj,802,107.0,8,loyal core,loyal core


# Final Results

In [33]:
print(confusion_matrix(testing_df.prediction, testing_df.segment))

[[444   0   7   0]
 [  0 258   1   1]
 [  0   3 513   0]
 [  0   1   0 561]]


In [35]:
print(classification_report(testing.prediction, testing.segment))

              precision    recall  f1-score   support

      cheapo       1.00      0.98      0.99       451
    explorer       0.98      0.99      0.99       260
  loyal core       0.98      0.99      0.99       516
      newbie       1.00      1.00      1.00       562

    accuracy                           0.99      1789
   macro avg       0.99      0.99      0.99      1789
weighted avg       0.99      0.99      0.99      1789



Here we can see the acuracy of my KNN model is .99. 