### KNN model 2

We were concerned about the possibility of omitting the varaibles in our model and hence resulting an inaccurate 
modeling. Therefore, we decided to do some feature engineering such as to replace some null with mean values and in the 'Profession' column we filled the null value with a new column which named 'FreeWorker' to see if we were to be able to increase the accuracy rate.


In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# To read the dataset 
df_train= pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')
#cheaking for the data
df_train.head(20)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A
5,461319,Male,Yes,56,No,Artist,0.0,Average,2.0,Cat_6,C
6,460156,Male,No,32,Yes,Healthcare,1.0,Low,3.0,Cat_6,C
7,464347,Female,No,33,Yes,Healthcare,1.0,Low,3.0,Cat_6,D
8,465015,Female,Yes,61,Yes,Engineer,0.0,Low,3.0,Cat_7,D
9,465176,Female,Yes,55,Yes,Artist,1.0,Average,4.0,Cat_6,C


In [3]:
#checking for test data
df_test.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,B
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,A
2,458996,Female,Yes,69,No,,0.0,Low,1.0,Cat_6,A
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6,B
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6,A


In [4]:
df_train.shape

(8068, 11)

In [5]:
#To get a general understanding of the data, differents types and counts
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [6]:
#checking for missing data
df_train.isnull().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [8]:
#adjusting some missing values
#replace them with mean values so as not to drop the entire column
#which may lead to ommition of data,leading to inaccuracy model
df_train['Profession'].fillna('FreeWorker', inplace=True)
df_train['Graduated'].fillna('No', inplace=True)
df_train['Work_Experience'].fillna(df_train['Work_Experience'].mean(),inplace=True)
df_train['Family_Size'].fillna(df_train['Family_Size'].mean(),inplace=True)
df_train['Profession'].fillna('FreeWorker')
#set our target
target=df_train['Segmentation']

In [9]:
#To cheak if all the values are filled before modeling
df_train.isnull().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated            0
Profession           0
Work_Experience      0
Spending_Score       0
Family_Size          0
Var_1               76
Segmentation         0
dtype: int64

In [10]:
#checking for different occupations 
df_train['Profession'].unique()

array(['Healthcare', 'Engineer', 'Lawyer', 'Entertainment', 'Artist',
       'Executive', 'Doctor', 'Homemaker', 'Marketing', 'FreeWorker'],
      dtype=object)

In [11]:
df_train['Gender'] = df_train['Gender'].astype('category')
df_train['Ever_Married'] = df_train['Ever_Married'].astype('category')
df_train['Graduated'] = df_train['Graduated'].astype('category')
df_train['Spending_Score'] = df_train['Spending_Score'].astype('category')
df_train['Segmentation'] = df_train['Segmentation'].astype('category')
df_train['Var_1'] = df_train['Var_1'].astype('category')
df_train['Profession'] = df_train['Profession'].astype('category')

# use .cat.codes on `category` type to map all literals to numeric values

df_train['Gender'] = df_train['Gender'].cat.codes
df_train['Ever_Married'] = df_train['Ever_Married'].cat.codes
df_train['Graduated'] = df_train['Graduated'].cat.codes
df_train['Spending_Score'] = df_train['Spending_Score'].cat.codes
df_train['Segmentation'] = df_train['Segmentation'].cat.codes
df_train['Var_1'] = df_train['Var_1'].cat.codes
df_train['Profession'] = df_train['Profession'].cat.codes

df_train.head(20)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,1,0,22,0,6,1.0,2,4.0,3,3
1,462643,0,1,38,1,2,2.641663,0,3.0,3,0
2,466315,0,1,67,1,2,1.0,2,1.0,5,1
3,461735,1,1,67,1,8,0.0,1,2.0,5,1
4,462669,0,1,40,1,3,2.641663,1,6.0,5,0
5,461319,1,1,56,0,0,0.0,0,2.0,5,2
6,460156,1,0,32,1,6,1.0,2,3.0,5,2
7,464347,0,0,33,1,6,1.0,2,3.0,5,3
8,465015,0,1,61,1,2,0.0,2,3.0,6,3
9,465176,0,1,55,1,0,1.0,0,4.0,5,2


In [12]:
# drop the unused column
df_train.drop('ID', axis=1,inplace=True)
df_train.drop('Segmentation', axis=1,inplace=True)

In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           8068 non-null   int8   
 1   Ever_Married     8068 non-null   int8   
 2   Age              8068 non-null   int64  
 3   Graduated        8068 non-null   int8   
 4   Profession       8068 non-null   int8   
 5   Work_Experience  8068 non-null   float64
 6   Spending_Score   8068 non-null   int8   
 7   Family_Size      8068 non-null   float64
 8   Var_1            8068 non-null   int8   
dtypes: float64(2), int64(1), int8(6)
memory usage: 236.5 KB


###  Applying the same method to test dataset

In [14]:
#adjusting some missing values
#replace them with mean values so as not to drop the entire column
#which may lead to ommition of data,leading to inaccuracy model
df_test['Profession'].fillna('FreeWorker', inplace=True)
df_test['Graduated'].fillna('No', inplace=True)
df_test['Work_Experience'].fillna(df_test['Work_Experience'].mean(),inplace=True)
df_test['Family_Size'].fillna(df_test['Family_Size'].mean(),inplace=True)
df_test['Profession'].fillna('FreeWorker')
#set our target
y_test=df_test['Segmentation']

In [15]:
df_test['Gender'] = df_test['Gender'].astype('category')
df_test['Ever_Married'] = df_test['Ever_Married'].astype('category')
df_test['Graduated'] = df_test['Graduated'].astype('category')
df_test['Spending_Score'] = df_test['Spending_Score'].astype('category')
df_test['Segmentation'] = df_test['Segmentation'].astype('category')
df_test['Var_1'] = df_test['Var_1'].astype('category')
df_test['Profession'] = df_test['Profession'].astype('category')

# use .cat.codes on `category` type to map all literals to numeric values

df_test['Gender'] = df_test['Gender'].cat.codes
df_test['Ever_Married'] = df_test['Ever_Married'].cat.codes
df_test['Graduated'] = df_test['Graduated'].cat.codes
df_test['Spending_Score'] = df_test['Spending_Score'].cat.codes
df_test['Segmentation'] = df_test['Segmentation'].cat.codes
df_test['Var_1'] = df_test['Var_1'].cat.codes
df_test['Profession'] = df_test['Profession'].cat.codes

df_test.head(20)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,458989,0,1,36,1,2,0.0,2,1.0,5,1
1,458994,1,1,37,1,6,8.0,0,4.0,5,0
2,458996,0,1,69,0,5,0.0,2,1.0,5,0
3,459000,1,1,59,0,4,11.0,1,2.0,5,1
4,459001,0,0,19,0,9,2.552587,2,4.0,5,0
5,459003,1,1,47,1,1,0.0,1,5.0,3,2
6,459005,1,1,61,1,1,5.0,2,3.0,5,3
7,459008,0,1,47,1,0,1.0,0,3.0,5,3
8,459013,1,1,50,1,0,2.0,0,4.0,5,1
9,459014,1,0,19,0,6,0.0,2,4.0,5,1


In [16]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2627 entries, 0 to 2626
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               2627 non-null   int64  
 1   Gender           2627 non-null   int8   
 2   Ever_Married     2627 non-null   int8   
 3   Age              2627 non-null   int64  
 4   Graduated        2627 non-null   int8   
 5   Profession       2627 non-null   int8   
 6   Work_Experience  2627 non-null   float64
 7   Spending_Score   2627 non-null   int8   
 8   Family_Size      2627 non-null   float64
 9   Var_1            2627 non-null   int8   
 10  Segmentation     2627 non-null   int8   
dtypes: float64(2), int64(2), int8(7)
memory usage: 100.2 KB


In [7]:
# finding the mean of the values
df_train.mean()

ID                 463479.214551
Age                    43.466906
Work_Experience         2.641663
Family_Size             2.850123
dtype: float64

In [9]:
# find max and min(below) to see if the values
#deviate much from the mean value which will affect the 
#accuracy rate of our model
df_train.max()

ID                 467974
Gender               Male
Age                    89
Work_Experience        14
Spending_Score        Low
Family_Size             9
Segmentation            D
dtype: object

In [10]:
df_train.min()

ID                  458982
Gender              Female
Age                     18
Work_Experience          0
Spending_Score     Average
Family_Size              1
Segmentation             A
dtype: object

In [17]:
# drop the unused  feature columns
df_test.drop('ID', axis=1,inplace=True)
df_test.drop('Segmentation', axis=1,inplace=True)

In [18]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2627 entries, 0 to 2626
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           2627 non-null   int8   
 1   Ever_Married     2627 non-null   int8   
 2   Age              2627 non-null   int64  
 3   Graduated        2627 non-null   int8   
 4   Profession       2627 non-null   int8   
 5   Work_Experience  2627 non-null   float64
 6   Spending_Score   2627 non-null   int8   
 7   Family_Size      2627 non-null   float64
 8   Var_1            2627 non-null   int8   
dtypes: float64(2), int64(1), int8(6)
memory usage: 77.1 KB


In [23]:
%%time
#finding the training time of the model
#train the model
# we are not going to split the data as it was already done with given data
x_train=df_train[['Gender','Ever_Married','Age','Graduated','Profession','Work_Experience','Spending_Score','Family_Size','Var_1']]
y_train=target
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5) 
knn.fit(x_train, y_train) 

Wall time: 69.4 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [20]:
x_train.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,1,0,22,0,6,1.0,2,4.0,3
1,0,1,38,1,2,2.641663,0,3.0,3
2,0,1,67,1,2,1.0,2,1.0,5
3,1,1,67,1,8,0.0,1,2.0,5
4,0,1,40,1,3,2.641663,1,6.0,5


In [21]:
#fit the model
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [26]:
#testing the model
y_pred = knn.predict(df_test)
print(y_pred)
print(y_test)

['A' 'B' 'C' ... 'A' 'D' 'D']
0       B
1       A
2       A
3       B
4       A
       ..
2622    B
2623    A
2624    C
2625    C
2626    A
Name: Segmentation, Length: 2627, dtype: object


In [24]:
%%time
#time to run the accuracy_score
#fiding accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.3269889607917777
Wall time: 6.98 ms


In [25]:
%%time 
# time for to run 15 times
#testing for accuracy rate with diffent k values 
k_array = np.arange(1,30,2)
for k in k_array:
    knn_ex = KNeighborsClassifier(n_neighbors = k)
    knn_ex.fit(x_train, y_train)
    ac = accuracy_score(y_test, knn_ex.predict(df_test))
    print("When k value is ",k)
    print("The accuracy score is ",ac)

When k value is  1
The accuracy score is  0.31252379139703085
When k value is  3
The accuracy score is  0.33269889607917774
When k value is  5
The accuracy score is  0.3269889607917777
When k value is  7
The accuracy score is  0.33536353254663115
When k value is  9
The accuracy score is  0.3338408831366578
When k value is  11
The accuracy score is  0.3418347925390179
When k value is  13
The accuracy score is  0.3338408831366578
When k value is  15
The accuracy score is  0.33612485725161784
When k value is  17
The accuracy score is  0.33536353254663115
When k value is  19
The accuracy score is  0.3349828701941378
When k value is  21
The accuracy score is  0.3410734678340312
When k value is  23
The accuracy score is  0.3365055196041112
When k value is  25
The accuracy score is  0.3357441948991245
When k value is  27
The accuracy score is  0.34145413018652454
When k value is  29
The accuracy score is  0.34449942900647124
Wall time: 3.19 s


In [45]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[273, 210, 173, 190],
       [165, 150, 129, 106],
       [138, 126, 147,  61],
       [193, 129, 148, 289]], dtype=int64)