# **ML Project based on KNN Classification**

>Analyse Past Purchase behavior Dataset

>Feature Engineering - releavent feature selection and data reformation(scaling,prediciton performance maximise)

>Traning a model - Put processed data for Training (80%)

>Testing the model - Test with trainned data for selection of suitable model (20%)

>Prediction for new customers

## **Import Libraries and Load dataset**

In [61]:
import pandas as pd
import numpy as np


In [62]:
df = pd.read_csv('https://raw.githubusercontent.com/futurexskill/projects/main/knn-classification/purchase_history.csv')

In [63]:
df.head()

Unnamed: 0,Customer ID,Gender,Age,Salary,Product ID,Price,Purchased
0,1,Female,49,61000,P01,2000,1
1,2,Male,36,30000,P02,2000,1
2,3,Female,26,81000,P02,3500,1
3,4,Female,32,74000,P01,7000,0
4,5,Female,42,56000,P01,5000,0


In [64]:
df.count()

Customer ID    1000
Gender         1000
Age            1000
Salary         1000
Product ID     1000
Price          1000
Purchased      1000
dtype: int64

In [65]:
df.shape

(1000, 7)

In [66]:
#Machine learning model only works on Numerical data hence we need to transform Gender column.
#Methods = 1. One Hot Encoding & 2. get dummies (pandas)
gender_encoded = pd.get_dummies(df['Gender'])

In [67]:
gender_encoded

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
995,0,1
996,0,1
997,1,0
998,0,1


In [68]:
#To selesct male column only
gender_encoded = pd.get_dummies(df['Gender'],drop_first=True)
gender_encoded # 1 = male & 0 = female

Unnamed: 0,Male
0,0
1,1
2,0
3,0
4,0
...,...
995,1
996,1
997,0
998,1


In [69]:
new_df = pd.concat([df,gender_encoded],axis=1)
new_df

Unnamed: 0,Customer ID,Gender,Age,Salary,Product ID,Price,Purchased,Male
0,1,Female,49,61000,P01,2000,1,0
1,2,Male,36,30000,P02,2000,1,1
2,3,Female,26,81000,P02,3500,1,0
3,4,Female,32,74000,P01,7000,0,0
4,5,Female,42,56000,P01,5000,0,0
...,...,...,...,...,...,...,...,...
995,996,Male,21,73000,P02,5000,1,1
996,997,Male,62,54000,P01,5000,0,1
997,998,Female,41,20000,P03,2000,0,0
998,999,Male,22,29000,P03,5000,0,1


##**Seperate X & Y**

In [70]:
x = new_df[['Male','Age','Salary','Price']].to_numpy()
x #Independent variable


array([[    0,    49, 61000,  2000],
       [    1,    36, 30000,  2000],
       [    0,    26, 81000,  3500],
       ...,
       [    0,    41, 20000,  2000],
       [    1,    22, 29000,  5000],
       [    0,    49, 36000,  3500]])

In [71]:
y = new_df['Purchased'].to_numpy()
y #Dependent/ouput variable


array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

## **Data Splitting**

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=100)

In [74]:
len(x_train)

800

In [75]:
len(y_train)

800

In [76]:
len(x_test)

200

In [77]:
len(y_test)

200

## **Standard Scaler**

In [78]:
from sklearn.preprocessing import StandardScaler
#StandardScaler transforms data which has mean=0 & Std.Deviation=1

In [79]:
sc = StandardScaler()

In [80]:
x_train = sc.fit_transform(x_train)
x_train

array([[-0.95357636,  1.57825851, -1.65073628, -0.50985029],
       [-0.95357636, -0.23208619,  0.77470514,  1.39359079],
       [-0.95357636,  1.28860336,  0.05132787,  0.30591017],
       ...,
       [-0.95357636,  0.05756896, -0.03377534,  0.30591017],
       [-0.95357636,  1.43343093,  0.09387948, -0.50985029],
       [-0.95357636, -0.59415513,  1.66828882, -0.50985029]])

In [85]:
x_test = sc.fit_transform(x_test)
x_test

array([[-0.96076892,  0.88735767,  1.40981106,  0.30456033],
       [-0.96076892,  0.73790796,  1.15984456,  1.36852653],
       [-0.96076892, -0.08406546, -1.21483719, -1.29138898],
       [-0.96076892, -0.5324146 ,  1.36814998, -0.49341433],
       [-0.96076892,  0.51373339,  0.03499531, -1.29138898],
       [-0.96076892,  1.33570681, -0.83988744, -1.29138898],
       [-0.96076892,  0.51373339, -0.63158202, -1.29138898],
       [ 1.040833  ,  0.21483396,  0.53492831, -0.49341433],
       [-0.96076892, -1.42911288,  0.07665639,  1.36852653],
       [ 1.040833  , -0.08406546, -0.67324311,  1.36852653],
       [ 1.040833  ,  1.63460624, -0.67324311,  0.30456033],
       [ 1.040833  ,  0.88735767, -0.79822636,  0.30456033],
       [ 1.040833  ,  1.63460624,  0.40994506,  1.36852653],
       [-0.96076892, -1.57856259, -1.50646477,  1.36852653],
       [ 1.040833  ,  0.06538425, -0.92320961, -1.29138898],
       [ 1.040833  , -0.90603888, -1.33982044,  1.36852653],
       [ 1.040833  ,  0.

## **Apply KNN**

In [82]:
from sklearn.neighbors import KNeighborsClassifier

In [83]:
k = 5
knn = KNeighborsClassifier(k)

## **KNN model training**

In [84]:
knn.fit(x_train,y_train)

## **KNN model testing**

In [86]:
y_pred = knn.predict(x_test)
y_pred #Predicted

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1])

In [88]:
y_test #Actual

array([1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1])

## **Model accuracy check**

In [89]:
from sklearn.metrics import accuracy_score

In [90]:
accuracy = accuracy_score(y_test,y_pred)*100
accuracy

79.0

## **Save Model & Scaler with pickle**

In [91]:
import pickle


In [93]:
with open('knn_model.pickle','wb') as f:
  pickle.dump(knn,f) #File saved at left panel(Files)

In [96]:
with open('scaler.pickle','wb') as f:
  pickle.dump(sc,f) #File saved at left panel(Files)

In [97]:
#Linux command to check files avalible in current directory
!ls

knn_model.pickle  sample_data  scaler.pickle


# **Read pickle files (for new notebook if files wont availble at current directory)**

In [101]:
import pickle
with open('knn_model.pickle','rb') as f:
  knn_new = pickle.load(f)

In [102]:
with open('scaler.pickle','rb') as f:
  scaler_new = pickle.load(f)

# Load new customer data

In [98]:
df2 = pd.read_csv('https://raw.githubusercontent.com/futurexskill/projects/main/knn-classification/new_customers.csv')

In [99]:
df2

Unnamed: 0,Gender,Age,Salary,Product ID,Price
0,Male,24,38652,3,4446
1,Female,47,64727,1,5188
2,Male,46,54648,1,6193
3,Male,31,20018,3,4779
4,Male,58,77731,2,6855
5,Male,28,55478,1,6153
6,Female,48,54211,2,7588
7,Male,31,60076,3,5238
8,Female,42,29290,1,5418
9,Male,50,21445,1,5598


In [100]:
df2.shape

(10, 5)

### **Data Reform**

In [107]:

gender1 = pd.get_dummies(df2['Gender'],drop_first=True)
gender1

Unnamed: 0,Male
0,1
1,0
2,1
3,1
4,1
5,1
6,0
7,1
8,0
9,1


In [108]:
df2_new = pd.concat([df2,gender1],axis=1)
df2_new

Unnamed: 0,Gender,Age,Salary,Product ID,Price,Male
0,Male,24,38652,3,4446,1
1,Female,47,64727,1,5188,0
2,Male,46,54648,1,6193,1
3,Male,31,20018,3,4779,1
4,Male,58,77731,2,6855,1
5,Male,28,55478,1,6153,1
6,Female,48,54211,2,7588,0
7,Male,31,60076,3,5238,1
8,Female,42,29290,1,5418,0
9,Male,50,21445,1,5598,1


### **Data Splitting into X**

In [111]:
x_new = df2_new[['Male','Age','Salary','Price']].to_numpy()
x_new

array([[    1,    24, 38652,  4446],
       [    0,    47, 64727,  5188],
       [    1,    46, 54648,  6193],
       [    1,    31, 20018,  4779],
       [    1,    58, 77731,  6855],
       [    1,    28, 55478,  6153],
       [    0,    48, 54211,  7588],
       [    1,    31, 60076,  5238],
       [    0,    42, 29290,  5418],
       [    1,    50, 21445,  5598]])

### **New X Scaler**

In [114]:
x_new_scale = scaler_new.fit_transform(x_new)
x_new_scale

array([[ 0.65465367, -1.54774434, -0.48875298, -1.42543997],
       [-1.52752523,  0.60971747,  0.93112246, -0.61159228],
       [ 0.65465367,  0.51591478,  0.38228547,  0.49072164],
       [ 0.65465367, -0.89112553, -1.50343981, -1.06019565],
       [ 0.65465367,  1.64154703,  1.63923599,  1.21682294],
       [ 0.65465367, -1.17253359,  0.42748189,  0.44684845],
       [-1.52752523,  0.70352015,  0.35848928,  2.02079916],
       [ 0.65465367, -0.89112553,  0.67785916, -0.55675079],
       [-1.52752523,  0.14070403, -0.99854681, -0.35932143],
       [ 0.65465367,  0.89112553, -1.42573465, -0.16189207]])

### **New Y predicition on new KNN model**

In [117]:
y_pred_new = knn_new.predict(x_new_scale)
y_pred_new

array([1, 1, 0, 0, 0, 1, 0, 0, 0, 0])

In [121]:
#Add new prediciton value column to new customer dataset
df2_new['Will Purchase'] = y_pred_new
df2_new

Unnamed: 0,Gender,Age,Salary,Product ID,Price,Male,Will Purchase
0,Male,24,38652,3,4446,1,1
1,Female,47,64727,1,5188,0,1
2,Male,46,54648,1,6193,1,0
3,Male,31,20018,3,4779,1,0
4,Male,58,77731,2,6855,1,0
5,Male,28,55478,1,6153,1,1
6,Female,48,54211,2,7588,0,0
7,Male,31,60076,3,5238,1,0
8,Female,42,29290,1,5418,0,0
9,Male,50,21445,1,5598,1,0


### **Save new predicted data to csv file**

In [122]:
df2_new.to_csv('New customer Predicted.csv',index=False)
#Share this csv file with Marketing Dept..