### Libraries

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import pickle

### Data Import 

In [2]:
# Read the raw purchase records from the CSV file into a dataframe.
df_purchase = pd.read_csv(filepath_or_buffer='purchase data.csv')

### Data Exploration 

In [3]:
# Preview the first five rows of the purchase data.
df_purchase.head(n=5)

Unnamed: 0,ID,Day,Incidence,Brand,Quantity,Last_Inc_Brand,Last_Inc_Quantity,Price_1,Price_2,Price_3,...,Promotion_3,Promotion_4,Promotion_5,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
0,200000001,1,0,0,0,0,0,1.59,1.87,2.01,...,0,0,0,0,0,47,1,110866,1,0
1,200000001,11,0,0,0,0,0,1.51,1.89,1.99,...,0,0,0,0,0,47,1,110866,1,0
2,200000001,12,0,0,0,0,0,1.51,1.89,1.99,...,0,0,0,0,0,47,1,110866,1,0
3,200000001,16,0,0,0,0,0,1.52,1.89,1.98,...,0,0,0,0,0,47,1,110866,1,0
4,200000001,18,0,0,0,0,0,1.52,1.89,1.99,...,0,0,0,0,0,47,1,110866,1,0


In [4]:
# Data-quality check: count the missing (NaN) entries in every column.
# A count of 0 means the column has no missing values.
df_purchase.isna().sum()


ID                   0
Day                  0
Incidence            0
Brand                0
Quantity             0
Last_Inc_Brand       0
Last_Inc_Quantity    0
Price_1              0
Price_2              0
Price_3              0
Price_4              0
Price_5              0
Promotion_1          0
Promotion_2          0
Promotion_3          0
Promotion_4          0
Promotion_5          0
Sex                  0
Marital status       0
Age                  0
Education            0
Income               0
Occupation           0
Settlement size      0
dtype: int64

### Data Segmentation 

#### Import Segmentation model 
We import the previously saved segmentation model so that we can apply it to a different dataset (the purchase data).

- we also need to standardize the data 

In [5]:
# Load the previously saved scaler, PCA and K-means objects from disk.
# 'rb' opens each file for reading in binary mode, which pickle requires.
# Each file is opened in a `with` block so its handle is closed as soon as
# the object has been loaded, instead of being left open.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open('scaler.pickle', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

with open('pca.pickle', 'rb') as pca_file:
    pca = pickle.load(pca_file)

with open('kmeans_pca.pickle', 'rb') as kmeans_pca_file:
    kmeans_pca = pickle.load(kmeans_pca_file)

## Standardization 

In [6]:
# Names of the columns that are passed to the loaded scaler.
# NOTE(review): presumably these must match the columns (and their order)
# the scaler was fitted on; confirm against the notebook that created it.
segmentation_columns = [
    'Sex',
    'Marital status',
    'Age',
    'Education',
    'Income',
    'Occupation',
    'Settlement size',
]
# Select only those columns; the result is a dataframe, not a list.
features = df_purchase[segmentation_columns]
# Standardize the selected columns with the loaded scaler.
df_purchase_segm_std = scaler.transform(features)

## PCA

In [7]:
# Transform the standardized features with the loaded PCA object.
df_purchase_segm_pca = pca.transform(df_purchase_segm_std)


## K-means PCA

In [8]:
# Predict a segment label for every row of the PCA-transformed data
# using the loaded K-means model.
purchase_segm_kmeans_pca = kmeans_pca.predict(df_purchase_segm_pca)

In [9]:
# Work on an independent (deep) copy so that df_purchase itself is not modified.
df_purchase_predictors = df_purchase.copy(deep=True)

In [11]:
# Attach the predicted segment labels as a 'Segment' column, so that one
# master dataframe holds both the purchase data and the segment of each row.
df_purchase_predictors = df_purchase_predictors.assign(
    Segment=purchase_segm_kmeans_pca
)

# Descriptive Analysis by Segments

### Data Analysis by Customer

In [12]:
# Preview the first five rows, now including the 'Segment' column.
df_purchase_predictors.head(n=5)

Unnamed: 0,ID,Day,Incidence,Brand,Quantity,Last_Inc_Brand,Last_Inc_Quantity,Price_1,Price_2,Price_3,...,Promotion_4,Promotion_5,Sex,Marital status,Age,Education,Income,Occupation,Settlement size,Segment
0,200000001,1,0,0,0,0,0,1.59,1.87,2.01,...,0,0,0,0,47,1,110866,1,0,2
1,200000001,11,0,0,0,0,0,1.51,1.89,1.99,...,0,0,0,0,47,1,110866,1,0,2
2,200000001,12,0,0,0,0,0,1.51,1.89,1.99,...,0,0,0,0,47,1,110866,1,0,2
3,200000001,16,0,0,0,0,0,1.52,1.89,1.98,...,0,0,0,0,47,1,110866,1,0,2
4,200000001,18,0,0,0,0,0,1.52,1.89,1.99,...,0,0,0,0,47,1,110866,1,0,2


In [21]:
# Count the rows recorded for each customer ID.
# Steps: keep only ID and Incidence, group by ID, count the rows per group,
# move ID into the index, and rename the counted column to 'N-Visits'
# (spelled with a hyphen; later cells refer to it by exactly this name).
temp1 = (
    df_purchase_predictors[['ID', 'Incidence']]
    .groupby(['ID'], as_index=False)
    .count()
    .set_index('ID')
    .rename(columns={'Incidence': 'N-Visits'})
)
temp1.head()


Unnamed: 0_level_0,N-Visits
ID,Unnamed: 1_level_1
200000001,101
200000002,87
200000003,97
200000004,85
200000005,111


In [22]:
# Sum the Incidence column per customer ID and store it as 'N_Purchases',
# indexed by ID.
temp2 = (
    df_purchase_predictors[['ID', 'Incidence']]
    .groupby(['ID'], as_index=False)
    .sum()
    .set_index('ID')
    .rename(columns={'Incidence': 'N_Purchases'})
)

# Combine the per-customer row counts and Incidence sums; both frames are
# indexed by ID, so the join aligns them on that index.
temp3 = temp1.join(temp2)
temp3.head()


Unnamed: 0_level_0,N-Visits,N_Purchases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
200000001,101,9
200000002,87,11
200000003,97,10
200000004,85,11
200000005,111,13


In [23]:
# Average number of purchases per visit for each customer:
# N_Purchases divided by N-Visits, stored as a new column on temp3.
temp3['Average_N_Purchases'] = temp3['N_Purchases'].div(temp3['N-Visits'])
temp3.head()

Unnamed: 0_level_0,N-Visits,N_Purchases,Average_N_Purchases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200000001,101,9,0.089109
200000002,87,11,0.126437
200000003,97,10,0.103093
200000004,85,11,0.129412
200000005,111,13,0.117117


In [24]:
# Reduce the row-level 'Segment' labels to one value per customer ID by
# taking the mean, indexed by ID.
# NOTE(review): the mean of a cluster label only makes sense if every row of
# a customer carries the same segment; presumably that holds here - confirm.
temp4 = (
    df_purchase_predictors[['ID', 'Segment']]
    .groupby(['ID'], as_index=False)
    .mean()
    .set_index('ID')
)
# Add the per-customer segment to the visit/purchase summary (joined on ID).
df_purchase_descr = temp3.join(temp4)


In [25]:
# Preview the first five rows of the per-customer descriptive table.
df_purchase_descr.head(n=5)

Unnamed: 0_level_0,N-Visits,N_Purchases,Average_N_Purchases,Segment
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
200000001,101,9,0.089109,2.0
200000002,87,11,0.126437,3.0
200000003,97,10,0.103093,2.0
200000004,85,11,0.129412,2.0
200000005,111,13,0.117117,1.0
