In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing

# Apply seaborn's default visual theme to every figure drawn after this point.
sns.set()

In [2]:
# Load the raw case dataset; the path is relative to the notebook's working
# directory. The preview below shows the first five rows.
data = pd.read_csv('Canadian Tire Case Dataset.csv')
data.head()

Unnamed: 0,CustomerID,recency,historical customer spend,has_used_discount_in_the_past,has_used_bogo_in_the_past,Urbanity,is_referral,channel,offer,conversion,Unnamed: 10,Example on reading data
0,1,10,142.44,1,0,Surburban,0,store,Buy One Get One,0,,"Customer 1 last shopped 10 months ago, and has..."
1,2,6,329.08,1,1,Rural,1,Web,No Offer,0,,
2,3,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0,,
3,4,9,675.83,1,0,Rural,1,Web,Discount,0,,
4,5,2,45.34,1,0,Urban,0,Web,Buy One Get One,0,,


In [3]:
# CustomerID is just a row identifier, so leave it out of the working frame.
# drop() returns a new DataFrame; `data` itself is not modified.
data_no_id = data.drop(columns=['CustomerID'])

### One-hot encode to handle categorical data

In [4]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each column, so that level is represented by all-zero indicators.
Urbanity_dummies, channel_dummies = (
    pd.get_dummies(data_no_id[column], drop_first=True)
    for column in ('Urbanity', 'channel')
)

In [5]:
# Append the indicator columns to the ID-free frame, side by side (axis=1).
# pd.concat builds a new DataFrame, so no prior copy of data_no_id is needed;
# the earlier `data_no_id.copy()` assignment was overwritten immediately.
data_w_dummies = pd.concat(
    [data_no_id, Urbanity_dummies, channel_dummies],
    axis=1,
)
data_w_dummies.head()

Unnamed: 0,recency,historical customer spend,has_used_discount_in_the_past,has_used_bogo_in_the_past,Urbanity,is_referral,channel,offer,conversion,Unnamed: 10,Example on reading data,Surburban,Urban,Web,store
0,10,142.44,1,0,Surburban,0,store,Buy One Get One,0,,"Customer 1 last shopped 10 months ago, and has...",1,0,0,1
1,6,329.08,1,1,Rural,1,Web,No Offer,0,,,0,0,1,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0,,,1,0,1,0
3,9,675.83,1,0,Rural,1,Web,Discount,0,,,0,0,1,0
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0,,,0,1,1,0


In [6]:
# The original text columns are now redundant: their information lives in the
# indicator columns appended by the previous cell.
data_w_dummies = data_w_dummies.drop(columns=['Urbanity', 'channel'])

In [7]:
# Shorten the verbose column names for ease of use. rename() returns a new
# frame, so data_w_dummies is left untouched.
data_rearranged = data_w_dummies.rename(
    columns={
        'historical customer spend': 'hist_spend',
        'has_used_discount_in_the_past': 'past_discount',
        'has_used_bogo_in_the_past': 'past_bogo',
    }
)
data_rearranged.columns.values

array(['recency', 'hist_spend', 'past_discount', 'past_bogo',
       'is_referral', 'offer', 'conversion', 'Unnamed: 10',
       'Example on reading data', 'Surburban', 'Urban', 'Web', 'store'],
      dtype=object)

In [8]:
# Put the columns in a fixed order, with 'offer' and 'conversion' last.
# Selecting by name also discards every column not listed here, i.e.
# 'Unnamed: 10' and 'Example on reading data'.
data_rearranged = data_rearranged.loc[:, [
    'recency',
    'hist_spend',
    'past_discount',
    'past_bogo',
    'Surburban',
    'Urban',
    'is_referral',
    'Web',
    'store',
    'offer',
    'conversion',
]]
data_rearranged

Unnamed: 0,recency,hist_spend,past_discount,past_bogo,Surburban,Urban,is_referral,Web,store,offer,conversion
0,10,142.44,1,0,1,0,0,0,1,Buy One Get One,0
1,6,329.08,1,1,0,0,1,1,0,No Offer,0
2,7,180.65,0,1,1,0,1,1,0,Buy One Get One,0
3,9,675.83,1,0,0,0,1,1,0,Discount,0
4,2,45.34,1,0,0,1,0,1,0,Buy One Get One,0
...,...,...,...,...,...,...,...,...,...,...,...
63995,10,105.54,1,0,0,1,0,1,0,Discount,0
63996,5,38.91,0,1,0,1,1,0,1,Discount,0
63997,6,29.99,1,0,0,1,1,0,1,Discount,0
63998,1,552.94,1,0,1,0,1,0,0,Buy One Get One,0


### Preprocessing the bogo data

In [9]:
# Keep only the rows whose offer is 'Buy One Get One' or 'No Offer'.
# A boolean mask selects them in one pass and preserves the original row
# labels and order; the previous row-by-row pd.concat loop re-copied the
# growing frame on every match (quadratic in the number of rows).
# .copy() makes data_bogo an independent frame, so the later assignment to
# data_bogo['offer'] does not act on a slice of data_rearranged.
data_bogo = data_rearranged[
    data_rearranged['offer'].isin(['Buy One Get One', 'No Offer'])
].copy()
        


In [11]:
# Preview the filtered rows; the index keeps the row labels of data_rearranged.
data_bogo.head()

Unnamed: 0,recency,hist_spend,past_discount,past_bogo,Surburban,Urban,is_referral,Web,store,offer,conversion
0,10,142.44,1,0,1,0,0,0,1,Buy One Get One,0
1,6,329.08,1,1,0,0,1,1,0,No Offer,0
2,7,180.65,0,1,1,0,1,1,0,Buy One Get One,0
4,2,45.34,1,0,0,1,0,1,0,Buy One Get One,0
5,6,134.83,0,1,1,0,0,0,1,Buy One Get One,1


In [12]:
# Sanity check: list the distinct offer values left after filtering.
data_bogo['offer'].unique()

array(['Buy One Get One', 'No Offer'], dtype=object)

In [None]:
# Encode the offer column as a binary flag: 1 = 'Buy One Get One', 0 = 'No Offer'.
# NOTE: Series.map turns any value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 1/0) would
# blank the column out.
data_bogo['offer'] = data_bogo['offer'].map({
    'Buy One Get One': 1,
    'No Offer': 0,
})

In [None]:
# Export the preprocessed bogo data to CSV. With to_csv's default settings the
# row index is written as the first (unnamed) column of the file.
data_bogo.to_csv('data_bogo.csv')

### Preprocessing the discount data

In [13]:
# NOTE(review): data_discount is only created by the filtering cell that
# follows this one in the file (it was executed earlier, as In [10]). On a
# fresh kernel run top to bottom this line raises NameError -- move this
# preview below that cell.
data_discount.head()

Unnamed: 0,recency,hist_spend,past_discount,past_bogo,Surburban,Urban,is_referral,Web,store,offer,conversion
1,6,329.08,1,1,0,0,1,1,0,No Offer,0
3,9,675.83,1,0,0,0,1,1,0,Discount,0
8,9,675.07,1,1,0,0,1,0,1,Discount,0
13,2,101.64,0,1,0,1,0,1,0,Discount,1
14,4,241.42,0,1,0,0,1,0,0,No Offer,0


In [10]:
# Separate out only the 'Discount' and 'No Offer' rows into a new dataframe.
# A boolean mask selects them in one pass and preserves the original row
# labels and order; the previous row-by-row pd.concat loop re-copied the
# growing frame on every match (quadratic in the number of rows).
# .copy() makes data_discount an independent frame, so the later assignment to
# data_discount['offer'] does not act on a slice of data_rearranged.
data_discount = data_rearranged[
    data_rearranged['offer'].isin(['Discount', 'No Offer'])
].copy()

In [None]:
# Encode the offer column as a binary flag: 1 = 'Discount', 0 = 'No Offer'.
# NOTE: Series.map turns any value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 1/0) would
# blank the column out.
data_discount['offer'] = data_discount['offer'].map({
    'Discount': 1,
    'No Offer': 0,
})

In [None]:
# Export the preprocessed discount data to CSV. The extension was previously
# mistyped as '.csb'; it now matches the '.csv' naming of the bogo export.
# With to_csv's default settings the row index is written as the first column.
data_discount.to_csv('data_disc.csv')