## 2. Data Preprocessing
Removing the data issues and making it ready for exploration

### 2.1 Transactions

In [2]:
import numpy as np
import pandas as pd

In [4]:
raw_data = pd.read_excel("KPMG-Sprocket.xlsx", "Transactions")
raw_data.head().transpose()

Unnamed: 0,0,1,2,3,4
transaction_id,1,2,3,4,5
product_id,2,3,37,88,78
customer_id,2950,3120,402,3135,787
transaction_date,2017-02-25 00:00:00,2017-05-21 00:00:00,2017-10-16 00:00:00,2017-08-31 00:00:00,2017-10-01 00:00:00
online_order,0,1,0,0,1
order_status,Approved,Approved,Approved,Approved,Approved
brand,Solex,Trek Bicycles,OHM Cycles,Norco Bicycles,Giant Bicycles
product_line,Standard,Standard,Standard,Standard,Standard
product_class,medium,medium,low,medium,medium
product_size,medium,large,medium,medium,large


In [5]:
raw_data_first_copy = raw_data

In [5]:
raw_data['purchase_month'] = pd.DatetimeIndex(raw_data['transaction_date']).month

In [6]:
feat_encoding_dict = {
    "product_class":  {'nan':0, 'low':1,'medium':2, 'high':3},
    "product_size":  {'nan':0, 'small':1, 'medium':2, 'large':3},
    "order_status": {'Approved':1, 'Cancelled':0},
    "brand": {'Solex':1, 'Trek Bicycles':2, 'OHM Cycles':3, 'Norco Bicycles':4, 'Giant Bicycles':5, 'WeareA2B':6, 'nan':0},
    "product_line": {'nan':0, 'Standard':1, 'Road':2, 'Mountain':3, 'Touring':4}
}

In [7]:
raw_data = raw_data.replace(feat_encoding_dict)

In [8]:
raw_data.head().transpose()

Unnamed: 0,0,1,2,3,4
transaction_id,1,2,3,4,5
product_id,2,3,37,88,78
customer_id,2950,3120,402,3135,787
transaction_date,2017-02-25 00:00:00,2017-05-21 00:00:00,2017-10-16 00:00:00,2017-08-31 00:00:00,2017-10-01 00:00:00
online_order,0,1,0,0,1
order_status,1,1,1,1,1
brand,1,2,3,4,5
product_line,1,1,1,1,1
product_class,2,2,1,2,2
product_size,2,3,2,2,3


In [9]:
raw_data = raw_data.drop(['transaction_id', 'transaction_date', 'product_first_sold_date'], axis=1)

In [10]:
raw_data.head().transpose()

Unnamed: 0,0,1,2,3,4
product_id,2.0,3.0,37.0,88.0,78.0
customer_id,2950.0,3120.0,402.0,3135.0,787.0
online_order,0.0,1.0,0.0,0.0,1.0
order_status,1.0,1.0,1.0,1.0,1.0
brand,1.0,2.0,3.0,4.0,5.0
product_line,1.0,1.0,1.0,1.0,1.0
product_class,2.0,2.0,1.0,2.0,2.0
product_size,2.0,3.0,2.0,2.0,3.0
list_price,71.49,2091.47,1793.43,1198.46,1765.3
standard_cost,53.62,388.92,248.82,381.1,709.48


In [11]:
prep_data = raw_data

In [12]:
prep_data.to_csv('Prep-Transactions.csv', index=False)

### 2.2 Customer Demographic

In [15]:
raw_data = pd.read_excel("KPMG-Sprocket.xlsx", "CustomerDemographic")

Unnamed: 0,0,1,2,3,4
customer_id,1,2,3,4,5
first_name,Laraine,Eli,Arlin,Talbot,Sheila-kathryn
last_name,Medendorp,Bockman,Dearle,,Calton
gender,F,Male,Male,Male,Female
past_3_years_bike_related_purchases,93,81,61,33,56
DOB,1953-10-12 00:00:00,1980-12-16 00:00:00,1954-01-20 00:00:00,1961-10-03 00:00:00,1977-05-13 00:00:00
job_title,Executive Secretary,Administrative Officer,Recruiting Manager,,Senior Editor
job_industry_category,Health,Financial Services,Property,IT,
wealth_segment,Mass Customer,Mass Customer,Mass Customer,Mass Customer,Affluent Customer
deceased_indicator,N,N,N,N,N


In [17]:
raw_data['wealth_segment'].unique()

array(['Mass Customer', 'Affluent Customer', 'High Net Worth'],
      dtype=object)

In [18]:
feat_encoding_dict = {
    "gender": {'F':1, 'Female':1, 'Femal':1, 'U':2, 'Male':0, 'M':0},
    "job_industry_category": {'Health':1, 'Financial Services':2, 'Property':3, 'IT':4, 'nan':0, 'Retail':5, 
                              'Argiculture':6, 'Manufacturing':7, 'Telecommunications':8, 'Entertainment':9},
    "wealth_segment": {'Mass Customer':1, 'Affluent Customer':2, 'High Net Worth':3},
    "deceased_indicator": {'N':0, 'Y':1},
    "owns_car": {"Yes":1, "No":0}
}

In [19]:
raw_data = raw_data.replace(feat_encoding_dict)

In [20]:
raw_data.head().transpose()

Unnamed: 0,0,1,2,3,4
customer_id,1,2,3,4,5
first_name,Laraine,Eli,Arlin,Talbot,Sheila-kathryn
last_name,Medendorp,Bockman,Dearle,,Calton
gender,1,0,0,0,1
past_3_years_bike_related_purchases,93,81,61,33,56
DOB,1953-10-12 00:00:00,1980-12-16 00:00:00,1954-01-20 00:00:00,1961-10-03 00:00:00,1977-05-13 00:00:00
job_title,Executive Secretary,Administrative Officer,Recruiting Manager,,Senior Editor
job_industry_category,1,2,3,4,
wealth_segment,1,1,1,1,2
deceased_indicator,0,0,0,0,0


In [21]:
raw_data_first_copy = raw_data

In [30]:
raw_data.drop(raw_data[ raw_data['deceased_indicator']==1 ].index, axis=0, inplace=True)

In [40]:
raw_data['DOB'] = pd.to_datetime(raw_data['DOB'])

Unnamed: 0,0,1,2,3,4
customer_id,1,2,3,4,5
first_name,Laraine,Eli,Arlin,Talbot,Sheila-kathryn
last_name,Medendorp,Bockman,Dearle,,Calton
gender,1,0,0,0,1
past_3_years_bike_related_purchases,93,81,61,33,56
DOB,1953-10-12 00:00:00,1980-12-16 00:00:00,1954-01-20 00:00:00,1961-10-03 00:00:00,1977-05-13 00:00:00
job_title,Executive Secretary,Administrative Officer,Recruiting Manager,,Senior Editor
job_industry_category,1,2,3,4,
wealth_segment,1,1,1,1,2
deceased_indicator,0,0,0,0,0


In [42]:
raw_data['Age'] = 2017 - raw_data['DOB'].dt.year

In [43]:
raw_data.head().transpose()

Unnamed: 0,0,1,2,3,4
customer_id,1,2,3,4,5
first_name,Laraine,Eli,Arlin,Talbot,Sheila-kathryn
last_name,Medendorp,Bockman,Dearle,,Calton
gender,1,0,0,0,1
past_3_years_bike_related_purchases,93,81,61,33,56
DOB,1953-10-12 00:00:00,1980-12-16 00:00:00,1954-01-20 00:00:00,1961-10-03 00:00:00,1977-05-13 00:00:00
job_title,Executive Secretary,Administrative Officer,Recruiting Manager,,Senior Editor
job_industry_category,1,2,3,4,
wealth_segment,1,1,1,1,2
deceased_indicator,0,0,0,0,0


In [48]:
raw_data = raw_data.drop(['first_name', 'last_name', 'DOB', 'job_title', 'deceased_indicator', 'default'], axis=1)

In [49]:
prep_data = raw_data.copy()
prep_data.head().transpose()

Unnamed: 0,0,1,2,3,4
customer_id,1.0,2.0,3.0,4.0,5.0
gender,1.0,0.0,0.0,0.0,1.0
past_3_years_bike_related_purchases,93.0,81.0,61.0,33.0,56.0
job_industry_category,1.0,2.0,3.0,4.0,
wealth_segment,1.0,1.0,1.0,1.0,2.0
owns_car,1.0,1.0,1.0,0.0,1.0
tenure,11.0,16.0,15.0,7.0,8.0
Age,64.0,37.0,63.0,56.0,40.0


In [51]:
prep_data.to_csv("Prep-CustomerDemographic.csv", index=False)

### 2.3 Customer Address

In [2]:
raw_data = pd.read_excel("KPMG-Sprocket.xlsx", "CustomerAddress")

In [4]:
raw_data.head().transpose()

Unnamed: 0,0,1,2,3,4
customer_id,1,2,4,5,6
address,060 Morning Avenue,6 Meadow Vale Court,0 Holy Cross Court,17979 Del Mar Point,9 Oakridge Court
postcode,2016,2153,4211,2448,3216
state,New South Wales,New South Wales,QLD,New South Wales,VIC
country,Australia,Australia,Australia,Australia,Australia
property_valuation,10,10,9,4,9


In [5]:
raw_data['state'].unique()

array(['New South Wales', 'QLD', 'VIC', 'NSW', 'Victoria'], dtype=object)

In [7]:
feat_encoding_dict = {
    "state": {'New South Wales':2, 'QLD':1, 'VIC':3, 'NSW':2, 'Victoria':3}
}

In [13]:
raw_data = raw_data.replace(feat_encoding_dict)

In [14]:
raw_data = raw_data.drop(['address', 'postcode', 'country'], axis=1)

In [15]:
prep_data = raw_data.copy()
prep_data.head().transpose()

Unnamed: 0,0,1,2,3,4
customer_id,1,2,4,5,6
state,2,2,1,2,3
property_valuation,10,10,9,4,9


In [16]:
prep_data.to_csv("Prep-CustomerAddress.csv")