In [68]:
import pandas as pd
import numpy as np
from datetime import time, datetime

# Transactions

In [2]:
# Load the transactions data file (CSV, read from the working directory) into a dataframe
df_trans = pd.read_csv('Transactions.csv')

In [3]:
# Summarise the transactions dataframe: column names, non-null counts and dtypes
df_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   transaction_id           20000 non-null  int64  
 1   product_id               20000 non-null  int64  
 2   customer_id              20000 non-null  int64  
 3   transaction_date         20000 non-null  object 
 4   online_order             19640 non-null  object 
 5   order_status             20000 non-null  object 
 6   brand                    19803 non-null  object 
 7   product_line             19803 non-null  object 
 8   product_class            19803 non-null  object 
 9   product_size             19803 non-null  object 
 10  list_price               20000 non-null  float64
 11  standard_cost            19803 non-null  object 
 12  product_first_sold_date  19803 non-null  float64
dtypes: float64(2), int64(3), object(8)
memory usage: 1.4+ MB


In [4]:
# Preview the first five rows of the raw transactions data
df_trans.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2/25/2017,False,Approved,Solex,Standard,medium,medium,71.49,$53.62,41245.0
1,2,3,3120,5/21/2017,True,Approved,Trek Bicycles,Standard,medium,large,2091.47,$388.92,41701.0
2,3,37,402,10/16/2017,False,Approved,OHM Cycles,Standard,low,medium,1793.43,$248.82,36361.0
3,4,88,3135,8/31/2017,False,Approved,Norco Bicycles,Standard,medium,medium,1198.46,$381.10,36145.0
4,5,78,787,10/1/2017,True,Approved,Giant Bicycles,Standard,medium,large,1765.3,$709.48,42226.0


In [5]:
# Convert the transaction_date column from text to datetime64.
# The raw values are month/day/year strings (e.g. "2/25/2017"). Stating the
# format explicitly makes parsing strict: a value that does not match raises
# an error instead of being silently guessed (e.g. read as day-first).
df_trans['transaction_date'] = pd.to_datetime(df_trans['transaction_date'], format='%m/%d/%Y')

# It's probably better to have three separate columns for year, month, day for easier aggregation / data verification

In [6]:
# Convert standard_cost from text such as "$1,759.85" to float.
#
# regex=False is passed explicitly: '$' is a regex metacharacter (end-of-string
# anchor) and the default value of `regex` differs between pandas versions, so
# relying on the default makes the result version-dependent.
# Missing values stay NaN through the .str methods and the float cast.
df_trans['standard_cost'] = (
    df_trans['standard_cost']
    .str.replace('$', '', regex=False)   # drop the currency symbol
    .str.replace(',', '', regex=False)   # drop thousands separators
    .astype(float)                       # convert once the extra characters are gone
)

In [7]:
# Preview the first five rows again, after the type conversions
df_trans.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,False,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0
1,2,3,3120,2017-05-21,True,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0
2,3,37,402,2017-10-16,False,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0
3,4,88,3135,2017-08-31,False,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0
4,5,78,787,2017-10-01,True,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0


In [8]:
# Re-display the dataframe summary to check the dtypes after the conversions
df_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  object        
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [31]:
# Check whether transaction_id runs 1, 2, ..., n (n = number of rows) in order
trans_ids = df_trans['transaction_id'].tolist()
compare_trans_ids = range(1, len(df_trans) + 1)

# True only when every id matches its expected 1-based position
list(compare_trans_ids) == trans_ids

20000

In [10]:
# List the distinct order_status values to spot unexpected categories
df_trans['order_status'].unique()

array(['Approved', 'Cancelled'], dtype=object)

In [11]:
# List the distinct product_class values (missing values show up as nan)
df_trans['product_class'].unique()

array(['medium', 'low', 'high', nan], dtype=object)

In [12]:
# List the distinct online_order values (missing values show up as nan)
df_trans['online_order'].unique()

array([False, True, nan], dtype=object)

In [13]:
# List the distinct product_size values (missing values show up as nan)
df_trans['product_size'].unique()

array(['medium', 'large', 'small', nan], dtype=object)

In [14]:
# Determine the range of list_price as a (minimum, maximum) pair
df_trans['list_price'].min(), df_trans['list_price'].max()

(12.01, 2091.47)

In [15]:
# Determine the range of standard_cost as a (minimum, maximum) pair
df_trans['standard_cost'].min(), df_trans['standard_cost'].max()

(7.21, 1759.85)

In [16]:
# Checking for all possible product_line values
df_trans['product_line'].unique()

array(['Standard', 'Road', 'Mountain', 'Touring', nan], dtype=object)

In [17]:
# List the distinct brand values (missing values show up as nan)
df_trans['brand'].unique()

array(['Solex', 'Trek Bicycles', 'OHM Cycles', 'Norco Bicycles',
       'Giant Bicycles', 'WeareA2B', nan], dtype=object)

# CustomerAddress

In [19]:
# Load the customer address data file (CSV, read from the working directory) into a dataframe

df_cust_addr = pd.read_csv('CustomerAddress.csv')

In [20]:
# Summarise the customer address dataframe: column names, non-null counts and dtypes
df_cust_addr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   int64 
 1   address             3999 non-null   object
 2   postcode            3999 non-null   int64 
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 140.7+ KB


In [21]:
# Preview the first five rows of the customer address data
df_cust_addr.head()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
0,1,060 Morning Avenue,2016,New South Wales,Australia,10
1,2,6 Meadow Vale Court,2153,New South Wales,Australia,10
2,4,0 Holy Cross Court,4211,QLD,Australia,9
3,5,17979 Del Mar Point,2448,New South Wales,Australia,4
4,6,9 Oakridge Court,3216,VIC,Australia,9


In [23]:
# List the distinct country values
df_cust_addr['country'].unique()

array(['Australia'], dtype=object)

In [36]:
# Determine whether customer_ids run sequentially from 1 to the number of rows
customer_ids = df_cust_addr['customer_id'].to_list()
compare_cust_ids = range(1, df_cust_addr.shape[0]+1, 1)

# Compare this table's own ids against the expected sequence.
# (trans_ids belongs to the Transactions section and must not be used here:
# it makes the check meaningless and ties this cell to unrelated kernel state.)
customer_ids == list(compare_cust_ids)

False

# Customer Demographic

In [24]:
# Placeholder names col0 .. col12 for the 13 columns of the demographic file
columns = [f'col{idx}' for idx in range(13)]

# Load the data file into a dataframe under the placeholder names.
# header=0 makes pandas consume the first row of the file as a header and
# substitute `names` for it.
# NOTE(review): if the first row of the file is actually a data record (i.e. the
# file has no header line at all), header=0 drops that record and header=None
# would be needed instead; confirm against the raw file.
df_cust_demo = pd.read_csv('CustomerDemographic.csv', names=columns, header=0)

In [25]:
# Summarise the customer demographic dataframe: column names, non-null counts and dtypes
df_cust_demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col0    4000 non-null   object
 1   col1    4000 non-null   object
 2   col2    3875 non-null   object
 3   col3    4000 non-null   object
 4   col4    4000 non-null   object
 5   col5    3913 non-null   object
 6   col6    3494 non-null   object
 7   col7    3344 non-null   object
 8   col8    4000 non-null   object
 9   col9    4000 non-null   object
 10  col10   3698 non-null   object
 11  col11   4000 non-null   object
 12  col12   3913 non-null   object
dtypes: object(13)
memory usage: 203.2+ KB


In [26]:
# Preview the first five rows of the customer demographic data
df_cust_demo.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12
0,1368,Aarika,Magog,Female,54,4/14/1973,,Property,Mass Customer,N,-1.00E+02,No,4
1,508,Abagail,Tordiffe,Female,94,1/18/1971,Project Manager,Property,Mass Customer,N,é¨è½æ ¼,Yes,11
2,57,Abba,Masedon,M,87,6/13/1988,Chief Design Engineer,,Mass Customer,N,1.00E+02,Yes,13
3,2413,Abbey,Murrow,Male,27,8/11/1943,Environmental Specialist,Manufacturing,High Net Worth,N,á,Yes,17
4,3409,Abbey,Nellen,Female,75,4/29/1977,Desktop Support Technician,Argiculture,Mass Customer,N,($1.00),No,16


In [38]:
# Purpose of col4, col5, col7, col8, col10, col11, col12?
# Inconsistency in col3 (e.g. 'Female', 'Male' and 'M' all appear)

# NewCustomerList

## Null values
* last_name
* DOB
* job_title
* job_industry_category

## Relevancy of:
* job_title
* job_industry_category
* deceased_indicator
* tenure
* property_valuation
* Rank (with respect to what)
* Value (Is this home value?) 

In [43]:
# Load the new customer list data file (CSV, read from the working directory) into a dataframe
df_new_cust_list = pd.read_csv('NewCustomerList.csv') 

In [44]:
# In the original Excel datafile, there are five hidden columns (17, 18, 19, 20, 21)
# TODO: confirm whether these values are important

# Summarise the new customer list dataframe: column names, non-null counts and dtypes
df_new_cust_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   DOB                                  983 non-null    object 
 5   job_title                            894 non-null    object 
 6   job_industry_category                835 non-null    object 
 7   wealth_segment                       1000 non-null   object 
 8   deceased_indicator                   1000 non-null   object 
 9   owns_car                             1000 non-null   object 
 10  tenure                               1000 non-null   int64  
 11  address                        

In [120]:
# Derive an approximate age in whole years: current calendar year minus the
# birth year, where the birth year is the last four characters of the DOB string.
# Rows with a missing DOB keep a missing age; the nullable Int64 dtype lets the
# column hold integers alongside missing values.
current_year = datetime.now().year
df_new_cust_list['age'] = (
    current_year - df_new_cust_list['DOB'].str[-4:].astype(float)
).astype('Int64')

In [121]:
# Preview the first five rows, including the newly added age column
df_new_cust_list.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value,age
0,Chickie,Brister,Male,86,7/12/1957,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875,63
1,Morly,Genery,Male,69,3/22/1970,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.71875,50
2,Ardelis,Forrester,Female,10,8/28/1974,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.71875,46
3,Lucine,Stutt,Female,64,1/28/1979,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125,41
4,Melinda,Hadlee,Female,34,9/21/1965,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125,55


In [122]:
# Re-display the dataframe summary after adding the age column
df_new_cust_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   DOB                                  983 non-null    object 
 5   job_title                            894 non-null    object 
 6   job_industry_category                835 non-null    object 
 7   wealth_segment                       1000 non-null   object 
 8   deceased_indicator                   1000 non-null   object 
 9   owns_car                             1000 non-null   object 
 10  tenure                               1000 non-null   int64  
 11  address                        