## Customer Analysis 

In [1]:
# General Dependencies
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
# Load the prepared order data and report (rows, columns)
data_path = 'data/data_ready.csv'
df_data = pd.read_csv(data_path)
print('Dataframe dimensions:', df_data.shape)

Dataframe dimensions: (1781, 18)


In [3]:
# Parse 'Order Date' into datetimes, then discard rows in which every
# column is missing (how='all' keeps partially filled rows).
df_data = (
    df_data
    .assign(**{'Order Date': pd.to_datetime(df_data['Order Date'])})
    .dropna(how='all')
)
print('Dataframe dimensions:', df_data.shape)

Dataframe dimensions: (1752, 18)


In [4]:
# One frame per calendar year, selected on the 'Year' column
is_2017 = df_data['Year'] == 2017
is_2018 = df_data['Year'] == 2018
df_2017 = df_data[is_2017]
df_2018 = df_data[is_2018]

In [5]:
# Size of the 2017 slice followed by summary statistics of its numeric columns
summary_2017 = df_2017.describe()
print('Dataframe dimensions 2017:', df_2017.shape)
print('------------------------------------------------------------------------------------------------')
summary_2017

Dataframe dimensions 2017: (1429, 18)
------------------------------------------------------------------------------------------------


Unnamed: 0,Cost of Goods,Unit Price,Shipping Cost,Profit,Quantity,Sales,Month,Year
count,1429.0,1429.0,1429.0,1429.0,1429.0,1429.0,1429.0,1429.0
mean,180.055143,253.216235,11.622743,113.717075,1.010497,384.286914,5.153954,2017.0
std,621.745415,1009.833999,12.317772,1736.874308,0.160624,4366.991274,2.633831,0.0
min,1.0,21.0,0.0,0.0,1.0,21.0,1.0,2017.0
25%,43.0,82.0,7.5,21.3,1.0,82.0,3.0,2017.0
50%,91.0,141.0,7.5,35.0,1.0,141.0,5.0,2017.0
75%,175.0,240.0,15.0,56.5,1.0,240.0,7.0,2017.0
max,15816.0,26360.0,300.0,50410.0,5.0,126650.0,12.0,2017.0


In [6]:
# Size of the 2018 slice followed by summary statistics of its numeric columns
summary_2018 = df_2018.describe()
print('Dataframe dimensions 2018:', df_2018.shape)
print('------------------------------------------------------------------------------------------------')
summary_2018

Dataframe dimensions 2018: (323, 18)
------------------------------------------------------------------------------------------------


Unnamed: 0,Cost of Goods,Unit Price,Shipping Cost,Profit,Quantity,Sales,Month,Year
count,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0
mean,257.809907,356.733746,11.925697,87.651393,1.034056,363.455108,2.043344,2018.0
std,804.750064,1048.718836,11.870447,275.470921,0.512652,1056.140207,0.786914,0.0
min,3.0,27.0,5.0,0.0,1.0,27.0,1.0,2018.0
25%,64.0,106.0,8.0,27.0,1.0,106.0,1.0,2018.0
50%,134.0,195.0,8.0,46.0,1.0,195.0,2.0,2018.0
75%,205.5,281.0,15.0,68.0,1.0,281.0,3.0,2018.0
max,12315.0,15370.0,178.0,3050.0,10.0,15370.0,3.0,2018.0


## Building a customer table

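Let's learn more about Lei customers by aggregating the order rows into one row per customer: number of orders, total amount spent, total items bought, days since the first order, and days since the most recent order.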

In [7]:

def BuildDF(df, reference_date=None):
    """Aggregate order rows into one row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Order data containing the columns 'Customer ID', 'Order ID',
        'Sales', 'Quantity' and 'Order Date' ('Order Date' must already be
        a datetime column).
    reference_date : datetime-like, optional
        Date the day counts are measured against. Defaults to the latest
        'Order Date' found in ``df``. Pass an explicit date to measure
        several frames against the same day.

    Returns
    -------
    pandas.DataFrame
        One row per customer, ordered by customer id, with the columns:

        - CustomerID     : the 'Customer ID' group key
        - NumberOrders   : count of non-null 'Order ID' values (frequency)
        - TotalSpent     : sum of 'Sales' (monetary value)
        - TotalItems     : sum of 'Quantity'
        - DaysAsCustomer : 1 + whole days from the first order to the
                           reference date
        - DaysNoPurchase : 1 + whole days from the most recent order to
                           the reference date (recency)

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    required = ['Customer ID', 'Order ID', 'Sales', 'Quantity', 'Order Date']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError('BuildDF: missing required column(s): {}'.format(missing))

    if reference_date is None:
        reference_date = df['Order Date'].max()
    else:
        reference_date = pd.Timestamp(reference_date)

    # A single grouped pass replaces five separate groupbys and four merges.
    # NOTE(review): 'count' tallies rows, so an order that spans several rows
    # is counted once per row; switch to 'nunique' if rows are order lines.
    df_customers = (
        df.groupby('Customer ID', as_index=False)
          .agg(
              NumberOrders=('Order ID', 'count'),
              TotalSpent=('Sales', 'sum'),
              TotalItems=('Quantity', 'sum'),
              EarliestOrder=('Order Date', 'min'),
              LastPurchase=('Order Date', 'max'),
          )
          .rename(columns={'Customer ID': 'CustomerID'})
    )

    # .dt.days gives whole days; astype('timedelta64[D]') is rejected by
    # pandas >= 2.0, which only supports s/ms/us/ns timedelta resolutions.
    df_customers['DaysAsCustomer'] = 1 + (reference_date - df_customers['EarliestOrder']).dt.days
    df_customers['DaysNoPurchase'] = 1 + (reference_date - df_customers['LastPurchase']).dt.days

    # The raw dates were only needed to derive the day counts.
    return df_customers.drop(columns=['EarliestOrder', 'LastPurchase'])

In [8]:
# Build the per-customer table for each year, then summarise the 2017 one
df_customers17, df_customers18 = (BuildDF(orders) for orders in (df_2017, df_2018))

df_customers17.describe()

Unnamed: 0,NumberOrders,TotalSpent,TotalItems,DaysAsCustomer,DaysNoPurchase
count,494.0,494.0,494.0,494.0,494.0
mean,2.892713,1111.631579,2.923077,221.882591,173.1417
std,4.011978,10460.87798,4.027561,84.94708,88.869598
min,1.0,29.0,1.0,1.0,1.0
25%,1.0,142.0,1.0,184.0,123.0
50%,2.0,327.5,2.0,215.0,184.0
75%,3.0,705.25,3.0,276.0,215.0
max,36.0,232090.0,36.0,335.0,335.0


In [9]:
# Summary statistics of the 2018 per-customer table
df_customers18.describe()

Unnamed: 0,NumberOrders,TotalSpent,TotalItems,DaysAsCustomer,DaysNoPurchase
count,182.0,182.0,182.0,182.0,182.0
mean,1.774725,645.032967,1.835165,34.681319,27.241758
std,1.50829,2330.199847,1.75747,23.566002,23.913445
min,1.0,41.0,1.0,1.0,1.0
25%,1.0,139.5,1.0,29.0,1.0
50%,1.0,257.0,1.0,29.0,29.0
75%,2.0,480.25,2.0,60.0,60.0
max,10.0,30496.0,14.0,60.0,60.0


In [10]:
# Remove the single extreme spender (TotalSpent == 232090, the maximum of the
# 2017 customer table) so that it does not dominate the summary statistics.
# The cut-off is a fixed constant rather than TotalSpent.max() so that
# re-running this cell cannot strip further customers.
# The sort_values() call that used to precede the filter was removed: its
# result was discarded, so it neither reordered nor displayed anything.
TOP_SPENDER_2017 = 232090
df_customers17 = df_customers17[df_customers17['TotalSpent'] < TOP_SPENDER_2017]
df_customers17.describe()

Unnamed: 0,NumberOrders,TotalSpent,TotalItems,DaysAsCustomer,DaysNoPurchase
count,493.0,493.0,493.0,493.0,493.0
mean,2.894523,643.115619,2.910751,221.772819,173.306288
std,4.015852,997.711351,4.022314,84.998285,88.884468
min,1.0,29.0,1.0,1.0,1.0
25%,1.0,142.0,1.0,184.0,123.0
50%,2.0,326.0,2.0,215.0,184.0
75%,3.0,703.0,3.0,276.0,215.0
max,36.0,9800.0,36.0,335.0,335.0


In [11]:
# Show the 2018 per-customer summary again; the table has not been modified
# since it was built.
df_customers18.describe()

Unnamed: 0,NumberOrders,TotalSpent,TotalItems,DaysAsCustomer,DaysNoPurchase
count,182.0,182.0,182.0,182.0,182.0
mean,1.774725,645.032967,1.835165,34.681319,27.241758
std,1.50829,2330.199847,1.75747,23.566002,23.913445
min,1.0,41.0,1.0,1.0,1.0
25%,1.0,139.5,1.0,29.0,1.0
50%,1.0,257.0,1.0,29.0,29.0
75%,2.0,480.25,2.0,60.0,60.0
max,10.0,30496.0,14.0,60.0,60.0


In [12]:
# Remove the single extreme spender (TotalSpent == 30496, the maximum of the
# 2018 customer table) so that it does not dominate the summary statistics.
# The cut-off is a fixed constant rather than TotalSpent.max() so that
# re-running this cell cannot strip further customers.
# The sort_values() call that used to precede the filter was removed: its
# result was discarded, so it neither reordered nor displayed anything.
TOP_SPENDER_2018 = 30496
df_customers18 = df_customers18[df_customers18['TotalSpent'] < TOP_SPENDER_2018]
df_customers18.describe()

Unnamed: 0,NumberOrders,TotalSpent,TotalItems,DaysAsCustomer,DaysNoPurchase
count,181.0,181.0,181.0,181.0,181.0
mean,1.762431,480.110497,1.823204,34.867403,27.38674
std,1.503301,694.406788,1.754902,23.496905,23.899434
min,1.0,41.0,1.0,1.0,1.0
25%,1.0,139.0,1.0,29.0,1.0
50%,1.0,256.0,1.0,29.0,29.0
75%,2.0,457.0,2.0,60.0,60.0
max,10.0,5870.0,14.0,60.0,60.0


In [13]:
# Persist both per-customer tables (without the index) for the forecasting step
exports = (
    (df_customers17, 'data/cust17.csv'),
    (df_customers18, 'data/cust18.csv'),
)
for customers, csv_path in exports:
    customers.to_csv(csv_path, index=False)