## 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## 2. Data

In [2]:
df = pd.read_excel(r'data/customer.xlsx')

## 3. EDA

### 3a. Description

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df.sample(5).T

Unnamed: 0,41937,138762,206210,8103,59235
InvoiceNo,539958,548211,554917,537126,541279
StockCode,22032,22558,21671,21242,20679
Description,BOTANICAL LILY GREETING CARD,CLOTHES PEGS RETROSPOT PACK 24,RED SPOT CERAMIC DRAWER KNOB,RED RETROSPOT PLATE,EDWARDIAN PARASOL RED
Quantity,5,48,12,1,3
InvoiceDate,2010-12-23 13:26:00,2011-03-30 09:06:00,2011-05-27 12:29:00,2010-12-05 12:13:00,2011-01-17 11:55:00
UnitPrice,0.43,1.25,1.25,1.69,5.95
CustomerID,,14646.0,15104.0,18118.0,18223.0
Country,United Kingdom,Netherlands,United Kingdom,United Kingdom,United Kingdom


In [5]:
df.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,541909.0,541909,541909.0,406829.0
mean,9.55225,2011-07-04 13:34:57.156386048,4.611114,15287.69057
min,-80995.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 11:34:00,1.25,13953.0
50%,3.0,2011-07-19 17:17:00,2.08,15152.0
75%,10.0,2011-10-19 11:27:00,4.13,16791.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,218.081158,,96.759853,1713.600303


In [6]:
df.shape

(541909, 8)

### 4a. Null Values

In [7]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

### 4b. Unique Values

In [8]:
df.nunique()

InvoiceNo      25900
StockCode       4070
Description     4223
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64

## 5. Preprocessing

In [9]:
df.isnull().sum()/df.shape[0]*100

InvoiceNo       0.000000
StockCode       0.000000
Description     0.268311
Quantity        0.000000
InvoiceDate     0.000000
UnitPrice       0.000000
CustomerID     24.926694
Country         0.000000
dtype: float64

### 5a. Duplicate values

In [10]:
df.Country.unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

### 5b. Null values in Customer ID

- Check for null values for customer ID
- For visualization purposes fill null values with "unknown"/"unregistered" customers

In [11]:
null_inv = df[df['CustomerID'].isna()].InvoiceNo.unique().tolist()

In [12]:
df['CustomerID'].fillna("unknown", inplace=True)

In [13]:
df.query('~CustomerID.isna()')['CustomerID'] = df.query('~CustomerID.isna()')['CustomerID'].apply(lambda x: str(x)[:-2]).tolist()

In [14]:
df['Description'].fillna('no description', inplace=True)

## 6. Visualisations

## 7. Analysis

### 7a. Lifetime Value (LTV)

In [23]:
ltv = df.query('CustomerID!="unknown"')

In [16]:
df_ltv = ltv.groupby('CustomerID').apply(lambda s: pd.Series({
    'Revenue' : round((s['UnitPrice'].sum()*s['Quantity'].sum())),
    'Avg_revenue' : round((s['UnitPrice'].sum()*s['Quantity'].sum()).mean()),
    'Transactions' : s['InvoiceNo'].count(),
    'Retention_time' : (s['InvoiceDate'].max() - s['InvoiceDate'].min()).days
}))

df_ltv['Lifetime_value'] = df_ltv['Avg_revenue']*df_ltv['Transactions']*df_ltv['Retention_time']

In [26]:
print("Average Lifetime Value :", round(df_ltv['Revenue'].sum()/df_ltv.shape[0])*round(df_ltv['Transactions'].mean()))

Average Lifetime Value : 220126257


In [25]:
print("Average Transactions per Customer :", round(df_ltv['Transactions'].mean()))

Average Transactions per Customer : 93


### 7b. Average Revenue Per User (ARPU)

In [22]:
print("ARPU :",round(df_ltv['Revenue'].sum()/df_ltv.shape[0]))

ARPU : 2366949


### 7c. MRR

### 7d. Churned MRR

### 7e. RPR