## 1. Import Libraries

In [62]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## 2. Data

In [9]:
df = pd.read_excel(r'data/customer.xlsx')

## 3. EDA

### 3a. Description

In [10]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [11]:
df.sample(5).T

Unnamed: 0,357569,364487,442985,425272,143511
InvoiceNo,568100,568662,574684,573314,548703
StockCode,23188,23333,22727,22112,84755
Description,VINTAGE 2 METER FOLDING RULER,IVORY WICKER HEART MEDIUM,ALARM CLOCK BAKELIKE RED,CHOCOLATE HOT WATER BOTTLE,COLOUR GLASS T-LIGHT HOLDER HANGING
Quantity,12,1,4,5,6
InvoiceDate,2011-09-23 15:20:00,2011-09-28 12:36:00,2011-11-06 12:56:00,2011-10-28 17:26:00,2011-04-03 11:37:00
UnitPrice,1.65,1.25,3.75,4.95,0.65
CustomerID,15475.0,16764.0,17581.0,16987.0,16010.0
Country,United Kingdom,United Kingdom,United Kingdom,United Kingdom,United Kingdom


In [12]:
df.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,541909.0,541909,541909.0,406829.0
mean,9.55225,2011-07-04 13:34:57.156386048,4.611114,15287.69057
min,-80995.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 11:34:00,1.25,13953.0
50%,3.0,2011-07-19 17:17:00,2.08,15152.0
75%,10.0,2011-10-19 11:27:00,4.13,16791.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,218.081158,,96.759853,1713.600303


In [13]:
df.shape

(541909, 8)

### 4a. Null Values

In [14]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

### 4b. Unique Values

In [15]:
df.nunique()

InvoiceNo      25900
StockCode       4070
Description     4223
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64

## 5. Preprocessing

In [16]:
df.isnull().sum()/df.shape[0]*100

InvoiceNo       0.000000
StockCode       0.000000
Description     0.268311
Quantity        0.000000
InvoiceDate     0.000000
UnitPrice       0.000000
CustomerID     24.926694
Country         0.000000
dtype: float64

### 5a. Duplicate values

In [17]:
df.Country.unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

### 5b. Null values in Customer ID

- Check for null values for customer ID
- For visualization purposes fill null values with "unknown"/"unregistered" customers

In [18]:
null_inv = df[df['CustomerID'].isna()].InvoiceNo.unique().tolist()

In [19]:
df['CustomerID'].fillna("unknown", inplace=True)

In [44]:
df.query('~CustomerID.isna()')['CustomerID'] = df.query('~CustomerID.isna()')['CustomerID'].apply(lambda x: str(x)[:-2]).tolist()

In [60]:
df['Description'].fillna('no description', inplace=True)

## 6. Visualisations

## 7. Analysis

### 7a. Lifetime Value (LTV)

In [68]:
ltv = df.query('CustomerID!="unknown"')

In [69]:
ltv.groupby('CustomerID').apply(lambda s: pd.Series({
    'CustomerValue' : round(s['UnitPrice'].sum()*s['Quantity'].sum())
}))

Unnamed: 0_level_0,CustomerValue
CustomerID,Unnamed: 1_level_1
12346.0,0
12347.0,1182814
12348.0,418360
12349.0,381818
12350.0,12864
...,...
18280.0,2144
18281.0,2125
18282.0,6143
18283.0,1705639


### 7b. Average Revenue Per User (ARPU)

### 7c. MRR

### 7d. Churned MRR

### 7e. RPR