# KMeans Clustering ALGO

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv('OnlineRetail.csv', encoding='unicode_escape')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
df.shape

(541909, 8)

In [4]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [5]:
# df.dropna()

In [6]:
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [7]:
df['CustomerID'] = df['CustomerID'].astype(str)

In [9]:
df['Amount'] = df['Quantity']*df['UnitPrice']
df_m = df.groupby('CustomerID')['Amount'].sum()
df_m = df_m.reset_index()
df_m.head()

Unnamed: 0,CustomerID,Amount
0,12346.0,0.0
1,12347.0,4310.0
2,12348.0,1797.24
3,12349.0,1757.55
4,12350.0,334.4


In [10]:
df_f = df.groupby('CustomerID')['InvoiceNo'].count()
df_f = df_f.reset_index()
df_f.columns = ['CustomerID', 'Frequency']
df_f.head()

Unnamed: 0,CustomerID,Frequency
0,12346.0,2
1,12347.0,182
2,12348.0,31
3,12349.0,73
4,12350.0,17


In [11]:
df_retail = pd.merge(df_m, df_f, on='CustomerID', how='inner')
df_retail.head()

Unnamed: 0,CustomerID,Amount,Frequency
0,12346.0,0.0,2
1,12347.0,4310.0,182
2,12348.0,1797.24,31
3,12349.0,1757.55,73
4,12350.0,334.4,17


In [23]:
df['Invoice'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M')
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Amount,Invoice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,15.30,2010-12-01 08:26:00
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34,2010-12-01 08:26:00
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,22.00,2010-12-01 08:26:00
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34,2010-12-01 08:26:00
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34,2010-12-01 08:26:00
...,...,...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France,10.20,2011-12-09 12:50:00
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France,12.60,2011-12-09 12:50:00
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France,16.60,2011-12-09 12:50:00
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France,16.60,2011-12-09 12:50:00


In [27]:
max_date = max(df['InvoiceDate'])
max_date

'9/9/2011 9:52'

In [28]:
max_date = pd.to_datetime(max_date)  # Ensure max_date is a datetime object
df['Diff'] = max_date - df['Invoice']


In [29]:
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Amount,Invoice,Diff
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,15.3,2010-12-01 08:26:00,282 days 01:26:00
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34,2010-12-01 08:26:00,282 days 01:26:00


In [30]:
df_p = df.groupby('CustomerID')['Diff'].min()
df_p = df_p.reset_index()
df_p.head()

Unnamed: 0,CustomerID,Diff
0,12346.0,233 days 23:35:00
1,12347.0,-90 days +18:00:00
2,12348.0,-17 days +20:39:00
3,12349.0,-73 days +00:01:00
4,12350.0,218 days 17:51:00


In [32]:
df_p['Diff'] = df_p['Diff'].dt.days
df_p.head()

Unnamed: 0,CustomerID,Diff
0,12346.0,233
1,12347.0,-90
2,12348.0,-17
3,12349.0,-73
4,12350.0,218


In [33]:
df_retail = pd.merge(df_retail, df_p, on='CustomerID', how='inner')
df_retail.columns = ['CustomerID', 'Amount', 'Frequnecy', 'Recency']
df_retail.head()

Unnamed: 0,CustomerID,Amount,Frequnecy,Recency
0,12346.0,0.0,2,233
1,12347.0,4310.0,182,-90
2,12348.0,1797.24,31,-17
3,12349.0,1757.55,73,-73
4,12350.0,334.4,17,218


In [36]:
df_retail.shape

(4373, 4)

In [None]:
attributes = ['Amount', 'Frequency', 'Recency']
plt.rcParams['figure.figsize'] = [10, 8]
sns.boxplot(data=df_retail[attributes], orient='v', palette='set2', whis=1.5, saturation=1, width)