<div align="Center">

# JK Lakshmipat University
## Institute of Engineering and Technology
### Machine Learning (CS1138)
#### Project-I
#### RFM model-based Customer Segmentation using Clustering and Classification

</div>
<hr>

#### Importing the Libraries

In [None]:
from calendar import month_abbr
from datetime import datetime

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from scipy.cluster import hierarchy

<hr>

### Data Configuration

#### Importing the Data

In [None]:
# Parse the workbook once; passing a list of sheet names returns a dict of DataFrames.
sheets = pd.read_excel('online_retail_II.xlsx', sheet_name=['Year 2009-2010', 'Year 2010-2011'])
df1 = sheets['Year 2009-2010']
df2 = sheets['Year 2010-2011']

# ignore_index=True gives every row a unique label. Without it both sheets keep
# their own 0..n labels, and any later label-based operation (e.g. df.drop(index))
# would hit rows from BOTH sheets that happen to share a label.
df = pd.concat([df1, df2], ignore_index=True)

#### Initial Dataset

In [None]:
# Preview the first rows of the combined raw dataset.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw dataset.
df.info()

In [None]:
# Summary statistics of the raw dataset's numeric columns.
df.describe()

#### Imputing the Dataset

In [None]:
# Number of missing values per column; drives the imputation choices below.
df.isnull().sum()

Missing values are replaced with the following sentinel values:

- Description : Not Available
- Customer ID : -1

In [None]:
# Fill each column's missing entries with its sentinel value.
sentinels = {'Description': 'Not Available', 'Customer ID': -1}
for column, placeholder in sentinels.items():
    df[column] = df[column].fillna(placeholder)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

#### Feature Engineering

In [None]:
# Line-item revenue: unit price times quantity.
df['TotalPrice'] = df['Price'].mul(df['Quantity'])
# Customer IDs have no missing values after imputation, so they can be integers.
df['Customer ID'] = df['Customer ID'].astype(int)

In [None]:
# Cast the text columns in one call: categoricals for the repeated labels,
# plain strings for StockCode.
df = df.astype({
    'Country': 'category',
    'Description': 'category',
    'StockCode': str,
})

In [None]:
# Inspect rows with a negative unit price.
# NOTE(review): presumably these are the adjustment ('A'-prefixed) invoices flagged below — confirm from the output.
df[df['Price'] < 0]

In [None]:
# Work from one string view of the invoice number instead of re-casting three times.
invoiceText = df['Invoice'].astype(str)

# A 'C' in the invoice number marks a cancellation, an 'A' marks a bad-debt entry.
df['Cancelled'] = invoiceText.str.contains('C').astype(int)
df['Bad Debt'] = invoiceText.str.contains('A').astype(int)

# Strip the letter markers so the invoice number can be stored as an integer.
# NOTE: this cell is not idempotent — once the letters are stripped, re-running it
# would reset both flags to 0.
df['Invoice'] = invoiceText.str.replace('[A-Z]', '', regex=True).astype(int)

`StockCode` mixes plain numeric item codes with alphabetic codes that mark special situations, so it cannot be converted to a numeric type.

In [None]:
# Every run of letters found in each StockCode (one row per match).
letterRuns = df['StockCode'].str.extractall(r"([a-zA-Z]+)")
# Concatenate the runs that belong to the same original row label.
lettersPerRow = letterRuns.groupby(level=0).sum(numeric_only=False)[0]
# Number of distinct letter patterns.
len(lettersPerRow.unique())

#### Final Dataset

In [None]:
# Preview the dataset after imputation and feature engineering.
df.head()

In [None]:
# Column dtypes and non-null counts after the type conversions above.
df.info()

In [None]:
# Summary statistics of the numeric columns, now including the engineered ones.
df.describe()

#### Dataset with Valid Orders

In [None]:
# A row is NOT a valid order if any of these hold.
invalidOrder = (
    (df['Bad Debt'] == 1)                        # bad-debt adjustment entry
    | (df['Cancelled'] == 1)                     # cancelled invoice
    | (df['Description'] == 'Not Available')     # description was missing
    | (df['Customer ID'] == -1)                  # customer was missing
    | (df['Price'] == 0.0)                       # zero-priced line
    | ~(df['StockCode'].str.isdigit())           # special (non-numeric) stock code
)

# Select by boolean mask rather than df.drop(<index labels>): dropping by label
# removes EVERY row carrying that label, which silently discards valid rows
# whenever the index contains duplicate labels.
# The two flag columns are constant (0) in the result, so they are dropped.
df2 = df.loc[~invalidOrder].drop(columns=['Bad Debt', 'Cancelled'])

In [None]:
# Summary statistics of the valid-orders subset.
df2.describe()

<hr>

### Exploratory Data Analysis

#### Top Selling Products

In [None]:
# Number of line items per product description, most frequent first.
productCounts = df.groupby('Description', observed=True).size().sort_values(ascending=False)
plotDF = productCounts.rename('Count').reset_index()
# The placeholder used for missing descriptions is not a real product.
plotDF = plotDF[plotDF['Description'] != 'Not Available']

fig = px.bar(
    plotDF.head(20),
    x='Description',
    y='Count',
    title='Top Selling Products',
)
fig.show()

#### Most Expensive and Least Expensive Products

In [None]:
# One row per product, carrying its highest observed price, most expensive first.
# Built as a single chain so nothing mutates a column slice of df2 in place
# (the in-place version raises SettingWithCopyWarning).
plotDF = (
    df2[['Description', 'Price']]
    .sort_values(by='Price', ascending=False)
    .drop_duplicates(subset=['Description'], keep='first')
    .reset_index(drop=True)
)

In [None]:
# Price curve across all products, in the table's descending price order.
fig = px.line(
    plotDF,
    x='Description',
    y='Price',
    title='Prices of Offered Products',
)
# One tick label per product would be unreadable, so hide them.
fig.update_xaxes(showticklabels=False)
fig.show()

In [None]:
# The table is sorted by descending price, so the first 20 rows are the priciest.
fig = px.bar(
    plotDF.iloc[:20],
    x='Description',
    y='Price',
    title='Most Expensive Products',
)
fig.show()

In [None]:
# The last 20 rows of the descending-price table are the cheapest products.
fig = px.bar(
    plotDF.iloc[-20:],
    x='Description',
    y='Price',
    title='Least Expensive Products',
)
fig.show()

#### Average Number of Orders per Customer

In [None]:
# One row per (invoice, customer) pair, so each order is counted once.
uniqueOrders = df2[['Invoice', 'Customer ID']].drop_duplicates()
orderCounts = uniqueOrders.groupby(['Customer ID'], observed=True).count().reset_index()
# Rank customers by their number of orders, highest first.
ordersPerCustomer = orderCounts.sort_values(by='Invoice', ascending=False).reset_index(drop=True)
ordersPerCustomer.columns = ['Customer ID', 'Number of Orders']

In [None]:
# Mean number of distinct orders across all customers with valid orders.
averageOrders = ordersPerCustomer['Number of Orders'].mean()
print(f"Average Orders per Customer: {averageOrders}")

In [None]:
# Top 20 customers by order count; the positional index doubles as their rank.
plotDF = ordersPerCustomer.head(20)

fig = px.bar(
    plotDF,
    x=plotDF.index,
    y='Number of Orders',
    hover_data=['Customer ID'],
    title='Most Number of Orders per Customer',
)
fig.update_xaxes(title='Customer Rank')
fig.show()

#### Average Number of Unique Items per Customer and per Order

In [None]:
# Distinct products bought by each customer, highest count first.
customerItems = df2[['Customer ID', 'Description']].drop_duplicates()
customerItemCounts = customerItems.groupby(['Customer ID'], observed=True).count().reset_index()
itemsPerCustomer = customerItemCounts.sort_values(by='Description', ascending=False).reset_index(drop=True)
itemsPerCustomer.columns = ['Customer ID', 'Number of Items']

# Distinct products contained in each order, highest count first.
orderItems = df2[['Invoice', 'Description']].drop_duplicates()
orderItemCounts = orderItems.groupby(['Invoice'], observed=True).count().reset_index()
itemsPerOrder = orderItemCounts.sort_values(by='Description', ascending=False).reset_index(drop=True)
itemsPerOrder.columns = ['Invoice Number', 'Number of Items']

In [None]:
# Mean number of distinct products per customer and per order.
averageItemsPerCustomer = itemsPerCustomer['Number of Items'].mean()
averageItemsPerOrder = itemsPerOrder['Number of Items'].mean()
print(f"Average Number of Unique Items per Customer: {averageItemsPerCustomer}")
print(f"Average Number of Unique Items per Order: {averageItemsPerOrder}")

In [None]:
# Top 20 customers by number of distinct products; index position is the rank.
plotDF = itemsPerCustomer.head(20)

fig = px.bar(
    plotDF,
    x=plotDF.index,
    y='Number of Items',
    hover_data=['Customer ID'],
    title='Most Number of Unique Items per Customer',
)
fig.update_xaxes(title='Customer Rank')
fig.show()

In [None]:
# Top 20 orders by number of distinct products; index position is the rank.
plotDF = itemsPerOrder.head(20)

fig = px.bar(
    plotDF,
    x=plotDF.index,
    y='Number of Items',
    hover_data=['Invoice Number'],
    title='Most Number of Unique Items per Order',
)
fig.update_xaxes(title='Order Rank')
fig.show()

#### Top Countries by Number of Customers and Number of Orders

In [None]:
# Number of line items recorded per country, largest first.
countryCounts = df.groupby('Country', observed=True).size().sort_values(ascending=False)
plotDF = countryCounts.rename('Count').reset_index()

fig = px.bar(
    plotDF,
    x='Country',
    y='Count',
    title='Sales per Country',
)
fig.show()

In [None]:
# Distinct (country, customer) pairs. Customer ID -1 is the sentinel for a missing
# ID, not a real customer, so it is excluded; otherwise every country with any
# anonymous row would have its customer count inflated by one.
# Countries that only have anonymous rows therefore do not appear in the chart.
knownCustomers = df.loc[df['Customer ID'] != -1, ['Country', 'Customer ID']].drop_duplicates()
plotDF = (
    knownCustomers
    .groupby(['Country'], observed=True)['Customer ID']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
plotDF.columns = ['Country', 'Count']

fig = px.bar(plotDF, x='Country', y='Count', title='Sales by Country per Customer')
fig.show()

In [None]:
# Distinct (country, invoice) pairs, so each invoice counts once per country.
countryInvoices = df[['Country','Invoice']].drop_duplicates()
invoiceCounts = countryInvoices.groupby(['Country'], observed=True)['Invoice'].count()
plotDF = invoiceCounts.sort_values(ascending=False).reset_index()
plotDF.columns = ['Country', 'Count']

fig = px.bar(
    plotDF,
    x='Country',
    y='Count',
    title='Sales by Country per Order',
)
fig.show()

#### Total Sales per Month

In [None]:
# Build a (month label, total revenue) table from the valid orders.
plotDF = df2[['TotalPrice', 'InvoiceDate']]
# Group by calendar year and month of the invoice date and sum the revenue.
plotDF = plotDF.groupby([plotDF['InvoiceDate'].dt.year, plotDF['InvoiceDate'].dt.month], observed=True).agg({'TotalPrice': 'sum'})
# Both group keys come from 'InvoiceDate', so give them explicit, distinct column names.
plotDF.reset_index(names=['Year', 'Month'], inplace=True)
# Human-readable label such as "Dec 2009"; month_abbr maps 1..12 to abbreviated month names.
plotDF['Month-Year'] = pd.Series(month_abbr[i] for i in plotDF['Month']) + ' ' + plotDF['Year'].astype(str)
plotDF.drop(['Year', 'Month'], axis=1, inplace=True)
# Remaining columns are, in order, TotalPrice and Month-Year; rename them by position.
plotDF.columns = ['Total Sales', 'Month & Year']

In [None]:
# Revenue trend over the months in the table.
fig = px.line(
    plotDF,
    x='Month & Year',
    y='Total Sales',
    title='Monthly Sales',
)
fig.show()

#### Cancelled Items Analysis

In [None]:
# Build a per-month table of cancellations: their count and their total value.
plotDF = df[df['Cancelled'] == 1][['TotalPrice', 'InvoiceDate']]
# Two aggregations on one column produce two-level column labels
# (('TotalPrice', 'sum'), ('TotalPrice', 'count')); they are flattened by position further down.
plotDF = plotDF.groupby([plotDF['InvoiceDate'].dt.year, plotDF['InvoiceDate'].dt.month], observed=True).agg({'TotalPrice': ['sum', 'count']})
plotDF.reset_index(names=['Year', 'Month'], inplace=True)
# Human-readable label such as "Dec 2009"; month_abbr maps 1..12 to abbreviated month names.
plotDF['Month-Year'] = pd.Series(month_abbr[i] for i in plotDF['Month']) + ' ' + plotDF['Year'].astype(str)
# Flatten the column labels by position: Year, Month, sum, count, label.
plotDF.columns = ['Year', 'Month', 'Cancellation Amount', 'Total Cancellations', 'Month & Year']
plotDF.drop(['Year', 'Month'], axis=1, inplace=True)
# Flip the sign of the summed amount.
# NOTE(review): presumably cancelled lines carry negative quantities, making the sum negative — confirm.
plotDF['Cancellation Amount'] = - plotDF['Cancellation Amount']

In [None]:
# Cancellation count per month; the cancelled amount is shown on hover.
fig = px.line(
    plotDF,
    x='Month & Year',
    y='Total Cancellations',
    title='Monthly Cancellations',
    hover_data=['Cancellation Amount'],
)
fig.show()

<hr>

## Machine Learning

#### Splitting Data into Train, Validation and Test Sets

In [None]:
# Shuffle all rows reproducibly (frac=1 returns every row in random order).
# NOTE(review): train_test_split shuffles by default as well, so this step may be redundant — confirm.
dfShuffled = df.sample(frac=1, random_state=42)

In [None]:
# Hold out 20% of the rows as the test set.
dfTrain, dfTest = train_test_split(dfShuffled, test_size=0.2, random_state=1)

# Split the remaining 80% again: 20% of it becomes the validation set,
# giving roughly 64% train / 16% validation / 20% test overall.
dfTrain, dfValidate = train_test_split(dfTrain, test_size=0.2, random_state=1)

<hr>

### RFM Analysis

#### Creating the RFM Table

In [None]:
# Reference date for recency: the calendar date of the latest valid invoice.
latestInvoice = df2['InvoiceDate'].max()
today = latestInvoice.date()
print(f"The Last Date in the Dataset: {today.strftime('%d %B %Y')}")

In [None]:
# Kept for compatibility: df2 gains a plain calendar-date column.
df2['Date'] = df2['InvoiceDate'].dt.date

# Latest purchase per customer, truncated to midnight so only the calendar day counts.
lastPurchase = df2.groupby(['Customer ID'], observed=True)['InvoiceDate'].max().dt.normalize()
recencyDF = lastPurchase.rename('Last Purchase Date').reset_index()

# Recency = whole days between the reference date and the last purchase.
# Computed on datetime64 values with .dt.days instead of subtracting object-dtype
# Python dates and dividing by a hard-coded nanoseconds-per-day constant.
recencyDF['Recency'] = (pd.Timestamp(today) - recencyDF['Last Purchase Date']).dt.days

In [None]:
# Frequency = number of distinct invoices per customer.
customerInvoices = df2[['Customer ID', 'Invoice']].drop_duplicates()
invoiceCounts = customerInvoices.groupby(['Customer ID'], observed=True)['Invoice'].count()
frequencyDF = invoiceCounts.rename('Frequency').reset_index()

In [None]:
# Monetary = total revenue per customer over all valid line items.
customerRevenue = df2.groupby(['Customer ID'], observed=True)['TotalPrice'].sum()
monetaryDF = customerRevenue.rename('Monetary').reset_index()

In [None]:
# Join the three per-customer tables into one RFM table indexed by customer.
rfmDF = (
    recencyDF
    .merge(frequencyDF, on='Customer ID')
    .merge(monetaryDF, on='Customer ID')
    .drop('Last Purchase Date', axis=1)   # only needed to derive Recency
    .set_index('Customer ID')
)

rfmDF.head()

#### Quantile Distribution

In [None]:
# One box plot per RFM metric, side by side, each on its own y-axis.
fig = make_subplots(rows=1, cols=3)
for position, metric in enumerate(['Recency', 'Frequency', 'Monetary'], start=1):
    fig.add_trace(go.Box(y=rfmDF[metric].values, name=metric), row=1, col=position)
fig.update_layout(title_text="Box Plot of RFM Values")
fig.show()

In [None]:
def rScore(value: np.floating, category: str, quantiles: pd.DataFrame):
    """Score a recency value from 4 (lowest quartile) down to 1 (highest quartile).

    Lower recency means a more recent purchase, so smaller values score higher.

    Args:
        value: The recency value to score.
        category: Column of `quantiles` holding the cut-offs for this metric.
        quantiles: Frame indexed by 0.25, 0.50 and 0.75 with one column per metric.

    Returns:
        4 if value <= Q1, 3 if value <= Q2, 2 if value <= Q3, otherwise 1.
    """
    cutoffs = quantiles[category]
    # Walk the quartile boundaries upward; the first one the value fits under decides the score.
    for score, level in zip((4, 3, 2), (0.25, 0.50, 0.75)):
        if value <= cutoffs[level]:
            return score
    return 1

def fmScore(value: np.floating, category: str, quantiles: pd.DataFrame):
    """Score a frequency or monetary value from 1 (lowest quartile) up to 4 (highest quartile).

    Args:
        value: The frequency or monetary value to score.
        category: Column of `quantiles` holding the cut-offs for this metric.
        quantiles: Frame indexed by 0.25, 0.50 and 0.75 with one column per metric.

    Returns:
        1 if value <= Q1, 2 if value <= Q2, 3 if value <= Q3, otherwise 4.
    """
    cutoffs = quantiles[category]
    # Walk the quartile boundaries upward; the first one the value fits under decides the score.
    for score, level in zip((1, 2, 3), (0.25, 0.50, 0.75)):
        if value <= cutoffs[level]:
            return score
    return 4

In [None]:
# Quartile boundaries (25%, 50%, 75%) of each RFM metric; the result is indexed
# by the quantile level, which is how the scoring functions look the cut-offs up.
rfmQuantiles = rfmDF[['Recency', 'Frequency', 'Monetary']].quantile(q=[0.25, 0.50, 0.75])
rfmQuantiles

In [None]:
# Score every customer on each metric, working on a copy so rfmDF stays untouched.
rfmSegmentation = rfmDF.copy()
scoringPlan = [
    ('R', 'Recency', rScore),     # lower recency scores higher
    ('F', 'Frequency', fmScore),  # higher frequency scores higher
    ('M', 'Monetary', fmScore),   # higher spend scores higher
]
for scoreColumn, metric, scorer in scoringPlan:
    rfmSegmentation[scoreColumn] = rfmSegmentation[metric].apply(scorer, args=(metric, rfmQuantiles))

# Concatenate the three digits into a single code such as '444'.
rfmSegmentation['RFM Score'] = (
    rfmSegmentation['R'].astype(str)
    + rfmSegmentation['F'].astype(str)
    + rfmSegmentation['M'].astype(str)
)

rfmSegmentation.head()

#### Estimated Segmentation

In [None]:
def rfmClusters(data: pd.DataFrame, rCol: str, fCol: str, mCol: str, rfmCol: str):
    """Split scored RFM data into named customer segments.

    The segments are not mutually exclusive: for example, a '444' customer also
    appears among the newest/most loyal customers, frequent buyers and big spenders.

    Args:
        data: Frame holding the individual scores and the combined score string.
        rCol: Name of the recency score column.
        fCol: Name of the frequency score column.
        mCol: Name of the monetary score column.
        rfmCol: Name of the combined RFM score column (e.g. '444').

    Returns:
        Dict mapping each segment name to the matching subset of `data`.
        'Others' holds the rows that fall into none of the named segments.
    """
    isBest = data[rfmCol] == '444'
    isWorst = data[rfmCol] == '111'
    isLoyal = data[rCol] == 4
    isFrequent = data[fCol] == 4
    isBigSpender = data[mCol] == 4

    # A row belongs to 'Others' only when every segment condition is false.
    isOther = ~(isBest | isWorst | isLoyal | isFrequent | isBigSpender)

    masks = {
        'Best Customers': isBest,
        'Worst Customers': isWorst,
        'Newest/Most Loyal Customers': isLoyal,
        'Frequent Buyers': isFrequent,
        'Big Spendors': isBigSpender,
        'Others': isOther,
    }
    return {label: data[mask] for label, mask in masks.items()}

In [None]:
# Segment the scored customers and report how many fall into each segment.
segments = rfmClusters(rfmSegmentation, 'R', 'F', 'M', 'RFM Score')

for label, members in segments.items():
    print(f"{label}: {members.shape[0]}")

In [None]:
# Relative size of each segment. The segments overlap, so a customer can
# contribute to more than one slice.
segmentSizes = [members.shape[0] for members in segments.values()]
fig = px.pie(
    names=segments.keys(),
    values=segmentSizes,
    title='RFM Estimated Segmentation',
)
fig.show()

#### Standardization

In [None]:
# Standardise each RFM metric with StandardScaler, keeping the column names
# and the customer index of rfmDF.
rfmColumns = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
rfmScaled = pd.DataFrame(
    scaler.fit_transform(rfmDF[rfmColumns]),
    columns=rfmColumns,
    index=rfmDF.index,
)
rfmScaled.head()

#### Normalization

In [None]:
# Pairwise correlation of the standardised RFM metrics as a heat map.
scaledCorrelation = rfmScaled.corr()
fig = px.imshow(scaledCorrelation, title='Correlation Matrix of RFM Values')
fig.show()

In [None]:
# Pairwise scatter plots with histograms on the diagonal, before the log transform.
fig = ff.create_scatterplotmatrix(
    rfmScaled,
    diag='histogram',
    height=500,
    width=1200,
    title='Scatter Matrix before Normalization',
)
fig.show()

In [None]:
# Log-transform each RFM metric.
# Recency can be 0 (a purchase on the reference date), so a small offset keeps log() finite.
logRecency = np.log(rfmDF['Recency']+0.01)
logFrequency = np.log(rfmDF['Frequency'])
logMonetary = np.log(rfmDF['Monetary'])

rfmNorm = pd.DataFrame({
    'Log Recency': logRecency,
    'Log Frequency': logFrequency,
    'Log Monetary': logMonetary,
})
rfmNorm.head()

In [None]:
# Pairwise correlation of the log-transformed RFM metrics as a heat map.
normCorrelation = rfmNorm.corr()
fig = px.imshow(normCorrelation, title='Correlation Matrix of Normalized RFM Values')
fig.show()

In [None]:
# Pairwise scatter plots with histograms on the diagonal, after the log transform.
fig = ff.create_scatterplotmatrix(
    rfmNorm,
    diag='histogram',
    height=500,
    width=1200,
    title='Scatter Matrix after Normalization',
)
fig.show()

<hr>

### k-Means Clustering

#### Selecting $k$

In [None]:
# Fit k-means for k = 2..10 on the log-transformed RFM values and record, per k,
# the fitted model, the average inertia per sample and the silhouette score.
dataAsMatrix = rfmNorm.to_numpy()
kRange = list(range(2, 11))
kData = []

sampleCount = dataAsMatrix.shape[0]
for clusterCount in kRange:
    model = KMeans(n_clusters=clusterCount, random_state=42)
    model.fit(dataAsMatrix)
    distortion = model.inertia_/sampleCount
    silhouette = silhouette_score(dataAsMatrix, model.labels_)
    # Stored as (model, distortion, silhouette); later cells index this tuple by position.
    kData.append((model, distortion, silhouette))

In [None]:
# Entries of kData are (model, distortion, silhouette) tuples.
distortions = [entry[1] for entry in kData]
silhouettes = [entry[2] for entry in kData]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Elbow Curve', 'Silhouette Scores'],
    x_title='Number of Clusters',
)
fig.add_trace(go.Scatter(x=kRange, y=distortions, name='Elbow Curve'), row=1, col=1)
fig.add_trace(go.Scatter(x=kRange, y=silhouettes, name='Silhouette Scores'), row=1, col=2)
fig.update_layout(
    title_text='Evaluating the Number of Clusters for k-Means Clustering',
    yaxis_title='Distortion',
    yaxis2_title='Silhouette Score',
)
fig.show()

#### Applying the Model

In [None]:
# kData[0] is the entry for the first value in kRange, i.e. k = 2.
# Each entry is a (fitted model, distortion, silhouette) tuple.
model = kData[0] # 2 Clusters
# Store labels as strings so the plots treat the cluster as a category, not a number.
rfmDF['Cluster kMC'] = model[0].labels_.astype(str)
rfmDF.head()

In [None]:
# 3-D view of the customers in raw RFM space, coloured and marked by k-means cluster.
fig = px.scatter_3d(
    rfmDF,
    x='Recency',
    y='Frequency',
    z='Monetary',
    color='Cluster kMC',
    symbol='Cluster kMC',
    opacity=0.5,
    title='RFM-based k-Means Clusters',
)
fig.show()

In [None]:
# Distribution of each RFM metric within every k-means cluster, one panel per metric.
fig = make_subplots(rows=1, cols=3, x_title='Clusters')
for position, metric in enumerate(['Recency', 'Frequency', 'Monetary'], start=1):
    fig.add_trace(go.Box(x=rfmDF['Cluster kMC'], y=rfmDF[metric], name=metric), row=1, col=position)
fig.update_layout(
    title_text='Clusters as per Recency, Frequency and Monetary Values',
    yaxis_title='Recency',
    yaxis2_title='Frequency',
    yaxis3_title='Monetary',
)
fig.show()

<hr>

### Hierarchical Clustering

#### Average Linkage

In [None]:
# Agglomerative clustering on the standardised RFM values using average linkage
# and Euclidean distance; the result is the linkage matrix cut into clusters below.
mergings = hierarchy.linkage(rfmScaled, method='average', metric='euclidean')

In [None]:
# # Only Use when Necessary
# fig = ff.create_dendrogram(mergings)
# fig.show('browser')

#### Creating Clusters

In [None]:
# Cut the dendrogram into 10 clusters; cut_tree returns one column per cut,
# so flatten it to a 1-D label array.
clusterLabels = hierarchy.cut_tree(mergings, n_clusters=10).reshape(-1)
# Store labels as strings so the plots treat the cluster as a category, not a number.
rfmDF['Cluster HC'] = clusterLabels.astype(str)
rfmDF.head()

In [None]:
# 3-D view of the customers in raw RFM space, coloured and marked by hierarchical cluster.
# The title previously read 'RFM-based k-Means Clusters' (copied from the k-means
# section) although this figure shows the 'Cluster HC' labels.
fig = px.scatter_3d(
    rfmDF,
    x='Recency',
    y='Frequency',
    z='Monetary',
    color='Cluster HC',
    symbol='Cluster HC',
    opacity=0.5,
    title='RFM-based Hierarchical Clusters',
)
fig.show()

In [None]:
# Distribution of each RFM metric within every hierarchical cluster, one panel per metric.
fig = make_subplots(rows=1, cols=3, x_title='Clusters')
for position, metric in enumerate(['Recency', 'Frequency', 'Monetary'], start=1):
    fig.add_trace(go.Box(x=rfmDF['Cluster HC'], y=rfmDF[metric], name=metric), row=1, col=position)
fig.update_layout(
    title_text='Clusters as per Recency, Frequency and Monetary Values',
    yaxis_title='Recency',
    yaxis2_title='Frequency',
    yaxis3_title='Monetary',
)
fig.show()

<hr>

### k-NN Classification

<hr>

### Logistic Regression

<hr>

### BG/NBD CLV Modelling
Beta-Geometric/Negative Binomial Distribution Customer Lifetime Value Modelling

<hr>

### Gamma-Gamma Modelling

<hr>

## Conclusion

<hr>