# 1. Imports

In [None]:
import numpy as np
import pandas as pd 
import plotly.express as px
import datetime as dt
import plotly.graph_objects as go
import re

# Plotly settings
import plotly.io as pio
# Make 'notebook' the default Plotly renderer for this session.
pio.renderers.default='notebook'

# 2. Load data

In [None]:
# Read the processed retail extract.
# Invoice and StockCode are identifiers, not numbers (refund invoices carry a
# "C"). Left to type inference, a large CSV can come back with a mix of int
# and str values in such columns, which splits value_counts()/nunique()
# results for the same identifier. Pinning them to str avoids that.
df = pd.read_csv(
    "../data/processed/online_retail_II_2009.csv",
    dtype={"Invoice": str, "StockCode": str},
)

In [None]:
# Preview the first rows, then print column dtypes and non-null counts.
display(df.head(3))
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() -- that would render a stray "None" under the report.
df.info()

# 3. Clean data

In [None]:
# Count missing values per column, largest count first
missing_per_column = df.isnull().sum()
print(missing_per_column.sort_values(ascending=False))

In [None]:
# Drop every row that has a missing value in any column
df = df.dropna(axis=0, how="any")

In [None]:
# Summary statistics, including extra lower- and upper-tail percentiles
summary_percentiles = [0.01, 0.05, 0.10, 0.20, 0.90, 0.95, 0.99]
df.describe(percentiles=summary_percentiles).T

Negative values are refunds; refunded invoices contain a "C" in the invoice ID.

In [None]:
# Drop refund rows: their invoice ID contains a "C"
# (rows whose Invoice is not a string are treated as non-refunds via na=False)
is_refund = df["Invoice"].str.contains("C", na=False)
df = df[~is_refund]

# Re-check the summary statistics now that refunds are gone
df.describe(percentiles=[0.01, 0.05, 0.10, 0.20, 0.90, 0.95, 0.99]).T

In [None]:
# Keep only rows with a strictly positive price.
# .copy() makes df an independent frame: the cells that follow assign columns
# into it, and writing into a boolean-filtered slice raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df["Price"] > 0].copy()

In [None]:
# Relabel the country "EIRE" as "Ireland"
is_eire = df["Country"].eq("EIRE")
df.loc[is_eire, "Country"] = "Ireland"

In [None]:
# Parse InvoiceDate into a real datetime64 column.
# The column is replaced rather than written through df.loc[:, col]: on
# pandas >= 2.0 a .loc[:, col] assignment sets values in place, so the parsed
# timestamps would be stored inside the existing object-dtype column and the
# dtype would never become datetime64. assign() also returns a new frame, so
# no write happens on a filtered slice.
df = df.assign(InvoiceDate=pd.to_datetime(df["InvoiceDate"]))

In [None]:
# Revenue per row = quantity x unit price, rounded to 2 decimal places
line_revenue = df['Quantity'] * df['Price']
df.loc[:, 'Revenue'] = line_revenue.round(2)

# 

# 4. EDA

### 4.1 Distribution of Quantity & Price

In [None]:
# Horizontal boxplots of Quantity and Price on one log-scaled x axis
box_traces = [
    go.Box(x=df[col], name=col, orientation='h')
    for col in ['Quantity', 'Price']
]
fig = go.Figure(data=box_traces)

# Figure size, title and log axis
fig.update_layout(
    title='Boxplot of Quantity and Price',
    width=1000,
    xaxis_type='log',
    xaxis_title='Value (log scale)'
)

fig.show()

### 4.2 Top-N analysis 

In [None]:
# Bar chart of the three countries with the most rows in df
country_counts = df['Country'].value_counts().head(3)

fig = go.Figure()
fig.add_trace(go.Bar(x=country_counts.index, y=country_counts.values))

# Titles and figure width
fig.update_layout(
    title='Top 3 Countries by Number of Transactions',
    xaxis_title='Country',
    yaxis_title='Count',
    width=500
)

fig.show()

In [None]:
# Bar chart of the top-selling products
# Top 3 stock codes by number of rows in df
product_counts = df['StockCode'].value_counts().head(3)

top_products = pd.DataFrame({
    'StockCode': product_counts.index,
    'Count': product_counts.values
})

# Attach exactly one description per stock code.
# A stock code may appear with more than one description text; merging against
# every distinct (StockCode, Description) pair would then produce one row --
# and one overlapping bar -- per variant. Use the most frequent description of
# each code instead (empty string if a code has no description at all).
top_rows = df[df['StockCode'].isin(product_counts.index)]
description_by_code = (
    top_rows.groupby('StockCode')['Description']
    .agg(lambda names: names.mode().iat[0] if names.notna().any() else '')
)
top_products['Description'] = top_products['StockCode'].map(description_by_code)

fig = go.Figure(data=[
    go.Bar(
        x=top_products['StockCode'],
        y=top_products['Count'],
        hovertext=top_products['Description'],
        hoverinfo='text+y'
    )
])

# Update layout
fig.update_layout(
    width=500,
    title='Top 3 Products by Number of Transactions<br><sup>Hover over bars to see product descriptions</sup>',
    xaxis_title='Stock Code',
    yaxis_title='Count'
)

fig.show()

# 5. Customer Segmentation - RFM analysis

In [None]:
# Reference date for recency: one day after the latest invoice in the data
last_invoice_date = df['InvoiceDate'].max()
snapshot_date = last_invoice_date + pd.DateOffset(days=1)

In [None]:
# Collapse the transactions to one row per customer:
#   Recency   - whole days between snapshot_date and the customer's last invoice
#   Frequency - number of distinct invoices
#   Monetary  - total revenue
rfm = (
    df.groupby('Customer ID')
    .agg(
        Recency=('InvoiceDate', lambda dates: (snapshot_date - dates.max()).days),
        Frequency=('Invoice', 'nunique'),
        Monetary=('Revenue', 'sum'),
    )
    .reset_index()
    .rename(columns={'Customer ID': 'CustomerID'})
)

In [None]:
# Assign quintile-based scores (1-5). Recency is scored in reverse: the
# lowest-recency quintile (most recent buyers) gets 5.
ascending_labels = [1, 2, 3, 4, 5]
descending_labels = ascending_labels[::-1]

rfm['RecencyScore'] = pd.qcut(rfm['Recency'], q=5, labels=descending_labels)
# Frequency is binned on its rank (ties broken by order of appearance) rather
# than on the raw value, so every customer has a distinct value to cut on.
frequency_rank = rfm['Frequency'].rank(method="first")
rfm['FrequencyScore'] = pd.qcut(frequency_rank, q=5, labels=ascending_labels)
rfm['MonetaryScore'] = pd.qcut(rfm['Monetary'], q=5, labels=ascending_labels)

# Concatenate the three score digits into one code, in R-F-M order
recency_digit = rfm['RecencyScore'].astype(str)
frequency_digit = rfm['FrequencyScore'].astype(str)
monetary_digit = rfm['MonetaryScore'].astype(str)
rfm['RFM_Score'] = recency_digit + frequency_digit + monetary_digit

rfm.head(3)

In [None]:
# Segment lookup keyed by a regex over the two-digit code
# "<recency score><frequency score>". Entries keep their original order.
seg_map = dict([
    # recency score 1-2
    (r'[1-2][1-2]', 'Hibernating'),
    (r'[1-2][3-4]', 'At Risk'),
    (r'[1-2]5', "Can't Lose"),
    # recency score 3 (and 4 for the loyal band)
    (r'3[1-2]', 'About to Sleep'),
    (r'33', 'Need Attention'),
    (r'[3-4][4-5]', 'Loyal Customers'),
    # recency score 4-5
    (r'41', 'Promising'),
    (r'51', 'New Customers'),
    (r'[4-5][2-3]', 'Potential Loyalists'),
    (r'5[4-5]', 'Champions'),
])

In [None]:
# Make sure the combined score is a string before slicing it
rfm['RFM_Score'] = rfm['RFM_Score'].astype(str)

# Segmentation uses only the leading two digits (Recency, then Frequency)
rfm['RF_Frequency_Recency'] = rfm['RFM_Score'].str.slice(0, 2)

# Map a Recency+Frequency code to its segment label via the seg_map regexes
def assign_segment(rf_code):
    """Return the segment label for a two-digit Recency+Frequency code.

    The patterns of ``seg_map`` are tried in insertion order with
    ``re.match`` (anchored at the start of ``rf_code``); the label of the
    first pattern that matches is returned, or ``'Other'`` if none does.
    """
    matching_labels = (
        label
        for pattern, label in seg_map.items()
        if re.match(pattern, rf_code)
    )
    return next(matching_labels, 'Other')

# Apply segmentation
rfm['Segment'] = rfm['RF_Frequency_Recency'].apply(assign_segment)

# Segments in business-priority order.
# Each label must match a seg_map value exactly: the label assigned above is
# "Can't Lose", so the former spelling "Can't Loose" would never match a
# customer's segment when this list is used for sorting or as categories.
segment_order = [
    'Champions', 'Loyal Customers', 'Potential Loyalists',
    'New Customers', 'Promising', 'Need Attention',
    'About to Sleep', 'At Risk', "Can't Lose",
    'Hibernating'
]


In [None]:
# Visualization 1: interactive treemap of customers per segment
segment_sizes = (
    rfm.groupby('Segment', observed=False)
    .size()
    .reset_index(name='Count')
)

fig = px.treemap(
    segment_sizes,
    path=['Segment'],
    values='Count',
    title='Customer Segmentation Distribution | RFM Analysis',
    width=800,
    height=800,
)
# Show label, absolute count and share of the parent on every tile
fig.update_traces(textinfo="label+value+percent parent")
fig.show()

In [None]:
# Customers in the 'Need Attention' / 'At Risk' segments whose total spend is
# above the 75th percentile of Monetary
monetary_q3 = rfm['Monetary'].quantile(0.75)
in_watch_segment = rfm['Segment'].isin(['Need Attention', 'At Risk'])
is_high_value = rfm['Monetary'] > monetary_q3
high_value_risk = rfm[in_watch_segment & is_high_value]

report_columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary', 'Segment']
print(f"High-value customers needing attention: {len(high_value_risk)}")
print(high_value_risk[report_columns].head())

In [None]:
# Per-segment mean / median / count / std of the three RFM measures
rfm_measures = rfm[["Segment","Recency","Frequency", "Monetary"]]
rfmStats = rfm_measures.groupby("Segment").agg(["mean","median","count", "std"])
# Flatten the (measure, statistic) column pairs into "measure_statistic" names
rfmStats.columns = ['_'.join(pair).strip('|') for pair in rfmStats.columns]
rfmStats