In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt

# Load the raw transaction data from the CSV export.
df = pd.read_csv('online_retail.csv')

# Data Cleaning and Preparation
# Discard rows with a missing value in any column, then exact duplicate rows.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
# Parse invoice timestamps so date arithmetic (e.g. recency) works downstream.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# Keep only rows with a positive quantity (presumably this excludes
# returns/cancellations -- confirm against the dataset). The explicit .copy()
# makes the filtered frame independent of the frame it was selected from, so
# adding a column below is a plain assignment rather than a write to a slice,
# which can raise pandas' SettingWithCopyWarning.
df = df[df['Quantity'] > 0].copy()
# Value of each row: Quantity multiplied by UnitPrice.
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Build the per-customer RFM (Recency, Frequency, Monetary) table.
# Recency is measured against the day after the latest invoice in the data,
# so every customer ends up with a recency of at least one day.
reference_date = df['InvoiceDate'].max() + dt.timedelta(days=1)
customer_groups = df.groupby('CustomerID')

# Recency: whole days between each customer's latest invoice and the reference date.
recency_df = customer_groups['InvoiceDate'].max().reset_index()
recency_df['Recency'] = (reference_date - recency_df['InvoiceDate']).dt.days

# Frequency: number of distinct invoice numbers per customer.
frequency_df = customer_groups['InvoiceNo'].nunique().reset_index(name='Frequency')

# Monetary: sum of TotalPrice per customer.
monetary_df = customer_groups['TotalPrice'].sum().reset_index(name='Monetary')

# Join the three metrics into one frame keyed on CustomerID (default inner join).
rfm_df = pd.merge(recency_df, frequency_df, on='CustomerID')
rfm_df = pd.merge(rfm_df, monetary_df, on='CustomerID')

# Plot RFM distributions: one histogram per metric, side by side, saved as a PNG.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# One entry per panel: (rfm_df column, title, x-axis label, bar colour, log y-axis?)
panel_specs = [
    ('Recency', 'Recency Distribution', 'Days Since Last Purchase', 'skyblue', False),
    ('Frequency', 'Frequency Distribution', 'Number of Purchases', 'lightgreen', True),
    ('Monetary', 'Monetary Distribution', 'Total Spend', 'salmon', True),
]

for panel, (column, title, x_label, bar_color, log_counts) in zip(axes, panel_specs):
    panel.hist(rfm_df[column], bins=50, color=bar_color, edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    if log_counts:
        # Log-scaled counts; draw grid lines on both major and minor ticks.
        panel.set_yscale('log')
        panel.grid(True, which='both', linestyle='--', linewidth=0.5)
    else:
        panel.grid(True, linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.savefig('rfm_distributions.png')