#  RFM Analysis and Feature Engineering

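This notebook builds per-customer RFM (Recency, Frequency, Monetary) features from the Online Retail transaction data, adds derived features and quintile-based scores, assigns each customer to a segment, and saves the resulting feature table and summary plots.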

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# later in the notebook (pandas chained-assignment warnings, deprecations,
# numeric RuntimeWarnings) is hidden. Consider narrowing it to specific
# categories once the notebook runs cleanly.
warnings.filterwarnings('ignore')

# Simple confirmation that the setup cell ran.
print("Libraries imported successfully!")

In [None]:
# Load and preprocess data
# Reads the raw transaction export and derives the cleaned frame used by every
# later cell (retail_data_clean).
file_path = r"../Data/OnlineRetail.csv"
retail_data = pd.read_csv(file_path, encoding='ISO-8859-1', parse_dates=['InvoiceDate'])

print(f"Dataset Shape: {retail_data.shape}")
print("\nFirst 5 rows:")
print(retail_data.head())

# Basic preprocessing
# Line-level revenue: quantity times unit price.
retail_data['TotalAmount'] = retail_data['Quantity'] * retail_data['UnitPrice']

# Keep only rows that can be attributed to a customer and have a positive price.
# The selection is copied explicitly so the CustomerID cast below writes to an
# independent frame instead of a derived slice of retail_data (the pattern that
# triggers pandas' chained-assignment warning).
has_customer = retail_data['CustomerID'].notna()
positive_price = retail_data['UnitPrice'] > 0
retail_data_clean = retail_data.loc[has_customer & positive_price].copy()
retail_data_clean['CustomerID'] = retail_data_clean['CustomerID'].astype(int)

# NOTE(review): Quantity is not filtered here, so any negative-quantity rows
# (e.g. returns/cancellations, if the data contains them) flow into TotalAmount
# and into the invoice counts computed later - confirm this is intended.

print(f"\nCleaned dataset shape: {retail_data_clean.shape}")

In [None]:
# Create RFM features
# Recency is measured against the day after the latest invoice in the data, so
# the most recent buyer has a recency of at least 1 day rather than 0.
reference_date = retail_data_clean['InvoiceDate'].max() + timedelta(days=1)
print(f"Reference date: {reference_date}")


def _days_since_last_purchase(invoice_dates):
    """Whole days between the reference date and a customer's latest invoice."""
    return (reference_date - invoice_dates.max()).days


# Basic RFM calculation: one row per customer, indexed by CustomerID.
rfm = retail_data_clean.groupby('CustomerID').agg(
    Recency=('InvoiceDate', _days_since_last_purchase),  # days since last invoice
    Frequency=('InvoiceNo', 'nunique'),                  # distinct invoices
    Monetary=('TotalAmount', 'sum'),                     # total amount
)

print(f"RFM base features created: {rfm.shape}")
print(rfm.head())

In [None]:
# Add additional features
# Each feature is computed per customer and merged onto rfm by CustomerID.

# Make the cell safe to re-run: if a previous execution already merged these
# columns into rfm, merging again would create suffixed duplicates
# (Tenure_x / Tenure_y, ...). Dropping them first is a no-op on the first run.
derived_columns = ['Tenure', 'AvgOrderValue', 'ProductDiversity']
rfm = rfm.drop(columns=derived_columns, errors='ignore')

# Customer tenure: days between a customer's first and last invoice.
tenure = retail_data_clean.groupby('CustomerID')['InvoiceDate'].agg(['min', 'max']).reset_index()
tenure['Tenure'] = (tenure['max'] - tenure['min']).dt.days
rfm = rfm.merge(tenure[['CustomerID', 'Tenure']], on='CustomerID')

# Average Order Value: total per invoice first, then the mean of those totals
# per customer (not the mean of individual line amounts).
total_by_invoice = retail_data_clean.groupby(['CustomerID', 'InvoiceNo'])['TotalAmount'].sum().reset_index()
avg_order_value = total_by_invoice.groupby('CustomerID')['TotalAmount'].mean().reset_index()
avg_order_value.columns = ['CustomerID', 'AvgOrderValue']
rfm = rfm.merge(avg_order_value, on='CustomerID')

# Product diversity: number of distinct stock codes a customer has bought.
product_diversity = retail_data_clean.groupby('CustomerID')['StockCode'].nunique().reset_index()
product_diversity.columns = ['CustomerID', 'ProductDiversity']
rfm = rfm.merge(product_diversity, on='CustomerID')

print(f"Enhanced RFM features: {rfm.shape}")
print(rfm.head())

In [None]:
# Create RFM scores (fixed version)
# Each of R, F and M is mapped to a 1-5 quintile score; 5 is always "best".
N_SCORE_BINS = 5

# Recency: fewer days since the last purchase is better, so the lowest-recency
# bin receives the highest score. Bin codes are requested (labels=False) instead
# of passing a fixed five-item label list: with duplicates='drop' the number of
# bins can shrink when quantile edges coincide, and a fixed-length label list
# then makes qcut raise. With five distinct edges this yields exactly 5..1.
recency_bin = pd.qcut(rfm['Recency'], N_SCORE_BINS, labels=False, duplicates='drop')
rfm['R_Score'] = N_SCORE_BINS - recency_bin

# Frequency / Monetary: rank first (ties broken by row order) so the quantile
# edges are distinct, then cut into quintiles labelled 1..5.
ascending_labels = list(range(1, N_SCORE_BINS + 1))
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), N_SCORE_BINS,
                         labels=ascending_labels, duplicates='drop')
rfm['M_Score'] = pd.qcut(rfm['Monetary'].rank(method='first'), N_SCORE_BINS,
                         labels=ascending_labels, duplicates='drop')

# Convert to plain floats BEFORE any arithmetic (qcut labels are categorical),
# then fill any unscored rows with the middle score of 3.
for score_column in ['R_Score', 'F_Score', 'M_Score']:
    rfm[score_column] = rfm[score_column].astype(float).fillna(3)

# Create composite RFM score (range 3-15 when all three scores are 1-5).
rfm['RFM_Score'] = rfm['R_Score'] + rfm['F_Score'] + rfm['M_Score']

print("RFM scores created successfully!")
print(rfm[['R_Score', 'F_Score', 'M_Score', 'RFM_Score']].describe())

In [None]:
# Customer segmentation
def segment_customers(row):
    """Return the segment label for one customer row.

    `row` is any mapping-like object (e.g. a DataFrame row) exposing
    'RFM_Score', 'R_Score', 'F_Score' and 'M_Score'. Rules are evaluated in
    priority order: the composite score decides first, then recency combined
    with the sum of the frequency and monetary scores. Rows matching no rule
    (including rows whose scores are NaN) are labelled 'Others'.

    NOTE(review): when RFM_Score equals R_Score + F_Score + M_Score, any row
    with R_Score >= 4 and F_Score + M_Score >= 6 already has RFM_Score >= 10
    and is labelled 'Loyal Customers', so 'Potential Loyalists' is never
    returned for such rows. Confirm the intended rule order.
    """
    composite = row['RFM_Score']
    if composite >= 12:
        return 'Champions'
    if composite >= 9:
        return 'Loyal Customers'

    recency_score = row['R_Score']

    if recency_score >= 3:
        # Recent-ish buyers: split on combined frequency + monetary strength.
        freq_plus_monetary = row['F_Score'] + row['M_Score']
        if recency_score >= 4 and freq_plus_monetary >= 6:
            return 'Potential Loyalists'
        if freq_plus_monetary >= 5:
            return 'Recent Customers'
        return 'Others'

    if recency_score <= 2:
        # Lapsed buyers: valuable ones are at risk, the rest are hibernating.
        freq_plus_monetary = row['F_Score'] + row['M_Score']
        if freq_plus_monetary >= 6:
            return 'At Risk Customers'
        if freq_plus_monetary <= 5:
            return 'Hibernating'
        return 'Others'

    # Recency score strictly between 2 and 3, or not comparable (NaN).
    return 'Others'

# Label every customer row, then report how many customers fall in each segment.
segment_labels = rfm.apply(segment_customers, axis=1)
rfm['Segment'] = segment_labels

segment_counts = rfm['Segment'].value_counts()
print("Customer Segmentation:")
print(segment_counts)

In [None]:
# Additional derived features

# Invoices per week of tenure. Customers whose first and last invoice fall on
# the same day (Tenure == 0) keep their raw invoice count instead of dividing
# by zero.
tenure_weeks = rfm['Tenure'] / 7
rfm['PurchaseFrequency'] = np.where(rfm['Tenure'] > 0,
                                   rfm['Frequency'] / tenure_weeks,
                                   rfm['Frequency'])

# Invoices per unit of spend; the +1 offsets the denominator away from zero for
# customers with zero total spend.
rfm['FrequencyMonetaryRatio'] = rfm['Frequency'] / (rfm['Monetary'] + 1)

# NOTE(review): this divides tenure by Frequency + 1, not by the Frequency - 1
# gaps that separate a customer's invoices - confirm the intended definition.
rfm['AvgDaysBetweenPurchases'] = rfm['Tenure'] / (rfm['Frequency'] + 1)

# Fill any remaining NaN values with 0. Infinite values are treated as missing
# too: a ratio whose denominator is exactly zero (e.g. Monetary == -1 above)
# evaluates to +/-inf, which fillna alone would leave in the feature table.
numeric_columns = rfm.select_dtypes(include=[np.number]).columns
rfm[numeric_columns] = (
    rfm[numeric_columns]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

print(f"Final RFM dataset shape: {rfm.shape}")
print(f"Features: {list(rfm.columns)}")

In [None]:
# Persist the feature table to CSV in the notebook's working directory.
# NOTE(review): index=True also writes the frame's index as the first CSV
# column; if rfm carries a default integer index at this point that column is
# just a row counter - confirm downstream readers expect it.
output_csv = 'rfm_features_fixed.csv'
rfm.to_csv(output_csv, index=True)
print("RFM features saved to 'rfm_features_fixed.csv'")

# Summary statistics for the numeric columns.
summary_stats = rfm.describe()
print("\nRFM Summary Statistics:")
print(summary_stats)

In [None]:
# Create visualizations
# A 3x3 dashboard drawn on explicit Axes objects (no reliance on pyplot's
# "current axes" state); repeated panel types are driven from small tables.
fig, axes = plt.subplots(3, 3, figsize=(20, 15))
panels = axes.ravel()

# Panels 1-5: feature distributions.
# (column, title, clip the x-axis at the 95th percentile to hide the long tail?)
distribution_panels = [
    ('Recency', 'Recency Distribution', False),
    ('Frequency', 'Frequency Distribution', False),
    ('Monetary', 'Monetary Distribution', True),
    ('AvgOrderValue', 'Average Order Value Distribution', True),
    ('Tenure', 'Customer Tenure Distribution', False),
]
for ax, (column, title, clip_tail) in zip(panels[:5], distribution_panels):
    sns.histplot(rfm[column], kde=True, ax=ax)
    if clip_tail:
        ax.set_xlim(0, rfm[column].quantile(0.95))
    ax.set_title(title)

# Panel 6: number of customers per segment.
ax = panels[5]
sns.countplot(data=rfm, x='Segment', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Customer Segments')

# Panels 7-8: Monetary against Recency / Frequency, coloured by segment.
# Both axes are clipped at the 95th percentile so outliers do not flatten the cloud.
monetary_cap = rfm['Monetary'].quantile(0.95)
scatter_panels = [
    ('Recency', 'Recency vs Monetary by Segment'),
    ('Frequency', 'Frequency vs Monetary by Segment'),
]
for ax, (x_column, title) in zip(panels[6:8], scatter_panels):
    sns.scatterplot(data=rfm, x=x_column, y='Monetary', hue='Segment', alpha=0.6, ax=ax)
    ax.set_xlim(0, rfm[x_column].quantile(0.95))
    ax.set_ylim(0, monetary_cap)
    ax.set_title(title)

# Panel 9: spread of the composite score within each segment.
ax = panels[8]
sns.boxplot(data=rfm, x='Segment', y='RFM_Score', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('RFM Score by Segment')

fig.tight_layout()
fig.savefig('rfm_analysis_fixed.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n=== RFM ANALYSIS COMPLETED SUCCESSFULLY ===")
print(f"Total customers analyzed: {len(rfm)}")
print(f"Features created: {rfm.shape[1]}")
print("Files generated:")
print("✓ rfm_features_fixed.csv")
print("✓ rfm_analysis_fixed.png")