# Customer Segmentation Analysis

This notebook explores customer segmentation based on engagement patterns and reward preferences.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import json
import os
from datetime import datetime

# Plot defaults shared by every figure in the notebook:
# seaborn's white grid theme and a 12x8 inch default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## Load and Prepare Data

In [None]:
# Load the raw customer and event records.
# The encoding is pinned to UTF-8 (the interchange encoding for JSON) instead of
# relying on open()'s platform-dependent default, which would garble non-ASCII
# names or locations on systems whose locale codec is not UTF-8.
with open('../data/customers.json', 'r', encoding='utf-8') as f:
    customers = json.load(f)

with open('../data/events.json', 'r', encoding='utf-8') as f:
    events = json.load(f)

# Convert to DataFrames
# NOTE(review): assumes both files hold a list of flat-ish records — TODO confirm
customers_df = pd.DataFrame(customers)
events_df = pd.DataFrame(events)

# Report how much was loaded, then preview the customer table as the cell output
print(f"Loaded {len(customers_df)} customers and {len(events_df)} events")
customers_df.head()

In [None]:
# Extract customer attributes into separate columns.
# A record whose 'attributes' value is missing or not a dict is treated as having
# no attributes, so each lookup below falls back to its default instead of raising.
customer_attributes = customers_df['attributes'].apply(lambda x: x if isinstance(x, dict) else {})

# NOTE(review): a missing age is stored as 0, which is later fed to the clustering
# as if it were a real age — confirm whether an imputed value would be preferable.
customers_df['age'] = customer_attributes.apply(lambda x: x.get('age', 0))
customers_df['gender'] = customer_attributes.apply(lambda x: x.get('gender', 'unknown'))
customers_df['location'] = customer_attributes.apply(lambda x: x.get('location', 'unknown'))
# `or []` also covers an 'interests' key that is present but null
customers_df['interest_count'] = customer_attributes.apply(lambda x: len(x.get('interests') or []))

# Convert created_at to datetime
customers_df['created_at'] = pd.to_datetime(customers_df['created_at'])

# Take "now" in the same timezone as the parsed column (None for naive timestamps,
# which yields naive local time exactly like datetime.now()). Subtracting a naive
# "now" from timezone-aware timestamps would raise a TypeError.
signup_reference = pd.Timestamp.now(tz=customers_df['created_at'].dt.tz)
customers_df['days_since_signup'] = (signup_reference - customers_df['created_at']).dt.days

customers_df.head()

In [None]:
# Calculate engagement metrics per customer
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'])

# Count events by type for each customer: one row per customer_id, one column per
# event_type, with 0 where a customer has no events of that type
event_counts = events_df.groupby(['customer_id', 'event_type']).size().unstack(fill_value=0)

# Calculate recency (days since last event)
last_event = events_df.groupby('customer_id')['timestamp'].max()
last_event = pd.DataFrame(last_event).rename(columns={'timestamp': 'last_event'})

# Take "now" in the same timezone as the parsed timestamps (None for naive values,
# which yields naive local time exactly like datetime.now()). Subtracting a naive
# "now" from timezone-aware timestamps would raise a TypeError.
events_reference = pd.Timestamp.now(tz=events_df['timestamp'].dt.tz)
last_event['days_since_last_event'] = (events_reference - last_event['last_event']).dt.days

# Calculate frequency (total events)
event_frequency = events_df.groupby('customer_id').size().to_frame('total_events')

# Merge all metrics; all three tables are grouped from events_df by customer_id
engagement_metrics = pd.merge(event_counts, last_event, on='customer_id', how='outer')
engagement_metrics = pd.merge(engagement_metrics, event_frequency, on='customer_id', how='outer')

# Fill NaN values
engagement_metrics = engagement_metrics.fillna(0)

engagement_metrics.head()

## Perform Clustering

In [None]:
# Merge customer data with engagement metrics; the left join keeps customers that
# have no events at all, with NaN in every engagement column
customer_features = pd.merge(customers_df, engagement_metrics, left_on='id', right_on='customer_id', how='left')

# Select features for clustering
features = [
    'age', 'interest_count', 'days_since_signup', 'days_since_last_event',
    'email_open', 'email_click', 'reward_claim', 'purchase', 'total_events'
]

# The per-event-type columns only exist for event types that actually occur in the
# data, so add any absent one as an all-zero count rather than failing with KeyError.
for event_type in ('email_open', 'email_click', 'reward_claim', 'purchase'):
    if event_type not in customer_features.columns:
        customer_features[event_type] = 0

# A customer without events has no recency value. Filling it with 0 would make them
# look like the most recently active customers, so use the time since signup instead
# (they have been inactive for at least that long).
customer_features['days_since_last_event'] = (
    customer_features['days_since_last_event'].fillna(customer_features['days_since_signup'])
)

# Remaining gaps (e.g. event counts of customers without events) become 0
customer_features = customer_features.fillna(0)

# Scale the features so that no single column dominates the distance metric
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[features])

# Determine optimal number of clusters (elbow method).
# K-means cannot form more clusters than there are rows, so cap the search range.
max_k = min(10, len(customer_features))
wcss = []
for i in range(1, max_k + 1):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(scaled_features)
    wcss.append(kmeans.inertia_)  # within-cluster sum of squares for this k

plt.figure(figsize=(10, 6))
plt.plot(range(1, max_k + 1), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Fit the final K-means model (k=5) and tag every customer with its cluster id
final_kmeans_params = dict(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=42)
kmeans = KMeans(**final_kmeans_params)
kmeans.fit(scaled_features)
customer_features['cluster'] = kmeans.labels_

# Cluster sizes, ordered by cluster id
cluster_counts = customer_features['cluster'].value_counts().sort_index()
print("Customers per cluster:", cluster_counts, sep="\n")

# Mean of every clustering feature within each cluster
cluster_analysis = customer_features.groupby('cluster')[features].mean()
print()
print("Cluster characteristics:")
cluster_analysis

## Visualize Segments

In [None]:
# Scatter plot of the clusters over two engagement features
x_feature = 'days_since_last_event'
y_feature = 'total_events'

fig, ax = plt.subplots(figsize=(12, 10))

# One point per customer, coloured by cluster id
cluster_points = ax.scatter(
    customer_features[x_feature],
    customer_features[y_feature],
    c=customer_features['cluster'],
    cmap='viridis',
    s=50,
    alpha=0.7,
)
fig.colorbar(cluster_points, ax=ax, label='Cluster')

ax.set_xlabel(x_feature)
ax.set_ylabel(y_feature)
ax.set_title('Customer Segments based on Engagement')
ax.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Radar chart for cluster profiles
from matplotlib.path import Path
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D

def radar_chart(df, features, labels=None):
    """Draw one closed polygon per row of ``df`` on a polar (radar) chart.

    Args:
        df: DataFrame with one row per cluster and a column for every name in
            ``features``. The radial axis is fixed to the range 0-1, so values
            are expected to be normalised to that range beforehand.
        features: Column names to plot, one spoke each. The first feature is
            placed at the top and the rest follow clockwise.
        labels: Optional legend labels, one per row of ``df``. When omitted,
            each row is labelled ``"Cluster <index label>"`` using ``df.index``.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can continue with
        ``plt.show()`` or further pyplot calls.

    Raises:
        ValueError: If ``labels`` is given and its length differs from the
            number of rows in ``df``.
    """
    # Plot only the requested columns, in the requested order, so the number of
    # values per row always matches the number of spokes.
    profile_values = df[list(features)].to_numpy()

    if labels is None:
        # Use the frame's own index so the legend names the actual cluster ids
        labels = [f"Cluster {idx}" for idx in df.index]
    elif len(labels) != len(profile_values):
        raise ValueError(
            f"labels has {len(labels)} entries but df has {len(profile_values)} rows"
        )

    # Number of variables
    n_axes = len(features)

    # Angle of each spoke; the first angle is repeated to close the polygon
    angles = [n / float(n_axes) * 2 * np.pi for n in range(n_axes)]
    angles += angles[:1]

    # Initialize the plot
    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

    # Put the first axis on top and go clockwise
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)

    # One spoke per variable, labelled with the feature name
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(features, size=12)

    # Radial tick labels
    ax.set_rlabel_position(0)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8])
    ax.set_yticklabels(["0.2", "0.4", "0.6", "0.8"], size=10)
    ax.set_ylim(0, 1)

    # Plot each cluster as an outlined, lightly filled polygon
    for label, row in zip(labels, profile_values):
        values = row.tolist()
        values += values[:1]  # repeat the first value to close the polygon
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=label)
        ax.fill(angles, values, alpha=0.1)

    # Add legend and title
    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    ax.set_title('Cluster Profiles', size=20, y=1.1)

    return plt

# Rescale every feature of the cluster means by its largest cluster mean so all
# spokes of the radar chart share the same 0-1 scale
normalized_clusters = cluster_analysis.copy()
feature_peaks = normalized_clusters[features].max()
for feature, max_val in feature_peaks.items():
    if not max_val > 0:
        # Leave the column untouched rather than dividing by zero
        continue
    normalized_clusters[feature] = normalized_clusters[feature] / max_val

# Draw the radar chart of the normalised cluster profiles
radar_chart(normalized_clusters, features)
plt.show()

## Assign Segment Names and Export

In [None]:
# Human-readable segment names, keyed by cluster id (position in the list).
# NOTE(review): K-means numbers its clusters arbitrarily, so these descriptions
# only hold if they were checked against the cluster characteristics table for
# the current data — TODO confirm after every re-fit.
segment_mapping = dict(enumerate([
    "At Risk",    # 0: High days_since_last_event, low engagement
    "Standard",   # 1: Average metrics across the board
    "Recent",     # 2: Low days_since_signup, moderate engagement
    "Active",     # 3: High total_events, high email engagement
    "VIP",        # 4: High purchase and reward_claim, high total_events
]))

# Translate each customer's cluster id into its segment name
customer_features['segment'] = customer_features['cluster'].map(segment_mapping)

# Segment sizes, largest first
segment_counts = customer_features['segment'].value_counts()
print("Customers per segment:", segment_counts, sep="\n")

# Bar chart of the segment distribution
fig, ax = plt.subplots(figsize=(10, 6))
segment_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Customer Segment Distribution')
ax.set_xlabel('Segment')
ax.set_ylabel('Number of Customers')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Write the per-customer segment assignment to CSV.
# Note that the export contains personal fields (email, name) alongside the segment.
export_columns = ['id', 'email', 'name', 'age', 'gender', 'location', 'segment']
export_path = '../data/segmented_customers.csv'

segmented_customers = customer_features[export_columns]
segmented_customers.to_csv(export_path, index=False)

print(f"Exported {len(segmented_customers)} segmented customers to '{export_path}'")

## Segment-Based Reward Recommendations

In [None]:
def _reward_plan(primary, secondary, content):
    """Bundle the three lists that make up one segment's reward strategy."""
    return {
        "primary_rewards": primary,
        "secondary_rewards": secondary,
        "content_focus": content,
    }


# Reward strategy per segment name: primary rewards, secondary rewards, content focus
segment_strategies = {
    "VIP": _reward_plan(
        ["exclusive_access", "high_value_gift_cards", "premium_loyalty_points"],
        ["free_shipping", "personalized_discounts"],
        ["exclusive_offers", "early_access", "premium_content"],
    ),
    "Active": _reward_plan(
        ["loyalty_points", "tiered_discounts", "referral_bonuses"],
        ["free_items", "birthday_rewards"],
        ["games", "personalized_recommendations", "community_content"],
    ),
    "Recent": _reward_plan(
        ["welcome_discounts", "first_purchase_bonus", "free_trial_extensions"],
        ["small_gift_cards", "loyalty_program_enrollment"],
        ["product_education", "brand_introduction", "onboarding_content"],
    ),
    "Standard": _reward_plan(
        ["percentage_discounts", "seasonal_offers", "loyalty_program"],
        ["free_shipping_threshold", "bundle_discounts"],
        ["product_highlights", "educational_content", "seasonal_campaigns"],
    ),
    "At Risk": _reward_plan(
        ["win_back_discounts", "free_shipping", "no_minimum_purchase_offers"],
        ["extended_returns", "customer_feedback_incentives"],
        ["re-engagement_campaigns", "simplified_options", "direct_value_proposition"],
    ),
}

# Printed heading paired with the strategy key it summarises, in display order
_SUMMARY_ROWS = (
    ("Primary Rewards", "primary_rewards"),
    ("Secondary Rewards", "secondary_rewards"),
    ("Content Focus", "content_focus"),
)

# Print a short, comma-separated summary of every segment's strategy
for segment, strategy in segment_strategies.items():
    print(f"\n{segment} Segment Strategy:")
    for heading, key in _SUMMARY_ROWS:
        print(f"{heading}: {', '.join(strategy[key])}")

In [None]:
# Visualize optimal content mix by segment
content_types = ['questions', 'games', 'vouchers', 'newsletters', 'surveys']
# Proportion of each content type per segment, in the order of content_types
content_mix = {
    "VIP": [0.1, 0.2, 0.3, 0.3, 0.1],
    "Active": [0.2, 0.3, 0.2, 0.2, 0.1],
    "Recent": [0.3, 0.2, 0.3, 0.1, 0.1],
    "Standard": [0.2, 0.1, 0.2, 0.4, 0.1],
    "At Risk": [0.1, 0.1, 0.5, 0.2, 0.1]
}

# Rows are content types, columns are segments
content_df = pd.DataFrame(content_mix, index=content_types)

# Create stacked bar chart.
# DataFrame.plot opens a new figure of its own unless it is handed an Axes, which
# would leave a blank figure behind and ignore the requested size; draw on an
# explicitly created Axes instead.
# NOTE(review): each bar stacks the five segments for one content type, so bar
# heights are sums across segments; plotting content_df.T would instead show one
# bar per segment totalling 1 — confirm which view is intended.
fig, ax = plt.subplots(figsize=(12, 6))
content_df.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
ax.set_title('Optimal Content Mix by Segment')
ax.set_xlabel('Content Type')
ax.set_ylabel('Proportion')
ax.legend(title='Segment')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()