# Customer Segmentation

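This notebook segments customers with K-means clustering: it loads the preprocessed data, chooses the number of clusters, fits the model, and examines how churn and key features differ across the resulting segments.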

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the `src` package importable. `from src.<module> import ...` needs the
# directory that CONTAINS `src` on sys.path, not the `src` folder itself.
# os.path.abspath('') resolves to the current working directory, so its parent
# is used as the project root.
# NOTE(review): assumes the notebook runs from a folder directly under the
# project root (consistent with the '../data' style paths used in this notebook).
project_root = os.path.dirname(os.path.abspath(''))
if project_root not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(project_root)

# Import modules
from src.data_preprocessing import DataPreprocessor
from src.customer_segmentation import CustomerSegmentation

# Plot appearance: apply the matplotlib style sheet, then seaborn's palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# pandas display limits, applied one option at a time
for option_name, option_value in (('display.max_columns', 50),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

## 1. Load Processed Data

In [None]:
# Location of the processed dataset, relative to the working directory
processed_data_path = '../data/processed/processed_data.csv'

# Read the dataset through the project's DataPreprocessor
preprocessor = DataPreprocessor()
df = preprocessor.load_processed_data(processed_data_path)

# Preview the leading rows (shown as the cell output)
df.head()

## 2. Prepare Data for Clustering

In [None]:
# Columns kept out of the clustering input: the target variable and the customer ID
excluded_columns = ['Churn', 'CustomerID']
clustering_data = df.drop(excluded_columns, axis=1)

# Report the resulting shape and the remaining feature names
print(f"Data shape for clustering: {clustering_data.shape}")
print("\nFeatures for clustering:", clustering_data.columns.tolist(), sep="\n")

## 3. Find Optimal Number of Clusters

In [None]:
# Build the segmentation helper that the following cells reuse
segmenter = CustomerSegmentation()

# Evaluate candidate cluster counts up to the given maximum and keep both
# returned results for later inspection
cluster_search = segmenter.find_optimal_clusters(clustering_data, max_clusters=8)
inertias, silhouette_scores = cluster_search

## 4. Perform Clustering

In [None]:
# Number of segments to fit. The value is hard-coded; per the original note it
# was chosen from the cluster-count analysis in the previous section.
optimal_clusters = 5
clusters = segmenter.fit_predict(clustering_data, n_clusters=optimal_clusters)

# Attach the labels to a copy so the loaded frame `df` is left untouched
df_with_clusters = df.copy()
df_with_clusters['Cluster'] = clusters

# Customers per cluster, ordered by cluster label
cluster_counts = df_with_clusters['Cluster'].value_counts().sort_index()
print("Cluster Distribution:")
print(cluster_counts)

# Bar chart of cluster sizes (explicit figure/axes instead of pyplot global state)
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Cluster', data=df_with_clusters, ax=ax)
ax.set_title('Cluster Distribution')
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')

# Label each bar with its share of all customers. The gap above the bar is
# given in points rather than data units, so the labels sit correctly whether
# the clusters hold dozens or thousands of customers.
total = len(df_with_clusters)
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height/total*100:.1f}%',
                xy=(p.get_x() + p.get_width()/2., height),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom')

fig.tight_layout()
# savefig does not create missing folders, so make sure the target exists first
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/cluster_distribution.png', dpi=300)
plt.show()

## 5. Visualize Clusters

In [None]:
# Run the segmenter's PCA helper with two components; it hands back the
# transformed data together with the explained variance
pca_result = segmenter.perform_pca(clustering_data, n_components=2)
pca_data, explained_variance = pca_result

print(f"Explained variance by PCA components: {explained_variance}")

# Plot the cluster assignments using the PCA output
segmenter.visualize_clusters(clustering_data, clusters, pca_data)

## 6. Analyze Cluster Characteristics

In [None]:
# Per-cluster analysis produced by the segmenter
cluster_analysis = segmenter.analyze_clusters(clustering_data, clusters)

# Churn per cluster: mean, sum and row count of the 'Churn' column.
# NOTE(review): reading the mean as a rate assumes 'Churn' is numeric 0/1 - confirm.
# Columns are renamed by name rather than by position, so the labels cannot
# drift if the list of aggregations is ever reordered.
cluster_churn = (
    df_with_clusters.groupby('Cluster')['Churn']
    .agg(['mean', 'sum', 'count'])
    .rename(columns={'mean': 'Churn Rate',
                     'sum': 'Churn Count',
                     'count': 'Total Customers'})
)
cluster_churn['Churn Rate'] = cluster_churn['Churn Rate'] * 100  # fraction -> percent

print("Churn Analysis by Cluster:")
print(cluster_churn)

# Bar chart of churn rate per cluster (explicit figure/axes instead of pyplot global state)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=cluster_churn.index, y='Churn Rate', data=cluster_churn, ax=ax)
ax.set_title('Churn Rate by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Churn Rate (%)')

# Value labels just above each bar; the gap is given in points so it does not
# depend on the scale of the y-axis
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height:.1f}%',
                xy=(p.get_x() + p.get_width()/2., height),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom')

fig.tight_layout()
# savefig does not create missing folders, so make sure the target exists first
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/churn_by_cluster.png', dpi=300)
plt.show()

## 7. Feature Analysis by Cluster

In [None]:
# Features whose per-cluster distributions are compared
key_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']

# Size the grid from the feature list (two panels per row) so editing
# key_features does not require touching the layout code
n_cols = 2
n_rows = max(1, (len(key_features) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 12), squeeze=False)
flat_axes = axes.ravel()

# One box plot per feature, grouped by cluster label
for ax, feature in zip(flat_axes, key_features):
    sns.boxplot(x='Cluster', y=feature, data=df_with_clusters, ax=ax)
    ax.set_title(f'{feature} by Cluster')

# Hide any panel left over when the grid has more slots than features
for ax in flat_axes[len(key_features):]:
    ax.set_visible(False)

# Lay out once, after every panel is drawn, rather than on each iteration
fig.tight_layout()
# savefig does not create missing folders, so make sure the target exists first
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/features_by_cluster.png', dpi=300)
plt.show()

## 8. Save Model and Results

In [None]:
# Output locations, relative to the working directory
model_path = '../models/cluster_model.pkl'
clustered_data_path = '../data/processed/data_with_clusters.csv'

# Store the model through the segmenter's own save routine, then write the
# labelled frame to CSV without the index column
segmenter.save_model(model_path)
df_with_clusters.to_csv(clustered_data_path, index=False)

print("Model and results saved successfully.")

## 9. Summary

This notebook performed customer segmentation using K-means clustering. Key findings:

1. Identified 5 distinct customer segments
2. Clusters show different characteristics and churn patterns
3. Some clusters have significantly higher churn rates
4. PCA visualization shows clear separation between clusters

Next steps:
- Build churn prediction models
- Generate business insights and recommendations
- Develop targeted retention strategies