In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import os

class CustomerSegmentation:
    """K-means customer segmentation pipeline over a synthetic dataset.

    Intended call order: construct the object, then ``preprocess_data()``,
    optionally ``plot_elbow_method()``, then ``perform_clustering()`` and
    finally ``analyze_segments()``.

    CSV and PNG artifacts are written to ``data/`` and ``output/``, both
    relative to the current working directory.
    """

    def __init__(self):
        # Constructing the object generates the dataset and writes it to disk.
        self.raw_data = self.generate_synthetic_data()
        # Scaled feature matrix; populated by preprocess_data().
        self.preprocessed_data = None
        # Cluster label per row; populated by perform_clustering().
        self.clusters = None

    def generate_synthetic_data(self, n_samples=1000, random_state=42):
        """Generate synthetic customer data and export it as CSV.

        Args:
            n_samples: Number of customer rows to generate.
            random_state: Seed for the random draws. The default (42)
                yields the same values as seeding NumPy's global generator
                with 42, but the global generator itself is left untouched.
                Pass ``None`` for a non-deterministic dataset.

        Returns:
            DataFrame with columns ``customer_id``, ``age``,
            ``annual_income``, ``total_purchases``, ``first_purchase_date``
            and ``product_category``.

        Side effects:
            Creates ``data/`` and ``output/`` if missing and writes
            ``data/lg_customer_data.csv``.
        """
        # A private generator avoids reseeding NumPy's global state as a
        # hidden side effect of building the dataset.
        rng = np.random.RandomState(random_state)

        # NOTE: the draw order below (age, income, purchases, category) is
        # what makes the output reproducible for a given seed; keep it.
        data = {
            'customer_id': range(1, n_samples + 1),
            'age': rng.normal(45, 15, n_samples),
            'annual_income': rng.normal(75000, 25000, n_samples),
            'total_purchases': rng.gamma(5, 1000, n_samples),
            'first_purchase_date': pd.date_range(start='2018-01-01', periods=n_samples),
            'product_category': rng.choice(['Electronics', 'Mobile', 'Home Appliance', 'TV'], n_samples)
        }
        df = pd.DataFrame(data)

        # Ensure data and output folders exist
        os.makedirs('data', exist_ok=True)
        os.makedirs('output', exist_ok=True)

        # Export synthetic dataset
        df.to_csv('data/lg_customer_data.csv', index=False)
        return df

    def _require_preprocessed(self):
        """Return the scaled feature matrix, or raise if it is not built yet."""
        if self.preprocessed_data is None:
            raise ValueError(
                "No preprocessed data available; call preprocess_data() first."
            )
        return self.preprocessed_data

    def preprocess_data(self):
        """Engineer features on ``raw_data`` and build the scaled matrix.

        Adds two columns to ``self.raw_data`` in place:

        * ``product_category_diversity`` - number of distinct product
          categories per ``customer_id``. With the synthetic data every
          ``customer_id`` occurs exactly once, so this is 1 for every row.
        * ``loyalty_duration`` - whole days between ``first_purchase_date``
          and the current time, so the value depends on when this runs.

        The standardized feature matrix is stored in
        ``self.preprocessed_data``.
        """
        # Feature engineering
        self.raw_data['product_category_diversity'] = self.raw_data.groupby('customer_id')['product_category'].transform('nunique')
        self.raw_data['loyalty_duration'] = (pd.Timestamp.now() - self.raw_data['first_purchase_date']).dt.days

        # Select and scale features
        features = [
            'age',
            'annual_income',
            'total_purchases',
            'product_category_diversity',
            'loyalty_duration'
        ]
        X = self.raw_data[features]
        scaler = StandardScaler()
        self.preprocessed_data = scaler.fit_transform(X)

    def plot_elbow_method(self, max_clusters=10):
        """Fit K-means for k = 1..max_clusters and plot inertia against k.

        Args:
            max_clusters: Largest number of clusters to try (inclusive).

        Returns:
            List of inertia values, one per k, in increasing order of k.

        Raises:
            ValueError: If ``preprocess_data()`` has not been called yet.

        Side effects:
            Writes the plot to ``output/elbow_method.png``.
        """
        X = self._require_preprocessed()

        inertias = []
        for k in range(1, max_clusters + 1):
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(X)
            inertias.append(kmeans.inertia_)

        # Do not rely on generate_synthetic_data() having created the folder
        # in the same working directory.
        os.makedirs('output', exist_ok=True)

        fig = plt.figure(figsize=(10, 6))
        try:
            plt.plot(range(1, max_clusters + 1), inertias, marker='o')
            plt.title('Elbow Method for Optimal k')
            plt.xlabel('Number of Clusters')
            plt.ylabel('Inertia')
            plt.savefig('output/elbow_method.png')
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(fig)

        return inertias

    def perform_clustering(self, n_clusters=4):
        """Cluster the preprocessed data with K-means and label each customer.

        Args:
            n_clusters: Number of clusters to fit.

        Raises:
            ValueError: If ``preprocess_data()`` has not been called yet.

        Side effects:
            Stores the labels in ``self.clusters``, adds them to
            ``self.raw_data`` as ``customer_segment`` and writes the
            labelled data to ``data/lg_customer_segments.csv``.
        """
        X = self._require_preprocessed()

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        self.clusters = kmeans.fit_predict(X)

        # Add cluster labels to original dataframe
        self.raw_data['customer_segment'] = self.clusters

        # Export segmented data
        os.makedirs('data', exist_ok=True)
        self.raw_data.to_csv('data/lg_customer_segments.csv', index=False)

    def analyze_segments(self):
        """Summarize each customer segment.

        Requires the ``customer_segment`` column added by
        ``perform_clustering()`` and the ``loyalty_duration`` column added
        by ``preprocess_data()``.

        Returns:
            DataFrame indexed by ``customer_segment`` with the mean of
            ``age``, ``annual_income``, ``total_purchases`` and
            ``loyalty_duration``, plus a ``segment_size`` row count.

        Side effects:
            Writes the profiles to ``output/segment_profiles.csv``.
        """
        segment_profiles = self.raw_data.groupby('customer_segment').agg({
            'age': 'mean',
            'annual_income': 'mean',
            'total_purchases': 'mean',
            'loyalty_duration': 'mean',
            'customer_id': 'count'
        }).rename(columns={'customer_id': 'segment_size'})

        # Export segment profiles
        os.makedirs('output', exist_ok=True)
        segment_profiles.to_csv('output/segment_profiles.csv')

        return segment_profiles

# Execution
def main():
    """Run the full segmentation pipeline and print the segment profiles.

    Steps, in order: build the dataset, preprocess it, save the elbow plot,
    cluster into four segments, then summarize each segment.
    """
    pipeline = CustomerSegmentation()
    pipeline.preprocess_data()

    # The elbow plot is saved to disk; the returned inertias are not used here.
    pipeline.plot_elbow_method()

    # Label every customer with one of four segments.
    pipeline.perform_clustering(n_clusters=4)

    # Summarize and display the per-segment characteristics.
    print(pipeline.analyze_segments())


if __name__ == "__main__":
    main()

                        age  annual_income  total_purchases  loyalty_duration  \
customer_segment                                                                
0                 52.408556   72769.882424      4619.643582       2295.750831   
1                 29.781303   89275.381820      4178.024732       2053.470120   
2                 44.817401   77878.010415      8809.104954       1921.251534   
3                 51.700569   69350.648376      4107.173590       1736.470175   

                  segment_size  
customer_segment                
0                          301  
1                          251  
2                          163  
3                          285  
