# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# ==================== STEP 1: LOAD AND EXPLORE DATA ====================
print("="*70)
print("STEP 1: DATA LOADING AND EXPLORATION")
print("="*70)

# Load the customer table. Later steps read its CustomerID, Gender, Age,
# 'Annual Income (k$)' and 'Spending Score (1-100)' columns.
df = pd.read_csv('Mall_Customers.csv')
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line to the output).
df.info()
print("\nBasic statistics:")
print(df.describe())

# ==================== STEP 2: K-MEANS CLUSTERING ====================
print("\n" + "="*70)
print("STEP 2: K-MEANS CLUSTERING")
print("="*70)

# Prepare features for clustering: customers are grouped on income and
# spending score only.
X = df[['Annual Income (k$)', 'Spending Score (1-100)']]

# Fail fast with a clear message if a clustering feature has missing values,
# instead of letting them flow into the scaler and the clustering fits below.
missing_counts = X.isna().sum()
if missing_counts.any():
    raise ValueError(
        "Clustering features contain missing values: "
        f"{missing_counts[missing_counts > 0].to_dict()}"
    )

# Standardise both features so that neither dominates the distance metric.
# The fitted scaler is kept to map cluster centres back to original units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Sweep candidate cluster counts and record two quality measures per fit:
# inertia (for the elbow plot) and the silhouette score.
print("\nFinding optimal number of clusters...")
K_range = range(2, 11)  # candidates are fixed at k = 2..10
inertias, silhouette_scores = [], []

for n_clusters in K_range:
    candidate = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    candidate.fit(X_scaled)
    inertias.append(candidate.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, candidate.labels_))

# One report line per candidate k, emitted in a single print call.
print("\nElbow Method Results:")
print("\n".join(
    f"K={n_clusters}: Inertia={inertia:.2f}, Silhouette={sil_score:.3f}"
    for n_clusters, inertia, sil_score in zip(K_range, inertias, silhouette_scores)
))

# Final model: fit once with the chosen cluster count and tag every customer
# with the cluster it was assigned to.
optimal_k = 5  # fixed choice; not derived from the sweep results above
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(X_scaled)
df['Cluster_KMeans'] = kmeans.labels_

print(f"\nK-Means completed with {optimal_k} clusters")
print("\nCluster distribution:")
print(df['Cluster_KMeans'].value_counts().sort_index())

# Centres live in standardised space; undo the scaling to report them in the
# original units (income in k$, spending score).
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
print("\nCluster Centers (Income, Spending):")
for cluster_id, (income_k, spending) in enumerate(cluster_centers):
    print(f"Cluster {cluster_id}: Income=${income_k:.1f}k, Spending={spending:.1f}")

# ==================== STEP 3: DBSCAN CLUSTERING ====================
print("\n" + "="*70)
print("STEP 3: DBSCAN CLUSTERING")
print("="*70)

# Density-based clustering on the same standardised features.
# NOTE: eps and min_samples are fixed starting values, not tuned for this data.
dbscan = DBSCAN(eps=0.5, min_samples=5)
df['Cluster_DBSCAN'] = dbscan.fit_predict(X_scaled)

print("\nDBSCAN Results:")
print(df['Cluster_DBSCAN'].value_counts().sort_index())

# DBSCAN labels noise points -1, and noise must not be counted as a cluster.
# Work on the label *values*: `-1 in series` would test the Series index
# instead, so the noise label would never be excluded from the count.
dbscan_labels = df['Cluster_DBSCAN'].to_numpy()
n_noise = int((dbscan_labels == -1).sum())
n_clusters_dbscan = len(set(dbscan_labels.tolist()) - {-1})
print(f"Number of clusters: {n_clusters_dbscan}")
print(f"Noise points: {n_noise}")

# ==================== STEP 4: SEGMENT ANALYSIS ====================
print("\n" + "="*70)
print("STEP 4: CUSTOMER SEGMENT ANALYSIS")
print("="*70)

# (source column, aggregation, output column) for the per-cluster summary.
summary_spec = [
    ('Annual Income (k$)', 'mean', 'Avg_Income'),
    ('Spending Score (1-100)', 'mean', 'Avg_Spending'),
    ('Age', 'mean', 'Avg_Age'),
    ('CustomerID', 'count', 'Count'),
]

# One row per K-Means cluster, rounded to two decimals, then renamed to the
# short column names used by the labelling step.
segment_analysis = (
    df.groupby('Cluster_KMeans')
    .agg({column: func for column, func, _ in summary_spec})
    .round(2)
)
segment_analysis.columns = [output_name for *_, output_name in summary_spec]

print(segment_analysis)

# Label segments
def label_segment(row):
    """Return the marketing segment name for one cluster summary row.

    Args:
        row: Mapping-like object with 'Avg_Income' and 'Avg_Spending' entries.

    Returns:
        One of five segment names. Income counts as high above 70 and low
        below 40; spending counts as high above 60 and low below 40. Any
        combination that is not strictly high/low on both axes (including
        values exactly on a threshold) is the "Standard" segment.
    """
    income = row['Avg_Income']
    spending = row['Avg_Spending']

    # Classify each axis on its own; the middle band gets no tag.
    if income > 70:
        income_band = "high"
    elif income < 40:
        income_band = "low"
    else:
        income_band = None

    if spending > 60:
        spending_band = "high"
    elif spending < 40:
        spending_band = "low"
    else:
        spending_band = None

    named_segments = {
        ("high", "high"): "High Income - High Spending (Premium)",
        ("high", "low"): "High Income - Low Spending (Careful)",
        ("low", "high"): "Low Income - High Spending (Impulsive)",
        ("low", "low"): "Low Income - Low Spending (Budget)",
    }
    return named_segments.get(
        (income_band, spending_band),
        "Medium Income - Medium Spending (Standard)",
    )

# Name each cluster from its averages, then copy that name onto every customer
# row through the cluster id.
segment_names = segment_analysis.apply(label_segment, axis=1)
segment_analysis['Segment_Name'] = segment_names
cluster_to_segment = segment_analysis['Segment_Name'].to_dict()
df['Segment'] = df['Cluster_KMeans'].map(cluster_to_segment)

print("\nSegment Labels:")
label_columns = ['Segment_Name', 'Count', 'Avg_Income', 'Avg_Spending']
print(segment_analysis[label_columns])

# ==================== STEP 5: PRODUCT RECOMMENDATION ENGINE ====================
section_rule = "=" * 70
print("\n" + section_rule)
print("STEP 5: PRODUCT RECOMMENDATION ENGINE")
print(section_rule)

# Candidate products per segment. The keys are the exact segment names that
# label_segment() returns, because recommendations look products up by name.
product_catalog = {
    "High Income - High Spending (Premium)": [
        "Luxury watches",
        "Designer handbags",
        "Premium electronics",
        "Fine jewelry",
        "High-end fashion",
        "Luxury cars",
    ],
    "High Income - Low Spending (Careful)": [
        "Investment plans",
        "Savings accounts",
        "Quality essentials",
        "Practical luxury items",
        "Home improvement",
        "Educational courses",
    ],
    "Low Income - High Spending (Impulsive)": [
        "Trendy fashion",
        "Budget electronics",
        "Flash sale items",
        "Entertainment subscriptions",
        "Affordable gadgets",
        "Fast fashion",
    ],
    "Low Income - Low Spending (Budget)": [
        "Discounted items",
        "Essential goods",
        "Budget groceries",
        "Basic clothing",
        "Generic brands",
        "Clearance sales",
    ],
    "Medium Income - Medium Spending (Standard)": [
        "Mid-range electronics",
        "Casual wear",
        "Home decor",
        "Moderate jewelry",
        "Standard appliances",
        "General retail",
    ],
}

# Segment-based recommendation
def recommend_products(customer_id, top_n=3):
    """Pick products for a customer from their segment's catalog entry.

    Args:
        customer_id: Value matched against the 'CustomerID' column of ``df``.
        top_n: Maximum number of products to return.

    Returns:
        A list of at most ``top_n`` distinct products drawn at random from the
        customer's segment in ``product_catalog`` (empty if the segment has no
        catalog entry), or the string "Customer not found" when no row of
        ``df`` has that CustomerID.
    """
    matches = df[df['CustomerID'] == customer_id]
    if matches.shape[0] == 0:
        # NOTE(review): a string sentinel is returned here instead of raising;
        # callers must check for it before treating the result as a list.
        return "Customer not found"

    segment = matches['Segment'].iloc[0]
    products = product_catalog.get(segment, [])

    # Random sample without replacement; the draw is not seeded, so results
    # differ between calls.
    sample_size = min(top_n, len(products))
    picks = np.random.choice(products, sample_size, replace=False)
    return list(picks)

# Demo: print the profile and segment-based picks for a few sample customers.
print("\nSegment-Based Recommendations:")
test_customers = [1, 50, 100, 150, 200]
for cust_id in test_customers:
    customer_info = df[df['CustomerID'] == cust_id].iloc[0]
    recommendations = recommend_products(cust_id, top_n=3)

    # Pull the fields out once so the report lines below stay short.
    age = customer_info['Age']
    income = customer_info['Annual Income (k$)']
    spending = customer_info['Spending Score (1-100)']
    segment = customer_info['Segment']
    picks = ', '.join(recommendations)

    print(f"\nCustomer {cust_id}:")
    print(f"  Age: {age}, Income: ${income}k")
    print(f"  Spending: {spending} | Segment: {segment}")
    print(f"  Recommendations: {picks}")

# ==================== STEP 6: COLLABORATIVE FILTERING ====================
print("\n" + "="*70)
print("STEP 6: COLLABORATIVE FILTERING")
print("="*70)

# Customer-to-customer similarity: standardise the three profile features,
# then take pairwise cosine similarity. Row/column i of the matrix corresponds
# to the i-th row of df.
profile_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
feature_matrix = df[profile_columns].values
profile_scaler = StandardScaler()
profile_scaler.fit(feature_matrix)
feature_matrix_scaled = profile_scaler.transform(feature_matrix)
similarity_matrix = cosine_similarity(feature_matrix_scaled)

def collaborative_recommend(customer_id, top_n=3, n_neighbors=5):
    """Recommend products from the segment most common among similar customers.

    The customer's nearest neighbours are taken from ``similarity_matrix``;
    products are then sampled at random from the catalog entry of the segment
    that occurs most often among those neighbours.

    Args:
        customer_id: Value matched against the 'CustomerID' column of ``df``.
        top_n: Maximum number of products to return.
        n_neighbors: Number of most-similar customers to consult (default 5).

    Returns:
        dict with keys 'current_segment' (the customer's own segment),
        'similar_segment' (most frequent segment among the neighbours),
        'recommendations' (list of at most ``top_n`` products) and
        'similarity_score' (similarity to the single closest neighbour).

    Raises:
        ValueError: If ``n_neighbors`` is smaller than 1.
        IndexError: If no row of ``df`` has the given CustomerID, or if there
            is no other customer to compare against.
    """
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")

    # Rows of similarity_matrix follow df's row order, so find the customer's
    # row position by CustomerID instead of assuming IDs run 1..N in order
    # (with `customer_id - 1`, an ID of 0 would silently select the last row).
    positions = np.flatnonzero((df['CustomerID'] == customer_id).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Customer {customer_id} not found")
    idx = int(positions[0])

    similarities = similarity_matrix[idx]
    # Rank everyone by descending similarity and drop the customer explicitly:
    # with tied scores the customer is not guaranteed to sort first.
    ranked = np.argsort(similarities)[::-1]
    similar_indices = ranked[ranked != idx][:n_neighbors]
    if similar_indices.size == 0:
        raise IndexError("No other customers available for comparison")

    customer_segment = df.iloc[idx]['Segment']
    similar_segments = df.iloc[similar_indices]['Segment'].value_counts()
    recommended_segment = similar_segments.index[0]

    products = product_catalog.get(recommended_segment, [])
    # Unseeded random sample without replacement from the segment's products.
    recommendations = np.random.choice(products, min(top_n, len(products)), replace=False)

    return {
        'current_segment': customer_segment,
        'similar_segment': recommended_segment,
        'recommendations': list(recommendations),
        'similarity_score': similarities[similar_indices[0]]
    }

# Demo: show the collaborative-filtering result for a few sample customers.
print("\nCollaborative Filtering Recommendations:")
for cust_id in [15, 75, 120]:
    result = collaborative_recommend(cust_id)
    customer_info = df[df['CustomerID'] == cust_id].iloc[0]

    # Pull the fields out once so the report lines below stay short.
    income = customer_info['Annual Income (k$)']
    spending = customer_info['Spending Score (1-100)']
    current_segment = result['current_segment']
    neighbour_segment = result['similar_segment']
    top_similarity = result['similarity_score']
    picks = ', '.join(result['recommendations'])

    print(f"\nCustomer {cust_id}:")
    print(f"  Income: ${income}k, Spending: {spending}")
    print(f"  Current: {current_segment}")
    print(f"  Similar: {neighbour_segment} (similarity: {top_similarity:.3f})")
    print(f"  Recommendations: {picks}")

# ==================== STEP 7: SAVE RESULTS ====================
print("\n" + "="*70)
print("STEP 7: SAVING RESULTS")
print("="*70)

# Export the original customer columns plus the K-Means cluster id and the
# segment name; the DBSCAN labels are not included.
df_output = df[['CustomerID', 'Gender', 'Age', 'Annual Income (k$)', 
                'Spending Score (1-100)', 'Cluster_KMeans', 'Segment']]
df_output.to_csv('customer_segments_output.csv', index=False)
# "\u2713" is the check mark; the literal had been corrupted into mojibake
# (its UTF-8 bytes decoded as Windows-1252). The escape keeps the source ASCII.
print("\u2713 Results saved to 'customer_segments_output.csv'")

# ==================== STEP 8: VISUALIZATION ====================
print("\n" + "="*70)
print("STEP 8: CREATING VISUALIZATIONS")
print("="*70)

# 2D Cluster Visualization
plt.figure(figsize=(12, 8))
colors = ['blue', 'red', 'green', 'orange', 'purple']

# K-Means numbers its clusters arbitrarily, so legend names are read from the
# segment actually assigned to each cluster's customers rather than from a
# hard-coded list. `labels` stays a list indexed by cluster id.
labels = []
for i in range(optimal_k):
    cluster_data = df[df['Cluster_KMeans'] == i]
    segment_name = cluster_data['Segment'].iloc[0] if not cluster_data.empty else 'no customers'
    labels.append(f"Cluster {i}: {segment_name}")
    # Cycle through the palette so more than len(colors) clusters cannot
    # raise an IndexError.
    plt.scatter(cluster_data['Annual Income (k$)'],
                cluster_data['Spending Score (1-100)'],
                c=colors[i % len(colors)], label=labels[i], s=50, alpha=0.6)

# Plot cluster centers, converted back from standardised to original units
centers = scaler.inverse_transform(kmeans.cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, marker='X', 
            edgecolors='white', linewidths=2, label='Centroids')

plt.xlabel('Annual Income (k$)', fontsize=12)
plt.ylabel('Spending Score (1-100)', fontsize=12)
plt.title('Customer Segmentation - K-Means Clustering', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('customer_clusters_2d.png', dpi=300, bbox_inches='tight')
# "\u2713" is the check mark; the literal had been corrupted into mojibake.
print("\u2713 2D visualization saved to 'customer_clusters_2d.png'")

# 3D Visualization (Age, Income, Spending)
# Imported for its side effect: older matplotlib releases need it to register
# the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

for i in range(optimal_k):
    cluster_data = df[df['Cluster_KMeans'] == i]
    # K-Means cluster ids are arbitrary, so the legend name comes from the
    # segment assigned to this cluster's customers, not from a fixed list.
    segment_name = cluster_data['Segment'].iloc[0] if not cluster_data.empty else 'no customers'
    # Cycle through the palette so more than len(colors) clusters cannot
    # raise an IndexError.
    ax.scatter(cluster_data['Annual Income (k$)'], 
               cluster_data['Spending Score (1-100)'],
               cluster_data['Age'],
               c=colors[i % len(colors)], label=f"Cluster {i}: {segment_name}",
               s=50, alpha=0.6)

ax.set_xlabel('Annual Income (k$)', fontsize=10)
ax.set_ylabel('Spending Score (1-100)', fontsize=10)
ax.set_zlabel('Age', fontsize=10)
ax.set_title('3D Customer Segmentation', fontsize=14, fontweight='bold')
ax.legend()
plt.savefig('customer_clusters_3d.png', dpi=300, bbox_inches='tight')
# "\u2713" is the check mark; the literal had been corrupted into mojibake.
print("\u2713 3D visualization saved to 'customer_clusters_3d.png'")

# Closing summary: customer count and number of K-Means clusters, framed by
# the same 70-character rule used for the step headers.
banner = "=" * 70
print("\n" + banner)
print("ANALYSIS COMPLETE!")
print(banner)
print(f"Total Customers: {len(df)}")
print(f"Clusters Identified: {optimal_k}")
print("Recommendation Engine: Ready")
print(banner)