In [3]:
# 📦 Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.impute import SimpleImputer
from scipy.cluster.hierarchy import dendrogram, linkage

# 📥 Load the dataset
file_path = 'Marketing_Campaign_Dataset.csv'  # Replace with your dataset path
df = pd.read_csv(file_path)

# 🧹 Data cleaning and feature engineering
# ✅ Handle missing values
imputer = SimpleImputer(strategy='median')
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Convert 'Dt_Customer' to datetime and calculate 'Customer_Tenure'
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], errors='coerce')
df['Customer_Tenure'] = (pd.to_datetime('today') - df['Dt_Customer']).dt.days.fillna(0)

# Remove outliers based on 'Income'
df = df[df['Income'] < df['Income'].quantile(0.99)]

# Create new features
df['Family_Size'] = df['Kidhome'] + df['Teenhome']
spend_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 
              'MntSweetProducts', 'MntGoldProds']
df['Total_Spend'] = df[spend_cols].sum(axis=1)

# Drop irrelevant columns
drop_cols = ['ID', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue']
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

# 🔄 Encode categorical features
categorical_cols = ['Education', 'Marital_Status']
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols), index=df.index)
df = pd.concat([df.drop(categorical_cols, axis=1), encoded_df], axis=1)

# 🔢 Normalize and reduce dimensions
scaler = StandardScaler()
scaled = scaler.fit_transform(df)

# 🧭 PCA for dimensionality reduction
pca = PCA(n_components=2)
reduced = pca.fit_transform(scaled)

# 🧭 Apply clustering models
## K-Means
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans_labels = kmeans.fit_predict(reduced)

## Hierarchical Clustering
hierarchical = AgglomerativeClustering(n_clusters=4)
hierarchical_labels = hierarchical.fit_predict(reduced)

## DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(reduced)

# 📊 Visualization
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.scatter(reduced[:, 0], reduced[:, 1], c=kmeans_labels, cmap='viridis')
plt.title("K-Means Clustering")
plt.xlabel('PC1')
plt.ylabel('PC2')

plt.subplot(1, 3, 2)
plt.scatter(reduced[:, 0], reduced[:, 1], c=hierarchical_labels, cmap='plasma')
plt.title("Hierarchical Clustering")
plt.xlabel('PC1')
plt.ylabel('PC2')

plt.subplot(1, 3, 3)
plt.scatter(reduced[:, 0], reduced[:, 1], c=dbscan_labels, cmap='coolwarm')
plt.title("DBSCAN Clustering")
plt.xlabel('PC1')
plt.ylabel('PC2')

plt.tight_layout()
plt.show()

# 📈 Evaluate clustering performance
print(f"K-Means Silhouette Score: {silhouette_score(reduced, kmeans_labels):.3f}")
print(f"Hierarchical Clustering Silhouette Score: {silhouette_score(reduced, hierarchical_labels):.3f}")

# 📂 Data overview
display(df.head())

# ✅ Steps to Run:
# 1. Save the code into a Jupyter Notebook cell and execute it step-by-step.
# 2. Replace 'Marketing_Campaign_Dataset.csv' with your correct dataset path if needed.
# 3. Ensure all necessary libraries are installed (`pip install pandas scikit-learn matplotlib seaborn`).
# 4. Visualizations and silhouette scores will help assess cluster quality.

# 🚀 Let me know if you need help with dashboard deployment or additional analyses!


MemoryError: Unable to allocate 36.7 GiB for an array with shape (4927812450,) and data type float64