In [3]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os

# Set up assets folder for GitHub
os.makedirs("assets", exist_ok=True)

# Load dataset
file_path = r"Mall_Customers.csv"  # Use relative path for GitHub
df = pd.read_csv(file_path)

# Preview dataset
print("Dataset Preview:\n", df.head())
print("\nMissing Values:\n", df.isnull().sum())

# Select features for clustering
X = df[['Annual Income (k$)', 'Spending Score (1-100)']]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow Method to determine optimal clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot Elbow curve and save it
plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), wcss, marker='o', linestyle='--')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.grid(True)
plt.savefig("assets/elbow_plot.png", dpi=300)
plt.show()

# Apply KMeans with optimal number of clusters (assume k=5)
kmeans = KMeans(n_clusters=5, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Plot clusters and save the plot
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)', hue='Cluster', palette='Set1', s=100)
plt.title('Customer Segments')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig("assets/cluster_plot.png", dpi=300)
plt.show()

# Print customer count per cluster
print("\nCustomer Count per Cluster:\n", df['Cluster'].value_counts())

# Save clustered data to CSV for GitHub asset tracking
df.to_csv("assets/clustered_customers.csv", index=False)


FileNotFoundError: [Errno 2] No such file or directory: 'Mall_Customers.csv'