# Customer Segmentation using K-Means Clustering

# In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read the raw customer records from disk into a DataFrame.
csv_path = '../data/prem_data_2.csv'
df = pd.read_csv(csv_path)
print('Dataset Loaded Successfully ✅')
# Preview the first rows (shown as the cell's output when run in a notebook).
df.head()

# In [ ]:
# Inspect the loaded data: column summary and per-column missing-value counts.
print('\nDataset Info:')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print('\nMissing Values:\n', df.isnull().sum())

# Drop every row that has a missing value in any column.
df.dropna(inplace=True)

# Normalise column names: trim whitespace, lower-case, spaces -> underscores.
df.rename(columns=lambda x: x.strip().lower().replace(' ', '_'), inplace=True)
cols = df.columns
print('\nAvailable Columns:', cols)

# Columns used for clustering; a KeyError here means the normalised column
# names in the CSV do not match these two names.
features = df[['annual_income', 'spending_score']].copy()

# Standardise both features so neither dominates the distance computation.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Elbow method: fit K-Means for each candidate k and record the inertia.
# The same k_values drives both the loop and the plot's x-axis.
k_values = range(1, 11)
inertia = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests a suitable k.
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# In [ ]:
# Fit the final 5-cluster model on the scaled features and store each row's
# cluster label in a new 'Cluster' column.
kmeans = KMeans(n_clusters=5, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_features)

# Scatter plot of the segments in the original (unscaled) feature space.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='annual_income', y='spending_score', hue='Cluster', data=df, palette='viridis', s=80)
plt.title('Customer Segments based on Income and Spending')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
# plt.legend() rebuilds the legend, so the title is passed explicitly.
plt.legend(title='Cluster')
plt.show()

# Mean income and spending score per cluster.
cluster_summary = df.groupby('Cluster')[['annual_income', 'spending_score']].mean()
print('\nCluster Summary:')
print(cluster_summary)

# Persist the labelled rows (including the 'Cluster' column) without the index.
df.to_csv('../data/segmented_customers.csv', index=False)
print('\nSegmented customer data saved successfully ✅')