In [None]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.cluster import KMeans
print("Setup Complete")

In [None]:
# 1: Load the customer segmentation dataset.
# The first CSV column is used as the DataFrame's row index.
path = "/kaggle/input/customer-segmentation/Customer_Segmentation_Dataset.csv"
data = pd.read_csv(filepath_or_buffer=path, index_col=0)

# Optional deeper inspection (left disabled): data.info(), data.describe()
# Preview the first rows as the cell output.
data.head()

In [None]:
# 2.1: Explore correlations between features
# Correlation measures the strength of relationships between numerical features.

# Keep only numeric columns before correlating: since pandas 2.0,
# DataFrame.corr() raises on non-numeric (e.g. string) columns instead of
# silently dropping them.
corr_mat = data.select_dtypes(include="number").corr()

# Visualize the correlation matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(14, 6))  # figure size: 14x6 inches
sns.heatmap(corr_mat, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation matrix")
plt.show()

The correlation matrix shows little to no correlation between any two features.

In [None]:
# 2.2: Identify high-spending categories

# Columns holding the per-category spend amounts.
spending_columns = [
    "Groceries_Spend",
    "Electronics_Spend",
    "Clothing_Spend",
    "Dining_Spend",
]

# Column-wise sum: one total per spending category.
total_spending = data[spending_columns].sum()

# Bar chart of the category totals, one colour per bar.
ax = total_spending.plot.bar(
    title="Total Spending by category",
    color=['red', 'green', 'blue', 'brown'],
)
ax.set_ylabel("Total Spending (USD)")
plt.show()

The bar plot above shows that total spending by category ranks in the following order: Electronics > Groceries > Dining > Clothing.

In [None]:
# Income vs. spending analysis:
# scatter each row by annual income and spending score, coloured by age.
ax = sns.scatterplot(
    x="Annual_Income",
    y="Spending_Score",
    hue="Age",
    palette="viridis",
    data=data,
)
ax.set_title("Annual income vs spending score")
plt.show()

This plot shows no apparent relationship between annual income and spending score.

In [None]:
# Distribution of features
# Histogram of age: x-axis -> age, y-axis -> number of people in each age bin.
ax = sns.histplot(data["Age"], bins=10, kde=True, color="skyblue")
ax.set(title="Age distribution", xlabel="Age", ylabel="Frequency")
plt.show()

In [None]:
# 3) K-means clustering
# Rows are grouped into 3 clusters based on the per-category spending
# columns listed in `spending_columns`.

# n_init is set explicitly: scikit-learn changed its default from 10 to
# "auto" in version 1.4 (and warns about the upcoming change in 1.2-1.3),
# so pinning it keeps the clustering consistent across library versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
data["cluster"] = kmeans.fit_predict(data[spending_columns])

# Show the spending-based clusters on the income / spending-score plane.
ax = sns.scatterplot(
    data=data,
    x="Annual_Income",
    y="Spending_Score",
    hue="cluster",
    palette="deep",
)
ax.set(
    title="Customer Segmentation",
    xlabel="Annual Income",
    ylabel="Spending Score",
)
plt.show()

In [None]:
# 4) Implement PCA

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Step 1: Select the columns to feed into PCA

numerical_column = ["Annual_Income","Spending_Score","Groceries_Spend","Electronics_Spend","Clothing_Spend","Dining_Spend"]
data_subset = data[numerical_column]

# Step 2: Standardize the features, since PCA is sensitive to feature magnitude

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_subset)

# Step 3: Apply PCA

pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
pca_components = pca.fit_transform(data_scaled)

# Step 4: Create a new DataFrame with the PCA components
# Reuse the source rows' index. pandas aligns a Series assigned to a column
# on index labels, so with a default 0..n-1 index any row whose label differs
# from `data`'s index would receive the wrong cluster or NaN.
pca_df = pd.DataFrame(
    data=pca_components,
    columns=['PC1', 'PC2'],
    index=data_subset.index,
)
pca_df["Cluster"] = data["cluster"]

# Step 5: Visualize the two components, coloured by cluster

fig, ax = plt.subplots(figsize=(14, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Cluster", palette="deep", s=100, ax=ax)
ax.set_title("PCA visualization of customer data")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
plt.show()