Data Collection and Preparation

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [4]:
# Read the customer records from the CSV file into a DataFrame.
# The path is relative, so the file is looked up in the kernel's current
# working directory; pandas raises FileNotFoundError when it is not there.
CustomerDF = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Mall_Customers.csv'

 Dataset Overview

In [None]:
# Preview the first few rows.
# Left as the cell's last expression (instead of print) so the notebook
# renders the frame with its rich table display.
CustomerDF.head()

In [None]:
# Show column names, dtypes and non-null counts.
# DataFrame.info() writes the report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
CustomerDF.info()

In [None]:
# Count the missing entries in every column and print the per-column totals.
missing_per_column = CustomerDF.isna().sum()
print(missing_per_column)

 Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# that describe() covers by default.
# Left as the cell's last expression so the notebook renders it as a table.
CustomerDF.describe()

Visualize Data Distribution

In [None]:
# Plot gender distribution
# Define a clear and cohesive color palette
color_palette = {'Male': '#3498db',  # Blue for Male
                 'Female': '#e74c3c'}  # Red for Female

# Create the countplot with custom colors
plt.figure(figsize=(6, 5))
ax = sns.countplot(x='Gender', data=CustomerDF, hue='Gender', palette=color_palette)

# Add the count on top of each bar.
# NOTE(review): depending on the seaborn version, ax.patches may also contain
# empty placeholder rectangles (zero or NaN height) when hue is set. Those are
# skipped: int(NaN) would raise ValueError and a zero-height patch would add a
# stray "0" label.
for p in ax.patches:
    bar_height = p.get_height()
    if not bar_height > 0:  # False for 0 and for NaN
        continue
    ax.annotate(f'{int(bar_height)}',
        (p.get_x() + p.get_width() / 2., bar_height),  # Anchor at the center-top of the bar
        ha='center', va='center', xytext=(0, 5), textcoords='offset points', fontsize=10, fontweight='bold')

# Add title and labels
plt.title('Gender Distribution', fontsize=14)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Histograms (with KDE overlay) of annual income and spending score, side by side.
sns.set_style("whitegrid")

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One entry per panel: (column to plot, bar color, panel title).
# The x-axis label of each panel is the column name itself.
hist_specs = (
    ('Annual Income (k$)', '#2c3e50', 'Annual Income Distribution'),
    ('Spending Score (1-100)', '#27ae60', 'Spending Score Distribution'),
)

for panel, (column, shade, heading) in zip(axes, hist_specs):
    sns.histplot(CustomerDF[column], kde=True, color=shade, ax=panel)
    panel.set_title(heading, fontsize=14)
    panel.set_xlabel(column, fontsize=12)
    panel.set_ylabel('Frequency', fontsize=12)
    panel.tick_params(axis='both', labelsize=10)

# Keep the two panels from overlapping, then render.
plt.tight_layout()
plt.show()


Clustering Preparation

In [None]:
# Features used for clustering: annual income and spending score.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = CustomerDF[feature_columns]

# Standardize both features (fit the scaler on X and transform it in one step)
# so that neither dominates the distance computation because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


K-Means Clustering

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the inertia
# (within-cluster sum of squared distances) of each fit.
cluster_counts = range(1, 11)
distortions = []
for i in cluster_counts:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto', so leaving it out makes the curve depend on the installed version
    # (and triggers a FutureWarning on some releases).
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    distortions.append(kmeans.inertia_)

# Plot the elbow graph
plt.plot(cluster_counts, distortions, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Distortion')
plt.show()


In [None]:
# Fit K-Means with 5 clusters, the elbow point read off the elbow plot.
# n_init is pinned explicitly so the segmentation does not depend on the
# scikit-learn version's default (10 in older releases, 'auto' in newer ones).
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Store each customer's cluster label as a new 'Cluster' column.
CustomerDF['Cluster'] = kmeans.fit_predict(X_scaled)

# Check cluster assignments (number of customers per cluster)
print(CustomerDF['Cluster'].value_counts())


Cluster Characteristics Analysis

1) Generate summary statistics for each cluster

In [None]:

# Select the numeric columns and make sure 'Cluster' is among them.
# include='number' covers every integer/float width (the cluster labels are not
# guaranteed to be int64), and assign() returns a new frame, so no column is
# written onto a possible view of CustomerDF (avoids SettingWithCopyWarning).
# NOTE(review): if the data carries an identifier column, its per-cluster
# statistics are not meaningful — consider dropping it here.
numeric_columns = (
    CustomerDF
    .select_dtypes(include='number')
    .assign(Cluster=CustomerDF['Cluster'])
)

# Group once and reuse the grouping for every statistic
by_cluster = numeric_columns.groupby('Cluster')

# Mean, median, min, and max of each numeric column per cluster
cluster_mean = by_cluster.mean()
print(f"cluster_mean:\n {cluster_mean}\n")

cluster_median = by_cluster.median()
print(f"cluster_median:\n {cluster_median}\n")

cluster_min = by_cluster.min()
print(f"cluster_min:\n {cluster_min}\n")

cluster_max = by_cluster.max()
print(f"cluster_max:\n {cluster_max}\n")

# Gender statistics for each cluster (share of each gender within the cluster)
gender_proportion = CustomerDF.groupby('Cluster')['Gender'].value_counts(normalize=True)
print(f"gender_proportion:\n {gender_proportion}\n")


2) Visualizations of cluster characteristics

In [None]:
# Scatter plot of the clusters: annual income vs spending score.
# Plotly Express maps a numeric color column to a continuous color scale; the
# cluster labels are categories, so a string-typed copy of 'Cluster' is plotted
# to get one discrete color and legend entry per cluster. CustomerDF itself is
# left untouched.
scatter_data = CustomerDF.assign(Cluster=CustomerDF['Cluster'].astype(str))

# Keep the legend in numeric cluster order rather than order of appearance
cluster_order = sorted(scatter_data['Cluster'].unique(), key=int)

fig = px.scatter(scatter_data,
                 x='Annual Income (k$)',
                 y='Spending Score (1-100)',
                 color='Cluster',
                 category_orders={'Cluster': cluster_order},
                 hover_data=['Age', 'Gender'],
                 title='Customer Segmentation: Annual Income vs Spending Score',
                 labels={'Annual Income (k$)': 'Annual Income (k$)', 'Spending Score (1-100)': 'Spending Score'})
fig.update_traces(marker=dict(size=10, opacity=0.8))
fig.show()


In [None]:
# Gender distribution within each cluster, one pie chart per cluster.
# Count customers per (cluster, gender); fill_value=0 keeps a cluster that
# lacks one gender from producing NaN, which ax.pie cannot draw.
gender_proportions = CustomerDF.groupby(['Cluster', 'Gender']).size().unstack(fill_value=0)
# Turn the counts into per-cluster shares (each row sums to 1)
gender_proportions = gender_proportions.div(gender_proportions.sum(axis=1), axis=0)

# Define a clear and cohesive color palette
color_palette = {'Male': '#3498db',  # Blue for Male
                 'Female': '#e74c3c'}  # Red for Female

# One subplot per cluster. squeeze=False + ravel() always yields a 1-D array
# of Axes, even when there is only a single cluster (plt.subplots would
# otherwise return a bare Axes object that cannot be indexed).
fig, axes = plt.subplots(1, len(gender_proportions), figsize=(15, 5), squeeze=False)
axes = axes.ravel()

for ax, (cluster, sizes) in zip(axes, gender_proportions.iterrows()):
    labels = sizes.index
    colors = [color_palette[label] for label in labels]  # Assign colors based on gender
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
    ax.set_title(f'Cluster {cluster}')

# Adjust layout for better appearance
plt.tight_layout()
plt.show()


In [None]:
# Box plot of customer age, one box per cluster.
age_box_options = dict(
    x='Cluster',
    y='Age',
    color='Cluster',
    title="Age Distribution by Cluster",
    labels={'Cluster': 'Cluster', 'Age': 'Age'},
    # Pastel qualitative palette so the boxes are easy to tell apart
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig = px.box(CustomerDF, **age_box_options)

# Make the point markers slightly transparent
fig.update_traces(marker={'opacity': 0.7})
fig.show()


In [None]:
# Violin plots of spending score and annual income per cluster.
# seaborn and matplotlib are already imported in the notebook's import cell,
# so they are not re-imported here.

# Only switch the grid style, as in the histogram cell. sns.set(...) would
# reset the whole seaborn theme (palette, fonts, context) mid-notebook.
sns.set_style("whitegrid")

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One entry per panel: (column to plot, panel title, seaborn palette name).
# The y-axis label of each panel is the column name itself.
violin_panels = [
    ('Spending Score (1-100)', "Spending Score Distribution by Cluster", "muted"),
    ('Annual Income (k$)', "Annual Income Distribution by Cluster", "pastel"),
]

for ax, (column, title, palette) in zip(axes, violin_panels):
    sns.violinplot(data=CustomerDF, x='Cluster', y=column, hue='Cluster', palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Cluster")
    ax.set_ylabel(column)

plt.tight_layout()
plt.show()


Cluster Profiling and Insights

Cluster-Specific Recommendations


General Recommendations
