---Import necessary libraries

In [None]:
import pandas as pd #Data manipulation and analysis
import seaborn as sns #For data visualization
import matplotlib.pyplot as plt #For plotting graphs
from sklearn.cluster import KMeans #For building the KMeans clustering model
from sklearn.preprocessing import StandardScaler #For feature scaling

---Load dataset

In [None]:
#Load the mall customers dataset from GitHub
#NOTE(review): later cells read the columns 'Age', 'Annual Income (k$)' and
#'Spending Score (1-100)' - confirm the CSV at this URL provides those headers
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/mall_customers.csv'
df = pd.read_csv(url) #reads dataset into a dataframe

#Display the first few rows of the dataset
print("First few rows of the dataset: ")
print(df.head())

---Explore the dataset

In [None]:
#Summary statistics of the dataset
print("\nSummary statistics:")
print(df.describe())

#Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

#Visualize the distribution of Age, Annual Income and Spending Score
#One row of three panels; the explicit figure size keeps each histogram readable
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
distribution_panels = [
    ('Age', 'Age Distribution'),
    ('Annual Income (k$)', 'Annual Income Distribution'),
    ('Spending Score (1-100)', 'Spending Score Distribution'),
]
for ax, (column, title) in zip(axes, distribution_panels):
    #kde=True overlays a kernel density estimate on the histogram
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

---Feature engineering

In [None]:
#We will use 'Annual Income (k$)' and 'Spending Score (1-100)' for clustering
features = ['Annual Income (k$)', 'Spending Score (1-100)']
X = df[features]

#Standardize the features (zero mean, unit variance) so that neither column
#dominates the distance computations used by K-means
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) #name must match the X_scaled used by the clustering cells

---Determine the optimal number of clusters

In [None]:
#Use the elbow method to find the optimal number of clusters:
#fit K-means for k = 1..10 and record the inertia of each fit
inertia = []
K = range(1, 11)
for k in K:
    #n_init is set explicitly so results do not depend on the scikit-learn version's default
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

#Plot the elbow curve; the "elbow" is where adding clusters stops reducing inertia sharply
plt.figure(figsize=(8,5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal K')
plt.show()

In [None]:
---Build and train the K-means Model

In [None]:
#From the elbow method let us choose k=5
#n_init is set explicitly so results do not depend on the scikit-learn version's default
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

#Add the cluster label of each customer to the dataframe
df['Cluster'] = y_kmeans

---Visualize the clusters

In [None]:
#Plot the clusters: one point per customer, coloured by assigned cluster
plt.figure(figsize=(10,6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
    s=100,
    alpha=0.7,
)
plt.title('Customer Segments')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
#Re-creating the legend drops seaborn's hue title, so set it explicitly
plt.legend(title='Cluster')
plt.show()

---Analyze the characteristics of each cluster

In [None]:
#Analyze the characteristics of each cluster:
#mean of every clustering feature within each cluster
#(the selection and aggregation must stay on the same expression as groupby)
cluster_summary = df.groupby('Cluster')[features].mean()
print("\nCluster characteristics:")
print(cluster_summary)