In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Read the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='path_to_dataset.csv')

# Preview the top five rows to sanity-check columns and values.
df.head(n=5)

In [None]:
# Summary Statistics
# Only the final expression of a cell is rendered, so intermediate tables are
# shown explicitly with display() (available by default in Jupyter kernels).
display(df.describe())

# Check for missing values
display(df.isnull().sum())

# Restrict the plots to numeric feature columns:
#  - 'customer_id' is excluded, matching the scaling step, which also drops it;
#  - 'Cluster' / 'PCA1' / 'PCA2' are added to df by a later cell, so they are
#    dropped when present to keep this cell stable when re-run;
#  - non-numeric columns are removed because DataFrame.corr() raises on them
#    in pandas >= 2.0.
eda_df = (
    df.drop(columns=['customer_id', 'Cluster', 'PCA1', 'PCA2'], errors='ignore')
      .select_dtypes(include='number')
)

# Visualize the distribution of each feature
sns.pairplot(eda_df)
plt.show()

# Plot Correlation Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(eda_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.show()

In [None]:
# Standardizing the features
# The identifier is not a feature and is always dropped (a missing
# 'customer_id' column still raises, as before). 'Cluster', 'PCA1' and 'PCA2'
# are written back onto df by the clustering cell; they are dropped when
# present so that re-running this cell never feeds previous cluster labels or
# projections into the scaler.
feature_frame = (
    df.drop(columns=['customer_id'])
      .drop(columns=['Cluster', 'PCA1', 'PCA2'], errors='ignore')
)

# Zero-mean / unit-variance scaling so no single feature dominates the
# Euclidean distances used by K-Means.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(feature_frame)

# Check the scaled data
df_scaled[:5]

In [None]:
# Applying KMeans clustering
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 restarts vs. 'auto'), so leaving it implicit makes the segments
# depend on the installed version and emits a FutureWarning on some releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(df_scaled)

# Visualize the clusters in 2D using PCA
# random_state keeps the projection reproducible if a randomized SVD solver
# is selected.
pca = PCA(n_components=2, random_state=42)
pca_components = pca.fit_transform(df_scaled)
df['PCA1'] = pca_components[:, 0]
df['PCA2'] = pca_components[:, 1]

# Share of total variance captured by each plotted component, shown on the
# axes so the reader can judge how faithful the 2D view is.
explained = pca.explained_variance_ratio_

# Plotting the clusters in 2D space
plt.figure(figsize=(10, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=df, palette='Set2', s=100)
plt.xlabel(f'PCA1 ({explained[0]:.1%} of variance)')
plt.ylabel(f'PCA2 ({explained[1]:.1%} of variance)')
plt.title('Customer Segments Identified using K-Means')
plt.show()

In [None]:
# Evaluate cluster separation: mean silhouette coefficient of the scaled
# samples under the labels stored in df['Cluster'].
sil_score = silhouette_score(X=df_scaled, labels=df['Cluster'])
print(f"Silhouette Score: {sil_score}")

In [None]:
# Analyzing the clusters
# Per-cluster mean of every numeric column. numeric_only=True avoids a
# TypeError on string columns in pandas >= 2.0, and display() is needed
# because only a cell's final expression is rendered automatically.
cluster_profile = df.groupby('Cluster').mean(numeric_only=True)
display(cluster_profile)

# Visualize feature distribution across clusters
# Each feature gets its own Axes; calling sns.boxplot repeatedly without one
# draws every plot onto the same current Axes, overlaying them.
profile_features = [
    'total_purchases',
    'avg_cart_value',
    'total_time_spent',
    'product_click',
    'discount_count',
]
fig, axes = plt.subplots(
    nrows=1,
    ncols=len(profile_features),
    figsize=(4 * len(profile_features), 4),
)
for ax, feature in zip(axes, profile_features):
    sns.boxplot(x='Cluster', y=feature, data=df, ax=ax)
    ax.set_title(feature)
fig.tight_layout()
plt.show()