In [13]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd 

In [4]:
df = pd.read_csv('data/user_personalized_features.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,User_ID,Age,Gender,Location,Income,Interests,Last_Login_Days_Ago,Purchase_Frequency,Average_Order_Value,Total_Spending,Product_Category_Preference,Time_Spent_on_Site_Minutes,Pages_Viewed,Newsletter_Subscription
0,0,#1,56,Male,Suburban,38037,Sports,5,7,18,2546,Books,584,38,True
1,1,#2,46,Female,Rural,103986,Technology,15,7,118,320,Electronics,432,40,False
2,2,#3,32,Female,Suburban,101942,Sports,28,1,146,3766,Apparel,306,1,True
3,3,#4,60,Female,Suburban,71612,Fashion,18,3,163,4377,Apparel,527,29,False
4,4,#5,25,Male,Suburban,49725,Travel,2,5,141,4502,Health & Beauty,53,10,True
5,5,#6,38,Male,Suburban,25926,Travel,22,8,25,2669,Books,520,28,False
6,6,#7,56,Male,Urban,124555,Fashion,11,7,152,1113,Home & Kitchen,330,31,True
7,7,#8,36,Male,Urban,29496,Technology,26,7,108,2151,Apparel,558,19,True
8,8,#9,40,Male,Rural,76447,Travel,25,6,130,4085,Books,325,3,True
9,9,#10,28,Male,Urban,121604,Food,13,8,61,4999,Electronics,114,36,False


In [9]:
df = df.drop(columns = ['Unnamed: 0'], axis = 1, inplace= True)


KeyError: "['Unnamed: 0'] not found in axis"

In [11]:
df.columns

Index(['User_ID', 'Age', 'Gender', 'Location', 'Income', 'Interests',
       'Last_Login_Days_Ago', 'Purchase_Frequency', 'Average_Order_Value',
       'Total_Spending', 'Product_Category_Preference',
       'Time_Spent_on_Site_Minutes', 'Pages_Viewed',
       'Newsletter_Subscription'],
      dtype='object')

In [10]:
cat_col = df.select_dtypes(include=['object']).columns
num_col = df.select_dtypes(exclude=['object']).columns

label_encoder = {}
for col in cat_col:
    label_encoder[col] = LabelEncoder()
    df[col] = label_encoder[col].fit_transform(df[col])

scaler = StandardScaler()
df[num_col] = scaler.fit_transform(df[num_col])


In [16]:
# Apply KMeans clustering to find customer segments based on their features
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(df.drop(columns=['Product_Category_Preference']))

# Assign the cluster labels to the dataframe
df['Cluster'] = cluster_labels

# Analyze the preferred product category for each cluster
preferred_category_per_cluster = df.groupby('Cluster')['Product_Category_Preference'].agg(lambda x: x.value_counts().idxmax())
print("Preferred product category for each cluster:")
print(preferred_category_per_cluster)


Preferred product category for each cluster:
Cluster
0    2
1    0
2    2
3    0
Name: Product_Category_Preference, dtype: int64


In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Exclude the target column for clustering evaluation
X_cluster = df.drop(columns=['Product_Category_Preference', 'Cluster'])

# Silhouette Score
sil_score = silhouette_score(X_cluster, df['Cluster'])
print(f"Silhouette Score: {sil_score:.4f}")

# Davies-Bouldin Index
dbi_score = davies_bouldin_score(X_cluster, df['Cluster'])
print(f"Davies-Bouldin Index: {dbi_score:.4f}")

Silhouette Score: 0.5686
Davies-Bouldin Index: 0.5018
