In [2]:
import pandas as pd

df = pd.read_parquet('Final.parquet')

In [4]:
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

df_small = df.sample(1000)
df_small['CustomerID'] = df_small['CustomerID'].astype(int)

# High cardinality features
high_card_cols = ['City', 'ProductGroup', 'ProductSubgroup', 'PostalCode']
high_card_data = df_small[high_card_cols].astype(str).values.tolist()
hasher = FeatureHasher(n_features=10, input_type='string')
hashed_features = hasher.fit_transform(high_card_data)

# Convert hashed features to a DataFrame
hashed_features_df = pd.DataFrame(hashed_features.toarray(), columns=[f'hash_{i}' for i in range(10)])

# One-hot encoding for other categorical variables
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(df_small[['SalesChannel', 'IndustryGroup', 'Type', 'BusinessArea', 'Company', 'RevenueType']])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Reset the indices of all DataFrames before concatenation
df_small_reset = df_small.drop(['City', 'ProductGroup', 'ProductSubgroup', 'PostalCode', 'SalesChannel', 'IndustryGroup', 'Type', 'BusinessArea', 'Company', 'RevenueType'], axis=1).reset_index(drop=True)
hashed_features_df_reset = hashed_features_df.reset_index(drop=True)
encoded_df_reset = encoded_df.reset_index(drop=True)

# Combine all features
df2 = pd.concat([df_small_reset, hashed_features_df_reset, encoded_df_reset], axis=1)

# Scale features
scaler = StandardScaler()
scaled_array = scaler.fit_transform(df2)

df_scaled = pd.DataFrame(scaled_array, columns=df2.columns)

# K-Means Clustering
for i in range(2, 21):
    kmeans = KMeans(n_clusters=i, random_state=42)
    clusters = kmeans.fit_predict(df_scaled)
    score = silhouette_score(df_scaled, clusters)
    print(f"Silhouette Score for {i} clusters: {score}")


Silhouette Score for 2 clusters: 0.1043463460029593
Silhouette Score for 3 clusters: 0.08965376449771789
Silhouette Score for 4 clusters: 0.09292736539942113
Silhouette Score for 5 clusters: 0.0958182592748604
Silhouette Score for 6 clusters: 0.09101404692697283
Silhouette Score for 7 clusters: 0.07944137581989745
Silhouette Score for 8 clusters: 0.09473903391326531
Silhouette Score for 9 clusters: 0.10274226868338443
Silhouette Score for 10 clusters: 0.11979052250030697
Silhouette Score for 11 clusters: 0.1273571167583921
Silhouette Score for 12 clusters: 0.1409216013046602
Silhouette Score for 13 clusters: 0.14658779656694498
Silhouette Score for 14 clusters: 0.1414310472657641
Silhouette Score for 15 clusters: 0.14518495084748148
Silhouette Score for 16 clusters: 0.14840862076387704
Silhouette Score for 17 clusters: 0.15869379201211603
Silhouette Score for 18 clusters: 0.16044275584941964
Silhouette Score for 19 clusters: 0.16257803898791917
Silhouette Score for 20 clusters: 0.16693