In [135]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

from models.naive_model import NaiveClustering


Import needed libraries and models

In [136]:
# Utils functions
def load_data(file_path):
    return pd.read_csv(file_path)

Clean the Dataset

In [137]:
df = load_data("segmented_customers.csv")
df = df.drop(["CustomerID"], axis=1)

# Scale the numerical columns
scaler = StandardScaler()
numerical_columns = ["Age", "Annual_Income", "Spending_Score"]
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])


### Sklearn Model

In [138]:
# Perform hierarchical clustering
model = AgglomerativeClustering(n_clusters=5)
df['cluster_sklearn'] = model.fit_predict(df)

In [139]:
df.head()

Unnamed: 0,Gender,Age,Annual_Income,Spending_Score,cluster,cluster_sklearn
0,1,-1.424569,-1.738999,-0.434801,3,0
1,1,-1.281035,-1.738999,1.195704,4,3
2,0,-1.352802,-1.70083,-1.715913,3,0
3,0,-1.137502,-1.70083,1.040418,4,3
4,0,-0.563369,-1.66266,-0.39598,3,0


In [140]:
# Calculate the silhouette score
score = silhouette_score(df, df['cluster_sklearn'])
print(score)

0.5476242034629109


### Naive Model

In [141]:
# Perform hierarchical clustering
model = NaiveClustering(n_clusters=5)
df["cluster_naive"] = model.fit_predict(df.values)

In [142]:
df.head()

Unnamed: 0,Gender,Age,Annual_Income,Spending_Score,cluster,cluster_sklearn,cluster_naive
0,1,-1.424569,-1.738999,-0.434801,3,0,0
1,1,-1.281035,-1.738999,1.195704,4,3,7
2,0,-1.352802,-1.70083,-1.715913,3,0,0
3,0,-1.137502,-1.70083,1.040418,4,3,7
4,0,-0.563369,-1.66266,-0.39598,3,0,0


In [143]:
# Calculate the silhouette score
score = silhouette_score(df, df['cluster_naive'])
print(score)

0.7497501015197205


### Queue Model

In [144]:
# Perform hierarchical clustering
model = NaiveClustering(n_clusters=5)
df["cluster_queue"] = model.fit_predict(df.values)

In [145]:
df.head()

Unnamed: 0,Gender,Age,Annual_Income,Spending_Score,cluster,cluster_sklearn,cluster_naive,cluster_queue
0,1,-1.424569,-1.738999,-0.434801,3,0,0,0
1,1,-1.281035,-1.738999,1.195704,4,3,7,7
2,0,-1.352802,-1.70083,-1.715913,3,0,0,0
3,0,-1.137502,-1.70083,1.040418,4,3,7,7
4,0,-0.563369,-1.66266,-0.39598,3,0,0,0


In [146]:
#silouette score
silhouette_score(df, df['cluster_queue'])

0.7782526665715522

### Unit Testing

In [88]:
!python3 -m pytest

platform win32 -- Python 3.10.9, pytest-7.2.1, pluggy-1.0.0
rootdir: C:\Users\Vlad\PycharmProjects\CulsteringAlgorithm
plugins: anyio-3.6.2
collected 12 items

tests\unit\test_naive_implementation.py .....                            [ 41%]
tests\unit\test_queue_implementation.py .......                          [100%]



### Performance

In [106]:
# Sklearn
score = silhouette_score(df, df['cluster2'])
print(score)

0.5476242034629109


In [108]:
# Naive Aproach
score = silhouette_score(df_new, df_new['cluster3'])
print(score)

0.7497501015197205


In [110]:
# Queue
score = silhouette_score(df_performance, df_performance['cluster4'])
print(score)

0.7497501015197205
