In [1]:
import inspect
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the customer table. The reader is chosen from the file extension:
# the original call handed an ".xlsx" workbook to the CSV parser.
# NOTE(review): the file name ends in "legend.xlsx", which looks like a
# description sheet rather than the data table whose columns (ID, Age,
# Income, ...) the rest of the notebook uses -- confirm the path.
data_path = 'segmentation data legend.xlsx'
is_workbook = data_path.lower().endswith(('.xlsx', '.xls'))
df = pd.read_excel(data_path) if is_workbook else pd.read_csv(data_path)
df.head(10)

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Remove the ID column before plotting and scaling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['ID'], errors='ignore')

In [None]:
# Box plots of Age and Income, side by side in the top row of a 2x2 grid.
box_fig = plt.figure(figsize=(21, 15))

age_ax = plt.subplot2grid((2, 2), (0, 0), fig=box_fig)
box1 = sns.boxplot(y=df["Age"], ax=age_ax)
age_ax.set_title("Age")

income_ax = plt.subplot2grid((2, 2), (0, 1), fig=box_fig)
box2 = sns.boxplot(y=df["Income"], ax=income_ax)
income_ax.set_title("Income")

plt.show()

In [None]:
# Summary statistics for the Age column.
df["Age"].describe()

In [None]:
# Summary statistics for the Income column.
df["Income"].describe()


In [None]:
# Proportion histograms of the coded columns, laid out on a 3x3 grid.
# Values are cast to str so each code gets its own bar; the last three
# columns are sorted first, the first two are plotted in row order.
plt.figure(figsize=(21, 15))

hist_panels = [
    ((0, 0), df["Sex"].astype(str)),
    ((0, 1), df["Marital status"].astype(str)),
    ((0, 2), df["Education"].astype(str).sort_values()),
    ((1, 0), df["Occupation"].astype(str).sort_values()),
    ((1, 1), df["Settlement size"].astype(str).sort_values()),
]
for grid_cell, codes in hist_panels:
    plt.subplot2grid((3, 3), grid_cell)
    sns.histplot(codes, stat='proportion')

plt.show()

In [None]:
# Silence library warnings for the rest of the session (kept from the
# original cell; note this is a blanket filter and also hides deprecations).
warnings.filterwarnings('ignore')


# Make the plots
# The original cell indexed an `axs` array that was never created and passed
# it to sns.displot, which is figure-level and cannot draw into an existing
# axes. Build the 3x2 grid explicitly and use the axes-level sns.kdeplot.
kde_columns = ['Marital status', 'Age', 'Education', 'Income', 'Occupation']
# 4.5 x 3 inches per panel, matching the original height=3, aspect=1.5.
fig, axs = plt.subplots(3, 2, figsize=(9, 9))
for kde_ax, column in zip(axs.flat, kde_columns):
    sns.kdeplot(data=df, x=column, hue="Sex", fill=True, ax=kde_ax)

# Five variables on a six-slot grid: hide the unused trailing panel(s).
for spare_ax in axs.flat[len(kde_columns):]:
    spare_ax.set_visible(False)
fig.tight_layout()

# Display the plots
plt.show()

In [None]:
# Standardise every column of df with StandardScaler and keep the result as
# a DataFrame (`scaled`) carrying the original column names.
col_names = df.columns

scaler = StandardScaler()
features = scaler.fit_transform(df[col_names].values)

scaled = pd.DataFrame(features, columns=col_names)
scaled.head()

In [None]:
# Keep only the two scaled columns used as KMeans input.
data = scaled.loc[:, ['Age', 'Income']]
data.head()

In [None]:
# Fit KMeans for k = 1..10 on `data` and record the inertia of each fit.
cluster_counts = range(1, 11)
wcss = {'wcss_score': [], 'no_of_clusters': []}
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, random_state=0).fit(data)
    wcss['no_of_clusters'].append(i)
    wcss['wcss_score'].append(kmeans.inertia_)

wcss_df = pd.DataFrame(wcss)
wcss_df.head(15)

In [None]:
# Plot inertia against the number of clusters.
elbow_fig, elbow_ax = plt.subplots(figsize=(14, 10))
elbow_ax.plot(wcss_df["no_of_clusters"], wcss_df["wcss_score"], marker='o')
elbow_ax.set_title("Elbow Method to determine number of clusters(K)")
plt.show()

In [None]:
# Fit the 4-cluster model on the scaled Age/Income data and show the estimator.
kmeans = KMeans(n_clusters=4, random_state=0).fit(data)
kmeans

In [None]:
# Cluster label for every row of `data`.
prediction = kmeans.fit_predict(data)

# Copy of df with the labels attached as a new column.
clustered_data = df.assign(cluster_index=prediction)

In [None]:
# Display the frame that now carries the cluster_index column.
clustered_data

In [None]:
# Age vs. Income, coloured by assigned cluster.
sns.scatterplot(
    data=clustered_data,
    x="Age",
    y="Income",
    hue="cluster_index",
    palette="deep",
)

In [None]:
# Score the partition on `data`, the scaled Age/Income matrix KMeans was
# fitted on. The original passed the raw, unscaled `df` with every column,
# so distances were measured in a different space from the one that was
# clustered. The hard-coded "~ 0" suffix is dropped from the message because
# it asserted a result instead of reporting the computed one.
score = silhouette_score(data, clustered_data["cluster_index"], random_state=0)
print(f"Silhouette score: {score:0.3f}")

In [None]:
def s_score(distance, linkage):
    """Silhouette score of a 4-cluster agglomerative clustering of `scaled`.

    Parameters
    ----------
    distance : str
        Distance metric handed to AgglomerativeClustering.
    linkage : str
        Linkage criterion handed to AgglomerativeClustering.

    Returns
    -------
    float
        silhouette_score of the resulting labels on the notebook-level
        `scaled` frame.

    Errors raised by scikit-learn (for example for an unsupported
    distance/linkage combination) propagate to the caller.
    """
    # scikit-learn 1.2 renamed the `affinity` argument to `metric` and 1.4
    # removed the old name; use whichever one the installed version accepts.
    accepted = inspect.signature(AgglomerativeClustering).parameters
    distance_kwarg = "metric" if "metric" in accepted else "affinity"

    agc = AgglomerativeClustering(
        n_clusters=4, linkage=linkage, **{distance_kwarg: distance}
    )
    labels = agc.fit_predict(scaled)
    return silhouette_score(scaled, labels, random_state=0)

In [None]:
# Distance metrics tried by the scoring loop (first argument of s_score).
distances = ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
# Linkage criteria tried by the scoring loop (second argument of s_score).
linkages = ['ward', 'complete', 'average', 'single']

In [None]:
# Score every distance/linkage pair. Combinations scikit-learn rejects are
# kept in the table with a NaN score.
scoring = {'dist': [], 'link': [], 'sScore': []}
for dist_name in distances:
    for link_name in linkages:
        try:
            score = s_score(dist_name, link_name)
        except ValueError:
            # Only the "invalid combination" error is expected here. The
            # original bare `except:` also hid unrelated failures (API
            # TypeErrors, KeyboardInterrupt) and turned them into NaN rows.
            score = np.nan
        scoring['dist'].append(dist_name)
        scoring['link'].append(link_name)
        scoring['sScore'].append(score)
scoringDf = pd.DataFrame(scoring)

In [None]:
# Discard the rows whose score is NaN.
scoringDf = scoringDf.dropna(axis="index")


In [None]:
# Row(s) with the highest silhouette score.
# Series.max() skips NaN and returns NaN for an empty frame, whereas the
# builtin max() raised ValueError on an empty frame and gave an
# order-dependent answer when NaN rows were still present.
best_score = scoringDf['sScore'].max()
final_result = scoringDf[scoringDf['sScore'] == best_score]
final_result

In [None]:
# Cluster the scaled frame with DBSCAN and report the silhouette score of
# the resulting labels.
dbs = DBSCAN(eps=2, min_samples=10).fit(scaled)
dbscan_labels = dbs.labels_
score = silhouette_score(scaled, dbscan_labels, random_state=0)
print(f"Silhouette score: {score:0.3f}")