In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
df = pd.read_csv('../../Datasets/Mall_customers.csv')

In [7]:
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [8]:
df.drop(['CustomerID'], axis=1, inplace=True)

In [9]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

In [11]:
ohet = ohe.fit_transform(df[['Gender']])

In [12]:
ohet

Unnamed: 0,Gender_Female,Gender_Male
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
195,1.0,0.0
196,1.0,0.0
197,0.0,1.0
198,0.0,1.0


In [13]:
data = pd.concat([ohet, df.drop(['Gender'], axis=1)], axis=1)

In [14]:
data.head()

Unnamed: 0,Gender_Female,Gender_Male,Age,Annual Income (k$),Spending Score (1-100)
0,0.0,1.0,19,15,39
1,0.0,1.0,21,15,81
2,1.0,0.0,20,16,6
3,1.0,0.0,23,16,77
4,1.0,0.0,31,17,40


In [15]:
data_scaled = StandardScaler().fit_transform(data)

In [17]:
km = KMeans(n_clusters=4, init='k-means++', n_init=10)

In [18]:
clusters = km.fit_predict(data_scaled)



In [19]:
km.cluster_centers_

array([[-1.12815215,  1.12815215, -0.76072691,  0.05496398,  0.83369302],
       [ 0.88640526, -0.88640526, -0.74719196, -0.03409802,  0.67876434],
       [ 0.88640526, -0.88640526,  0.6644943 , -0.06648421, -0.59855788],
       [-1.12815215,  1.12815215,  0.75982983,  0.07086791, -0.81492926]])

In [20]:
gmm = GaussianMixture(n_components=4, covariance_type='full', means_init=km.cluster_centers_, random_state=42)

In [21]:
labels = gmm.fit_predict(data_scaled)



In [23]:
probs = gmm.predict_proba(data_scaled)

In [26]:
log_probs = gmm.score_samples(data_scaled)

In [27]:
threshold = np.percentile(log_probs, 5)

In [28]:
anomalies = data_scaled[log_probs < threshold]

In [29]:
anomalies

array([[ 0.88640526, -0.88640526, -1.3528021 , -1.70082976, -1.71591298],
       [ 0.88640526, -0.88640526, -0.27630176, -1.62449091, -1.71591298],
       [-1.12815215,  1.12815215,  1.80493225, -1.58632148, -1.83237767],
       [-1.12815215,  1.12815215,  2.02023231, -1.58632148, -1.4053405 ],
       [ 0.88640526, -0.88640526, -0.27630176, -1.58632148,  1.89449216],
       [-1.12815215,  1.12815215, -0.13276838, -1.54815205, -1.44416206],
       [-1.12815215,  1.12815215, -1.49633548, -1.05194947,  1.62274124],
       [ 0.88640526, -0.88640526,  0.44136514,  2.49780745, -0.86183865],
       [-1.12815215,  1.12815215, -0.49160182,  2.91767117, -1.25005425],
       [-1.12815215,  1.12815215, -0.6351352 ,  2.91767117,  1.27334719]])