In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the wholesale-customers table; the path is resolved relative to the
# kernel's working directory.
csv_path = 'Wholesale customers data.csv'
data = pd.read_csv(csv_path)

In [3]:
# Show the loaded frame; the rich display elides the middle rows of a long frame.
data

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
...,...,...,...,...,...,...,...,...
435,1,3,29703,12051,16027,13135,182,2204
436,1,3,39228,1431,764,4510,93,2346
437,2,3,14531,15488,30243,437,14841,1867
438,1,3,10290,1981,2232,1038,168,2125


In [4]:
from sklearn.decomposition import PCA

# Project the feature columns onto their first two principal components.
#
# Later cells append label columns ('Cluster', 'Anomaly') to `data`. Because the
# slice below is positional, re-running this cell after them would silently feed
# those labels to PCA. Dropping them first (errors='ignore' makes this a no-op on
# a fresh kernel) keeps the cell idempotent; `.iloc[:, 2:]` then skips the first
# two columns exactly as before.
pca_features = data.drop(columns=['Cluster', 'Anomaly'], errors='ignore').iloc[:, 2:]

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(pca_features)
principalDf = pd.DataFrame(data=principalComponents, columns=['PC1', 'PC2'])

In [5]:
from sklearn.cluster import KMeans

# K-Means (k=5) on the feature columns.
#
# This cell writes its labels back into `data['Cluster']`, so a positional
# `data.iloc[:, 2:]` would include those labels as a feature the next time the
# cell runs. Strip any derived label columns before slicing so the input is the
# same on every run.
kmeans_features = data.drop(columns=['Cluster', 'Anomaly'], errors='ignore').iloc[:, 2:]

# random_state pins the centroid initialisation so the labels are reproducible.
# n_init is spelled out because the library default has changed between
# scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(kmeans_features)

# NOTE: later clustering cells overwrite this same 'Cluster' column.
data['Cluster'] = kmeans.labels_

In [6]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering with the estimator's default settings (no arguments
# are passed).
#
# By the time this cell runs, `data` already carries the 'Cluster' column written
# by the K-Means cell, and `data.iloc[:, 2:]` would pick it up as an input
# feature. Drop derived label columns first so only the original columns are
# clustered.
agglo_features = data.drop(columns=['Cluster', 'Anomaly'], errors='ignore').iloc[:, 2:]

clustering = AgglomerativeClustering().fit(agglo_features)

# Replaces whatever labels were previously stored in 'Cluster'.
data['Cluster'] = clustering.labels_

In [7]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the feature columns.
#
# `data` already holds a 'Cluster' column from the preceding clustering cells;
# slicing positionally would feed those labels to DBSCAN as a feature. Remove
# derived label columns before selecting the inputs.
dbscan_features = data.drop(columns=['Cluster', 'Anomaly'], errors='ignore').iloc[:, 2:]

# NOTE(review): eps=3 is applied to the raw, unscaled values. The displayed rows
# contain values in the thousands, so a radius of 3 looks far too small to link
# any neighbours -- confirm whether the features should be standardised first or
# eps retuned. Parameters are left as the author set them.
clustering = DBSCAN(eps=3, min_samples=2).fit(dbscan_features)

# Replaces whatever labels were previously stored in 'Cluster'.
data['Cluster'] = clustering.labels_

In [8]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the feature columns.
#
# Earlier cells leave a 'Cluster' label column in `data`; without the drop below
# it would be embedded alongside the real features via `data.iloc[:, 2:]`.
tsne_features = data.drop(columns=['Cluster', 'Anomaly'], errors='ignore').iloc[:, 2:]

# random_state makes the embedding repeatable across kernel restarts.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(tsne_features)



[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 440 samples in 0.001s...
[t-SNE] Computed neighbors for 440 samples in 0.013s...
[t-SNE] Computed conditional probabilities for sample 440 / 440
[t-SNE] Mean sigma: 3303.873398
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.351536
[t-SNE] KL divergence after 300 iterations: 0.399638


In [10]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.0-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.0-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m623.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: mlxtend
Successfully installed mlxtend-0.23.0


In [12]:

from sklearn.preprocessing import StandardScaler

# Standardise the table and reduce it to two principal components.
#
# `data` has been mutated by the clustering cells and now contains a 'Cluster'
# label column. Scaling the whole frame would treat those labels as a feature
# and distort the components, so derived columns are removed first. All
# remaining columns are scaled, as in the original whole-frame call.
pca_input = data.drop(columns=['Cluster', 'Anomaly'], errors='ignore')

scaler = StandardScaler()
data_scaled = scaler.fit_transform(pca_input)

# Apply PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(data_scaled)

df_pca = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

print(df_pca)


          PC1       PC2
0    0.843939 -0.515351
1    1.062676 -0.484601
2    1.269141  0.682055
3   -1.056782  0.610821
4    0.634030  0.974199
..        ...       ...
435  0.222884  2.480505
436 -1.290172  1.560397
437  3.865149 -0.479854
438 -1.097067 -0.069896
439 -1.165951 -0.902157

[440 rows x 2 columns]


In [13]:
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

# Fit a 3-component Gaussian mixture on the standardised table and store the
# predicted component of each row in `data['Cluster']`.
#
# The frame already contains a 'Cluster' column from earlier cells (and from
# this cell itself on a re-run). Those labels must not be part of the model
# input, so derived columns are dropped before scaling.
gmm_input = data.drop(columns=['Cluster', 'Anomaly'], errors='ignore')

scaler = StandardScaler()
data_scaled = scaler.fit_transform(gmm_input)

# random_state fixes the initialisation so the component labels are reproducible.
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(data_scaled)
labels = gmm.predict(data_scaled)

# Replaces whatever labels were previously stored in 'Cluster'.
data['Cluster'] = labels

print(data[['Channel', 'Region', 'Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen', 'Cluster']])


     Channel  Region  Fresh   Milk  Grocery  Frozen  Detergents_Paper  \
0          2       3  12669   9656     7561     214              2674   
1          2       3   7057   9810     9568    1762              3293   
2          2       3   6353   8808     7684    2405              3516   
3          1       3  13265   1196     4221    6404               507   
4          2       3  22615   5410     7198    3915              1777   
..       ...     ...    ...    ...      ...     ...               ...   
435        1       3  29703  12051    16027   13135               182   
436        1       3  39228   1431      764    4510                93   
437        2       3  14531  15488    30243     437             14841   
438        1       3  10290   1981     2232    1038               168   
439        1       3   2787   1698     2510      65               477   

     Delicassen  Cluster  
0          1338        1  
1          1776        1  
2          7844        1  
3          1788

In [14]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Flag anomalous rows with an Isolation Forest fitted on the standardised table;
# the result of `fit_predict` is stored per row in `data['Anomaly']`.
#
# `data` carries the 'Cluster' labels from the clustering cells, and on a re-run
# also this cell's own 'Anomaly' column. Neither belongs in the detector's
# input, so both are dropped before scaling.
iso_input = data.drop(columns=['Cluster', 'Anomaly'], errors='ignore')

scaler = StandardScaler()
data_scaled = scaler.fit_transform(iso_input)

# Apply Isolation Forest (random_state makes the random trees reproducible)
iso_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
anomalies = iso_forest.fit_predict(data_scaled)

# Mark anomalies in the dataset
data['Anomaly'] = anomalies

print(data[['Channel', 'Region', 'Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen', 'Anomaly']])


     Channel  Region  Fresh   Milk  Grocery  Frozen  Detergents_Paper  \
0          2       3  12669   9656     7561     214              2674   
1          2       3   7057   9810     9568    1762              3293   
2          2       3   6353   8808     7684    2405              3516   
3          1       3  13265   1196     4221    6404               507   
4          2       3  22615   5410     7198    3915              1777   
..       ...     ...    ...    ...      ...     ...               ...   
435        1       3  29703  12051    16027   13135               182   
436        1       3  39228   1431      764    4510                93   
437        2       3  14531  15488    30243     437             14841   
438        1       3  10290   1981     2232    1038               168   
439        1       3   2787   1698     2510      65               477   

     Delicassen  Anomaly  
0          1338        1  
1          1776        1  
2          7844        1  
3          1788