In [None]:
!pip install pycaret

In [None]:
!pip install numpy pandas matplotlib seaborn scikit-learn xlsxwriter

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
import pycaret
from pycaret.clustering import *
from pycaret.datasets import get_data


# Loading Dataset


In [None]:
# Read the weekly sales-transactions CSV into a DataFrame and preview its first rows.
# The path is absolute under /content, so the file must already exist there.
data = pd.read_csv(filepath_or_buffer='/content/Dataset on Sales Transaction weekly.csv')
data.head(n=5)

# Parameter setting for clustering model

In [None]:
# Initialise the PyCaret clustering experiment on the loaded frame with default
# options and keep whatever setup() returns in `set_param`.
set_param = setup(data=data)

# **K-MEANS CLUSTERING**

## No data processing

In [None]:
# K-Means baseline (no preprocessing options passed to setup()): cluster for
# k = 3, 4, 5 and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each
# k in r1a / r2a / r3a.
setup(data, verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator a second time on the raw frame
# and bypassed the experiment's pipeline entirely.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r1a = []
r2a = []
r3a = []

for run_index, (k, scores) in enumerate(((3, r1a), (4, r2a), (5, r3a))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('kmeans', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot = 'cluster')


## Using Normalization

In [None]:
# K-Means on z-score-normalised data: cluster for k = 3, 4, 5 and record
# [silhouette, Calinski-Harabasz, Davies-Bouldin] for each k in r4a / r5a / r6a.
setup(data = data, normalize = True, normalize_method = 'zscore', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# normalisation requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r4a = []
r5a = []
r6a = []

for run_index, (k, scores) in enumerate(((3, r4a), (4, r5a), (5, r6a))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('kmeans', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model once; the plot used to be drawn twice.
plot_model(x, plot='cluster')

## Using transformation

In [None]:
# K-Means on Yeo-Johnson-transformed data: cluster for k = 3, 4, 5 and record
# [silhouette, Calinski-Harabasz, Davies-Bouldin] for each k in r7a / r8a / r9a.
setup(data = data, transformation = True, transformation_method = 'yeo-johnson', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# transformation requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r7a = []
r8a = []
r9a = []

for run_index, (k, scores) in enumerate(((3, r7a), (4, r8a), (5, r9a))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('kmeans', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model once; the plot used to be drawn twice.
plot_model(x, plot='cluster')

## Using PCA (Principal Component Analysis)

In [None]:
# K-Means on linear-PCA-projected data: cluster for k = 3, 4, 5 and record
# [silhouette, Calinski-Harabasz, Davies-Bouldin] for each k in r10a / r11a / r12a.
setup(data = data, pca = True, pca_method = 'linear', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the PCA
# requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r10a = []
r11a = []
r12a = []

for k, scores in ((3, r10a), (4, r11a), (5, r12a)):
    print("For Cluster =", k)
    x = create_model('kmeans', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))
    print("\n")  # this cell prints the blank separator after every run, including the last

# Visualise the last (k = 5) model.
plot_model(x, plot = 'cluster')

## Using Normalization and Transformation

In [None]:
# K-Means on z-score-normalised + Yeo-Johnson-transformed data: cluster for
# k = 3, 4, 5 and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each
# k in r13a / r14a / r15a.
setup(data = data, normalize = True, normalize_method = 'zscore', transformation = True, transformation_method = 'yeo-johnson', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# preprocessing requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r13a = []
r14a = []
r15a = []

for run_index, (k, scores) in enumerate(((3, r13a), (4, r14a), (5, r15a))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('kmeans', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model once; the plot used to be drawn twice.
plot_model(x, plot='cluster')

## Using Normalization, Transformation and PCA

In [None]:
# K-Means on normalised + Yeo-Johnson-transformed + linear-PCA data: cluster for
# k = 3, 4, 5 and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each
# k in r16a / r17a / r18a.
setup(data = data, normalize = True, normalize_method = 'zscore',
      transformation = True, transformation_method = 'yeo-johnson',
      pca = True, pca_method = 'linear', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# preprocessing requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r16a = []
r17a = []
r18a = []

for run_index, (k, scores) in enumerate(((3, r16a), (4, r17a), (5, r18a))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('kmeans', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot = 'cluster')

## Building results table

In [None]:
# Assemble the K-Means score table: one row per metric and one column per
# (preprocessing variant, cluster count) pair, in the order the runs were made.
result1 = {'Parameters': ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins']}
result1.update({
    f'{variant}(c={n_clusters})': scores
    for variant, per_k in [
        ('No Data Processing', (r1a, r2a, r3a)),
        ('Using Normalization', (r4a, r5a, r6a)),
        ('Using Transform', (r7a, r8a, r9a)),
        ('Using PCA', (r10a, r11a, r12a)),
        ('Using T + N', (r13a, r14a, r15a)),
        ('T + N + PCA', (r16a, r17a, r18a)),
    ]
    for n_clusters, scores in zip((3, 4, 5), per_k)
})

result1_df = pd.DataFrame(data=result1)
result1_df

# **AGGLOMERATIVE CLUSTERING**

## No data processing

In [None]:
# Agglomerative baseline (no preprocessing options passed to setup()): cluster
# for k = 3, 4, 5 and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for
# each k in r1b / r2b / r3b.
setup(data, verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator a second time on the raw frame
# and bypassed the experiment's pipeline entirely.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r1b = []
r2b = []
r3b = []

for run_index, (k, scores) in enumerate(((3, r1b), (4, r2b), (5, r3b))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('hclust', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot = 'cluster')


## Using normalization

In [None]:
# Agglomerative clustering on z-score-normalised data: cluster for k = 3, 4, 5
# and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each k in
# r4b / r5b / r6b.
setup(data = data, normalize = True, normalize_method = 'zscore', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# normalisation requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r4b = []
r5b = []
r6b = []

for run_index, (k, scores) in enumerate(((3, r4b), (4, r5b), (5, r6b))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('hclust', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model once; the plot used to be drawn twice.
plot_model(x, plot='cluster')

## Using transformation

In [None]:
# Agglomerative clustering on Yeo-Johnson-transformed data: cluster for
# k = 3, 4, 5 and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each
# k in r7b / r8b / r9b.
setup(data = data, transformation = True, transformation_method = 'yeo-johnson', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# transformation requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r7b = []
r8b = []
r9b = []

for run_index, (k, scores) in enumerate(((3, r7b), (4, r8b), (5, r9b))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('hclust', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model once; the plot used to be drawn twice.
plot_model(x, plot='cluster')

## Using PCA

In [None]:
# Agglomerative clustering on linear-PCA-projected data: cluster for k = 3, 4, 5
# and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each k in
# r10b / r11b / r12b.
setup(data = data, pca = True, pca_method = 'linear', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the PCA
# requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r10b = []
r11b = []
r12b = []

for run_index, (k, scores) in enumerate(((3, r10b), (4, r11b), (5, r12b))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('hclust', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model once; the plot used to be drawn twice.
plot_model(x, plot='cluster')

## Using Normalization and Transformation

In [None]:
# Agglomerative clustering on z-score-normalised + Yeo-Johnson-transformed data:
# cluster for k = 3, 4, 5 and record [silhouette, Calinski-Harabasz,
# Davies-Bouldin] for each k in r13b / r14b / r15b.
setup(data = data, normalize = True, normalize_method = 'zscore', transformation = True, transformation_method = 'yeo-johnson', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# preprocessing requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r13b = []
r14b = []
r15b = []

for run_index, (k, scores) in enumerate(((3, r13b), (4, r14b), (5, r15b))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('hclust', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model once; the plot used to be drawn twice.
plot_model(x, plot='cluster')

## Using Normalization, Transformation and PCA

In [None]:
# Agglomerative clustering on normalised + Yeo-Johnson-transformed + linear-PCA
# data: cluster for k = 3, 4, 5 and record [silhouette, Calinski-Harabasz,
# Davies-Bouldin] for each k in r16b / r17b / r18b.
setup(data = data, normalize = True, normalize_method = 'zscore', transformation = True, transformation_method = 'yeo-johnson',
      pca = True, pca_method = 'linear', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# preprocessing requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r16b = []
r17b = []
r18b = []

for run_index, (k, scores) in enumerate(((3, r16b), (4, r17b), (5, r18b))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('hclust', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model once; the plot used to be drawn twice.
plot_model(x, plot='cluster')

## Building results table

In [None]:
# Assemble the agglomerative-clustering score table: one row per metric and one
# column per (preprocessing variant, cluster count) pair, in run order.
result2 = {'Parameters': ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins']}
result2.update({
    f'{variant}(c={n_clusters})': scores
    for variant, per_k in [
        ('No Data Processing', (r1b, r2b, r3b)),
        ('Using Normalization', (r4b, r5b, r6b)),
        ('Using Transform', (r7b, r8b, r9b)),
        ('Using PCA', (r10b, r11b, r12b)),
        ('Using T + N', (r13b, r14b, r15b)),
        ('T + N + PCA', (r16b, r17b, r18b)),
    ]
    for n_clusters, scores in zip((3, 4, 5), per_k)
})

result2_df = pd.DataFrame(data=result2)
result2_df

# **BIRCH CLUSTERING**

## No data processing

In [None]:
# Birch baseline (no preprocessing options passed to setup()): cluster for
# k = 3, 4, 5 and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each
# k in r1c / r2c / r3c.
setup(data, verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator a second time on the raw frame
# and bypassed the experiment's pipeline entirely.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r1c = []
r2c = []
r3c = []

for run_index, (k, scores) in enumerate(((3, r1c), (4, r2c), (5, r3c))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('birch', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot = 'cluster')


## Using normalization

In [None]:
# Birch on z-score-normalised data: cluster for k = 3, 4, 5 and record
# [silhouette, Calinski-Harabasz, Davies-Bouldin] for each k in r4c / r5c / r6c.
setup(data = data, normalize = True, normalize_method = 'zscore', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# normalisation requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r4c = []
r5c = []
r6c = []

for run_index, (k, scores) in enumerate(((3, r4c), (4, r5c), (5, r6c))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('birch', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot='cluster')

## Using transformation

In [None]:
# Birch on Yeo-Johnson-transformed data: cluster for k = 3, 4, 5 and record
# [silhouette, Calinski-Harabasz, Davies-Bouldin] for each k in r7c / r8c / r9c.
setup(data = data, transformation = True, transformation_method = 'yeo-johnson', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# transformation requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r7c = []
r8c = []
r9c = []

for run_index, (k, scores) in enumerate(((3, r7c), (4, r8c), (5, r9c))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('birch', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot='cluster')

## Using PCA

In [None]:
# Birch on linear-PCA-projected data: cluster for k = 3, 4, 5 and record
# [silhouette, Calinski-Harabasz, Davies-Bouldin] for each k in r10c / r11c / r12c.
setup(data = data, pca = True, pca_method = 'linear', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the PCA
# requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r10c = []
r11c = []
r12c = []

for run_index, (k, scores) in enumerate(((3, r10c), (4, r11c), (5, r12c))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('birch', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot='cluster')

## Using Normalization and Transformation

In [None]:
# Birch on z-score-normalised + Yeo-Johnson-transformed data: cluster for
# k = 3, 4, 5 and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each
# k in r13c / r14c / r15c.
setup(data = data, normalize = True, normalize_method = 'zscore', transformation = True, transformation_method = 'yeo-johnson', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# preprocessing requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r13c = []
r14c = []
r15c = []

for run_index, (k, scores) in enumerate(((3, r13c), (4, r14c), (5, r15c))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('birch', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot='cluster')

## Using Normalization, Transformation and PCA

In [None]:
# Birch on normalised + Yeo-Johnson-transformed + linear-PCA data: cluster for
# k = 3, 4, 5 and record [silhouette, Calinski-Harabasz, Davies-Bouldin] for each
# k in r16c / r17c / r18c.
setup(data = data, normalize = True, normalize_method = 'zscore', transformation = True, transformation_method = 'yeo-johnson',
      pca = True, pca_method = 'linear', verbose=False)

# Score on the feature matrix PyCaret actually clustered, using the labels from
# the fit that create_model() already performed. The previous code called
# x.fit_predict(data), which refit the estimator on the raw frame, so the
# preprocessing requested above never influenced the labels or the scores.
# NOTE: 'X_transformed' is the PyCaret 3.x config key.
features = get_config('X_transformed')

r16c = []
r17c = []
r18c = []

for run_index, (k, scores) in enumerate(((3, r16c), (4, r17c), (5, r18c))):
    if run_index:
        print("\n")  # blank separator between consecutive runs
    print("For Cluster =", k)
    x = create_model('birch', num_clusters = k)
    labels = x.labels_
    scores.append(silhouette_score(features, labels))
    scores.append(calinski_harabasz_score(features, labels))
    scores.append(davies_bouldin_score(features, labels))

# Visualise the last (k = 5) model.
plot_model(x, plot='cluster')

## Building results table

In [None]:
# Assemble the Birch score table: one row per metric and one column per
# (preprocessing variant, cluster count) pair, in the order the runs were made.
result3 = {'Parameters': ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldins']}
result3.update({
    f'{variant}(c={n_clusters})': scores
    for variant, per_k in [
        ('No Data Processing', (r1c, r2c, r3c)),
        ('Using Normalization', (r4c, r5c, r6c)),
        ('Using Transform', (r7c, r8c, r9c)),
        ('Using PCA', (r10c, r11c, r12c)),
        ('Using T + N', (r13c, r14c, r15c)),
        ('T + N + PCA', (r16c, r17c, r18c)),
    ]
    for n_clusters, scores in zip((3, 4, 5), per_k)
})

result3_df = pd.DataFrame(data=result3)
result3_df