In [1]:
!pip install uv

Collecting uv
  Downloading uv-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.9.5


In [2]:
!pip install ucimlrepo --quiet

# **Iris Dataset**

In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
iris = fetch_ucirepo(id=53) 
  
# data (as pandas dataframes) 
X = iris.data.features 
y = iris.data.targets 
  
# metadata 
print(iris.metadata) 
  
# variable information 
print(iris.variables)

{'uci_id': 53, 'name': 'Iris', 'repository_url': 'https://archive.ics.uci.edu/dataset/53/iris', 'data_url': 'https://archive.ics.uci.edu/static/public/53/data.csv', 'abstract': 'A small classic dataset from Fisher, 1936. One of the earliest known datasets used for evaluating classification methods.\n', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 150, 'num_features': 4, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1936, 'last_updated': 'Tue Sep 12 2023', 'dataset_doi': '10.24432/C56C76', 'creators': ['R. A. Fisher'], 'intro_paper': {'ID': 191, 'type': 'NATIVE', 'title': 'The Iris data set: In search of the source of virginica', 'authors': 'A. Unwin, K. Kleinman', 'venue': 'Significance, 2021', 'year': 2021, 'journal': 'Significance, 2021', 'DOI': '1740-9713.01589', 'URL': 'https://www.semanticscholar.org

In [4]:
X.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
y.head()

Unnamed: 0,class
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


In [6]:
y['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [7]:
mp = {x:i for i,x in enumerate(list(y['class'].unique()))}
mp

{'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}

In [8]:
y['class'] = y['class'].map(mp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['class'] = y['class'].map(mp)


In [9]:
y

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


In [10]:
!uv pip install scikit-learn-extra --quiet

In [11]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    rand_score, adjusted_rand_score,
    mutual_info_score, adjusted_mutual_info_score, normalized_mutual_info_score,
    silhouette_score, calinski_harabasz_score, davies_bouldin_score
)
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS
from sklearn_extra.cluster import KMedoids
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

In [12]:
def evaluate_clustering(X, y_true, y_pred):
    """Compute all metrics."""
    sse = np.sum((X - X[y_pred].mean(axis=0)) ** 2)
    ss_total = np.sum((X - X.mean(axis=0)) ** 2)
    ssb = ss_total - sse

    return {
        'Rand': rand_score(y_true, y_pred),
        'Adj Rand': adjusted_rand_score(y_true, y_pred),
        'Mutual Info': mutual_info_score(y_true, y_pred),
        'Adj MI': adjusted_mutual_info_score(y_true, y_pred),
        'Norm MI': normalized_mutual_info_score(y_true, y_pred),
        'Silhouette': silhouette_score(X, y_pred) if len(np.unique(y_pred)) > 1 else np.nan,
        'Calinski': calinski_harabasz_score(X, y_pred) if len(np.unique(y_pred)) > 1 else np.nan,
        'Davies': davies_bouldin_score(X, y_pred) if len(np.unique(y_pred)) > 1 else np.nan,
        'SSE': sse,
        'SSB': ssb
    }

In [13]:
hp = {
    'n_clusters':20,
    'init':'random'
}

In [14]:
model=KMeans(**hp)

In [15]:
model

K-Means (with tuning)
K-Medoids (PAM)
K-Means++
Bisecting K-Means
Hierarchical Clustering (Agglomerative + Dendrogram)
DBSCAN
OPTICS

In [16]:
from sklearn.cluster import BisectingKMeans

In [17]:
def get_model(name:str, hp:dict):
    model=None
    if name=='K-Means':
        model = KMeans(**hp)
    elif name=='K-Medoids':
        model = KMedoids(**hp)
    elif name=='K-Means++':
        if hp['init']=='k-means++':
            model = KMeans(**hp)
    elif name=='Bisecting K-Means':
        model = BisectingKMeans(**hp)
    elif name=='Hierarchical Clustering':
        model = AgglomerativeClustering(**hp)
    elif name=='DBSCAN':
        model = DBSCAN(**hp)
    elif name=='OPTICS':
        model = OPTICS(**hp)
    return model

In [18]:
models = [
    'K-Means',
    'K-Medoids',
    'K-Means++',
    'Bisecting K-Means',
    'Hierarchical Clustering',
    'DBSCAN',
    'OPTICS'
]

## Pipeline
1. DataSplit
2. model load
3. evaluation

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
result = pd.DataFrame()

In [21]:
def pipeline(name: str, hp: dict, split: float, result, X, y):
    from sklearn.model_selection import train_test_split

    x_train, x_test, y_train, y_test = train_test_split(X.values, y['class'].values, test_size=split, random_state=42)
    model = get_model(name, hp)

    if model:
        print(f"model loaded : {name}")
    else:
        print(f"{name} model can't be loaded. Exitting")
        return

    # Special cases: models without .predict()
    non_predictive_models = ['Hierarchical Clustering', 'DBSCAN', 'OPTICS']
    if name in non_predictive_models:
        y_pred = model.fit_predict(x_train)
        full_res = evaluate_clustering(x_train, y_train, y_pred)
        result[f"{int(split*100)}:{100-int(split*100)}"] = full_res
        print('-' * 50)
        print(f"For entire dataset ({name}):")
        print(full_res)
        return

    # Regular case: models with .predict()
    model.fit(x_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    train_res = evaluate_clustering(x_train, y_train, y_train_pred)
    test_res = evaluate_clustering(x_test, y_test, y_test_pred)

    col = f"{int(split*100)}:{100-int(split*100)}"
    result[col] = test_res

    print('-'*50)
    print("For train split:")
    print(train_res)
    print('-'*50)
    print("For test split:")
    print(test_res)
    print(f"Accuracy : {sum(y_test_pred==y_test)/len(y_test)}")


## Test Pipeline on kmeans

In [22]:
hp = {
    'n_clusters':3,
    'init':'random',
    'n_init':30
}
for split in range(20,60,10):
    print("="*100)
    print(f"For split :=> {split}:{100-split}")
    pipeline(models[0],hp,split/100,result,X,y)

For split :=> 20:80
model loaded : K-Means
--------------------------------------------------
For train split:
{'Rand': 0.8565826330532212, 'Adj Rand': 0.6788174668657071, 'Mutual Info': 0.7767729022331029, 'Adj MI': 0.7108831075553822, 'Norm MI': 0.7154317355463076, 'Silhouette': 0.5416577519119027, 'Calinski': 424.8058495622399, 'Davies': 0.6741920093413872, 'SSE': 1001.554166666666, 'SSB': -464.5684999999993}
--------------------------------------------------
For test split:
{'Rand': 0.9563218390804598, 'Adj Rand': 0.8981703936425799, 'Mutual Info': 0.9869123863067224, 'Adj MI': 0.8923689116628177, 'Norm MI': 0.8996935451597475, 'Silhouette': 0.5749429418885256, 'Calinski': 131.20663956639564, 'Davies': 0.6113492859247058, 'SSE': 159.59999999999997, 'SSB': -17.208666666666602}
Accuracy : 0.9666666666666667
For split :=> 30:70
model loaded : K-Means
--------------------------------------------------
For train split:
{'Rand': 0.8461538461538461, 'Adj Rand': 0.6547489409636046, 'Mutual

In [23]:
result

Unnamed: 0,20:80,30:70,40:60,50:50
Rand,0.956322,0.930303,0.925424,0.901622
Adj Rand,0.89817,0.84355,0.830293,0.77762
Mutual Info,0.986912,0.909901,0.881681,0.835964
Adj MI,0.892369,0.840248,0.800345,0.761032
Norm MI,0.899694,0.847427,0.806744,0.767117
Silhouette,0.574943,0.579836,0.543134,0.543412
Calinski,131.20664,211.997652,232.771109,281.812292
Davies,0.611349,0.681315,0.694464,0.713199
SSE,159.6,236.670667,351.546833,507.286
SSB,-17.208667,-17.076,-57.499167,-149.8244


## Hyper Parameters

In [24]:
# ============================================
# 🔧 BEST HYPERPARAMETERS FOR EACH DATASET
# ============================================

iris_hyperparams = [
    {
        'algorithm': 'K-Means',
        'params': {
            'n_clusters': 3,
            'init': 'random',
            'n_init': 20,
            'random_state': 42
        }
    },
    {
        'algorithm': 'K-Medoids',
        'params': {
            'n_clusters': 3,
            'metric': 'euclidean',
            'init': 'k-medoids++',
            'random_state': 42
        }
    },
    {
        'algorithm': 'K-Means++',
        'params': {
            'n_clusters': 3,
            'init': 'k-means++',
            'n_init': 30,
            'random_state': 42
        }
    },
    {
        'algorithm': 'Bisecting K-Means',
        'params': {
            'n_clusters': 3,
            'bisecting_strategy': 'largest_cluster',
            'random_state': 42
        }
    },
    {
        'algorithm': 'Hierarchical Clustering',
        'params': {
            'n_clusters': 3,
            'linkage': 'ward',
            'affinity': 'euclidean'
        }
    },
    {
        'algorithm': 'DBSCAN',
        'params': {
            'eps': 0.5,
            'min_samples': 5,
            'metric': 'euclidean'
        }
    },
    {
        'algorithm': 'OPTICS',
        'params': {
            'min_samples': 5,
            'xi': 0.05,
            'min_cluster_size': 0.05
        }
    }
]

# ----------------------------------------------------

wine_hyperparams = [
    {
        'algorithm': 'K-Means',
        'params': {
            'n_clusters': 3,
            'init': 'random',
            'n_init': 30,
            'random_state': 42
        }
    },
    {
        'algorithm': 'K-Medoids',
        'params': {
            'n_clusters': 3,
            'metric': 'euclidean',
            'init': 'k-medoids++',
            'random_state': 42
        }
    },
    {
        'algorithm': 'K-Means++',
        'params': {
            'n_clusters': 3,
            'init': 'k-means++',
            'n_init': 50,
            'random_state': 42
        }
    },
    {
        'algorithm': 'Bisecting K-Means',
        'params': {
            'n_clusters': 3,
            'bisecting_strategy': 'largest_cluster',
            'random_state': 42
        }
    },
    {
        'algorithm': 'Hierarchical Clustering',
        'params': {
            'n_clusters': 3,
            'linkage': 'ward',
            'affinity': 'euclidean'
        }
    },
    {
        'algorithm': 'DBSCAN',
        'params': {
            'eps': 0.6,
            'min_samples': 5,
            'metric': 'euclidean'
        }
    },
    {
        'algorithm': 'OPTICS',
        'params': {
            'min_samples': 6,
            'xi': 0.05,
            'min_cluster_size': 0.05
        }
    }
]


### Running on iris dataset

In [25]:
import os
os.mkdir("iris")
os.mkdir("wine")

In [26]:
for node in iris_hyperparams:
    print("="*60)
    print("="*60)
    print('\t',f"MODEL : {node['algorithm']}")
    results = pd.DataFrame()
    for split in range(20,60,10):
        print("="*100)
        print(f"For split :=> {split}:{100-split}")
        pipeline(node['algorithm'],node['params'],split/100,results,X,y)
    print(results)
    results.to_csv(f"iris/{node['algorithm']}_result.csv")
    print("="*60)
    print("="*60)
    print("\n\n")

	 MODEL : K-Means
For split :=> 20:80
model loaded : K-Means
--------------------------------------------------
For train split:
{'Rand': 0.8565826330532212, 'Adj Rand': 0.6788174668657071, 'Mutual Info': 0.7767729022331031, 'Adj MI': 0.7108831075553824, 'Norm MI': 0.7154317355463078, 'Silhouette': 0.5416577519119027, 'Calinski': 424.80584956223987, 'Davies': 0.6741920093413872, 'SSE': 857.4651666666664, 'SSB': -320.4794999999997}
--------------------------------------------------
For test split:
{'Rand': 0.9563218390804598, 'Adj Rand': 0.8981703936425799, 'Mutual Info': 0.9869123863067223, 'Adj MI': 0.8923689116628175, 'Norm MI': 0.8996935451597474, 'Silhouette': 0.5749429418885256, 'Calinski': 131.20663956639564, 'Davies': 0.6113492859247035, 'SSE': 159.60000000000002, 'SSB': -17.20866666666666}
Accuracy : 0.3
For split :=> 30:70
model loaded : K-Means
--------------------------------------------------
For train split:
{'Rand': 0.8461538461538461, 'Adj Rand': 0.6547489409636046, 'Mut



--------------------------------------------------
For entire dataset (OPTICS):
{'Rand': 0.603501400560224, 'Adj Rand': 0.10009124929256867, 'Mutual Info': 0.47006934136387013, 'Adj MI': 0.33507488516814804, 'Norm MI': 0.3641399426096202, 'Silhouette': -0.10876422626901287, 'Calinski': 11.751020615909294, 'Davies': 1.913795239700271, 'SSE': 606.8169166666669, 'SSB': -69.83125000000018}
For split :=> 30:70
model loaded : OPTICS
--------------------------------------------------
For entire dataset (OPTICS):
{'Rand': 0.5820512820512821, 'Adj Rand': 0.08919096848953913, 'Mutual Info': 0.41015450219123817, 'Adj MI': 0.30924254078755187, 'Norm MI': 0.3402917512520197, 'Silhouette': -0.16757479444798246, 'Calinski': 11.544806377924939, 'Davies': 2.5910557768371496, 'SSE': 900.3160952380946, 'SSB': -444.92942857142793}
For split :=> 40:60
model loaded : OPTICS
--------------------------------------------------
For entire dataset (OPTICS):
{'Rand': 0.6252184769038701, 'Adj Rand': 0.136974761324

## Running on wine dataset

In [27]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
wine = fetch_ucirepo(id=109) 
  
# data (as pandas dataframes) 
X = wine.data.features 
y = wine.data.targets 
  
# metadata 
print(wine.metadata) 
  
# variable information 
print(wine.variables) 

{'uci_id': 109, 'name': 'Wine', 'repository_url': 'https://archive.ics.uci.edu/dataset/109/wine', 'data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv', 'abstract': 'Using chemical analysis to determine the origin of wines', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 178, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1992, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C5PC7J', 'creators': ['Stefan Aeberhard', 'M. Forina'], 'intro_paper': {'ID': 246, 'type': 'NATIVE', 'title': 'Comparative analysis of statistical pattern recognition methods in high dimensional settings', 'authors': 'S. Aeberhard, D. Coomans, O. Vel', 'venue': 'Pattern Recognition', 'year': 1994, 'journal': None, 'DOI': '10.1016/0031-3203(94)90145-7', 'URL': 'https:

In [28]:
y['class']=y['class']-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['class']=y['class']-1


In [29]:
y['class'].unique()

array([0, 1, 2])

In [30]:
for node in iris_hyperparams:
    print("="*60)
    print("="*60)
    print('\t',f"MODEL : {node['algorithm']}")
    results = pd.DataFrame()
    for split in range(20,60,10):
        print("="*100)
        print(f"For split :=> {split}:{100-split}")
        pipeline(node['algorithm'],node['params'],split/100,results,X,y)
    print(results)
    results.to_csv(f"wine/{node['algorithm']}_result.csv")
    print("="*60)
    print("="*60)
    print("\n\n")

	 MODEL : K-Means
For split :=> 20:80
model loaded : K-Means
--------------------------------------------------
For train split:
{'Rand': 0.7031265607831385, 'Adj Rand': 0.33732833623880215, 'Mutual Info': 0.439415298534742, 'Adj MI': 0.39763390374103436, 'Norm MI': 0.40563119444972684, 'Silhouette': 0.56836763967021, 'Calinski': 458.4047220323674, 'Davies': 0.5337526530129646, 'SSE': 15181409.74330445, 'SSB': -2260736.814729266}
--------------------------------------------------
For test split:
{'Rand': 0.7777777777777778, 'Adj Rand': 0.4927536231884058, 'Mutual Info': 0.5814988483019488, 'Adj MI': 0.50868474303945, 'Norm MI': 0.5365783935489851, 'Silhouette': 0.5698497328652201, 'Calinski': 102.28143693890465, 'Davies': 0.5202402865293397, 'SSE': 4655458.7094901595, 'SSB': -85269.35570817254}
Accuracy : 0.1111111111111111
For split :=> 30:70
model loaded : K-Means
--------------------------------------------------
For train split:
{'Rand': 0.7027275111460792, 'Adj Rand': 0.3385065967

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


--------------------------------------------------
For entire dataset (OPTICS):
{'Rand': 0.63510138847268, 'Adj Rand': 0.1160893057130147, 'Mutual Info': 0.3525525599576411, 'Adj MI': 0.21783938304562828, 'Norm MI': 0.24744250613260954, 'Silhouette': 0.05421633335281721, 'Calinski': 19.387142457275647, 'Davies': 4.38484187025696, 'SSE': 19382989.625980504, 'SSB': -6462316.69740532}
For split :=> 30:70
model loaded : OPTICS
--------------------------------------------------
For entire dataset (OPTICS):
{'Rand': 0.6418830317335431, 'Adj Rand': 0.11681171606386488, 'Mutual Info': 0.3554649116979392, 'Adj MI': 0.21117565854309867, 'Norm MI': 0.24508360629333442, 'Silhouette': 0.08702479242444493, 'Calinski': 20.927113301878144, 'Davies': 2.9388552254677616, 'SSE': 14013665.599863583, 'SSB': -2562022.302608518}
For split :=> 40:60
model loaded : OPTICS
--------------------------------------------------
For entire dataset (OPTICS):
{'Rand': 0.6098831985624439, 'Adj Rand': 0.0926929112215805,

In [31]:
import shutil

# Path where Kaggle stores working files
shutil.make_archive('/kaggle/working/assignment4_output', 'zip', '/kaggle/working')


'/kaggle/working/assignment4_output.zip'