In [11]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

from sklearn_extra.cluster import KMedoids

from kmodes.kprototypes import KPrototypes

In [3]:
# Every split is read from ../adults_data, resolved relative to the notebook's
# working directory. The norm/std/ohn/ohs prefixes presumably name different
# preprocessing variants of the same data -- TODO confirm against the
# notebook that produced these files.

# "norm" variant
norm_train = pd.read_csv(Path('..') / 'adults_data' / 'adults_norm_train.csv')
norm_test = pd.read_csv(Path('..') / 'adults_data' / 'adults_norm_test.csv')

# "std" variant
std_train = pd.read_csv(Path('..') / 'adults_data' / 'adults_std_train.csv')
std_test = pd.read_csv(Path('..') / 'adults_data' / 'adults_std_test.csv')

# "ohn" variant
ohn_train = pd.read_csv(Path('..') / 'adults_data' / 'adults_ohn_train.csv')
ohn_test = pd.read_csv(Path('..') / 'adults_data' / 'adults_ohn_test.csv')

# "ohs" variant
ohs_train = pd.read_csv(Path('..') / 'adults_data' / 'adults_ohs_train.csv')
ohs_test = pd.read_csv(Path('..') / 'adults_data' / 'adults_ohs_test.csv')


In [9]:
def optimal_K_means(train, test, opt_k, settype):
    """Fit K-Means on ``train`` and score the clustering it induces on ``test``.

    The model is fitted on ``train`` only. Rows of ``test`` are then assigned
    to the learned clusters and three internal validity indices are computed
    on that assignment. The scores are written to
    ``kmeans_{opt_k}_{settype}.csv`` in the current working directory.

    Parameters
    ----------
    train
        Feature matrix passed to ``KMeans.fit``.
    test
        Feature matrix passed to ``KMeans.predict`` and to the metrics.
    opt_k : int
        Number of clusters.
    settype : str
        Label for the dataset variant; only used in the output file name.

    Returns
    -------
    pandas.DataFrame
        A single row (index label 1) with the columns ``silh`` (silhouette),
        ``dbi`` (Davies-Bouldin) and ``ch`` (Calinski-Harabasz).
    """
    kmeans = KMeans(n_clusters=opt_k, max_iter=5000, random_state=42)
    # fit() instead of fit_predict(): the training labels were never used.
    kmeans.fit(train)

    test_clusters = kmeans.predict(test)

    measures = pd.DataFrame({
        'silh': [silhouette_score(test, test_clusters)],
        'dbi': [davies_bouldin_score(test, test_clusters)],
        'ch': [calinski_harabasz_score(test, test_clusters)],
    })

    # Shift the default 0-based index so the single row is labelled 1 in the CSV.
    measures.index += 1

    measures.to_csv(f'kmeans_{opt_k}_{settype}.csv')

    # Hand the scores back to the caller, as optimal_K_prototype and
    # optimal_K_agglomerative already do.
    return measures

In [None]:
def optimal_K_medoids(train, test, opt_k, settype):
    """Fit K-Medoids on ``train`` and score the clustering it induces on ``test``.

    The model is fitted on ``train`` only. Rows of ``test`` are then assigned
    to the learned clusters and three internal validity indices are computed
    on that assignment. The scores are written to
    ``kmedoids_{opt_k}_{settype}.csv`` in the current working directory.

    Parameters
    ----------
    train
        Feature matrix passed to ``KMedoids.fit``.
    test
        Feature matrix passed to ``KMedoids.predict`` and to the metrics.
    opt_k : int
        Number of clusters.
    settype : str
        Label for the dataset variant; only used in the output file name.

    Returns
    -------
    pandas.DataFrame
        A single row (index label 1) with the columns ``silh`` (silhouette),
        ``dbi`` (Davies-Bouldin) and ``ch`` (Calinski-Harabasz).
    """
    kmedoids = KMedoids(n_clusters=opt_k, max_iter=5000, random_state=42)
    # fit() instead of fit_predict(): the training labels were never used.
    kmedoids.fit(train)

    test_clusters = kmedoids.predict(test)

    measures = pd.DataFrame({
        'silh': [silhouette_score(test, test_clusters)],
        'dbi': [davies_bouldin_score(test, test_clusters)],
        'ch': [calinski_harabasz_score(test, test_clusters)],
    })

    # Shift the default 0-based index so the single row is labelled 1 in the CSV.
    measures.index += 1

    measures.to_csv(f'kmedoids_{opt_k}_{settype}.csv')

    # Hand the scores back to the caller, as optimal_K_prototype and
    # optimal_K_agglomerative already do.
    return measures

In [12]:
def optimal_K_prototype(train, test, opt_k, settype, categorical=None):
    """Fit K-Prototypes on ``train`` and score the clustering it induces on ``test``.

    The model is fitted on ``train`` only. Rows of ``test`` are then assigned
    to the learned clusters and three internal validity indices are computed
    on that assignment. The scores are written to
    ``kprototype_{opt_k}_{settype}.csv`` in the current working directory.

    Parameters
    ----------
    train
        Mixed-type feature matrix passed to ``KPrototypes.fit``.
    test
        Mixed-type feature matrix passed to ``KPrototypes.predict`` and to
        the metrics. Must use the same column order as ``train``.
    opt_k : int
        Number of clusters.
    settype : str
        Label for the dataset variant; only used in the output file name.
    categorical : int or sequence of int
        Positional index (or indices) of the categorical columns. The
        parameter defaults to ``None`` only to keep the four-argument call
        signature valid; a value must be supplied.

    Returns
    -------
    pandas.DataFrame
        A single row (index label 1) with the columns ``silh`` (silhouette),
        ``dbi`` (Davies-Bouldin) and ``ch`` (Calinski-Harabasz).

    Raises
    ------
    ValueError
        If ``categorical`` is omitted or empty.
    """
    # Validate up front: the previous body passed the placeholder ``[...]``
    # (a list holding Ellipsis), which is not a usable set of column indices.
    if categorical is None:
        raise ValueError(
            "optimal_K_prototype needs `categorical`: the positional indices "
            "of the categorical columns in `train`/`test`."
        )
    if isinstance(categorical, int):
        categorical = [categorical]
    else:
        categorical = list(categorical)
    if not categorical:
        raise ValueError(
            "`categorical` is empty; K-Prototypes requires at least one "
            "categorical column (use K-Means for purely numeric data)."
        )

    kprototype = KPrototypes(n_clusters=opt_k, max_iter=5000, random_state=42)
    kprototype.fit(train, categorical=categorical)

    # predict() must be told which columns are categorical as well, otherwise
    # it cannot split the matrix into its numeric and categorical parts.
    test_clusters = kprototype.predict(test, categorical=categorical)

    # NOTE(review): these scikit-learn indices need numeric input, so the
    # categorical columns must already be numerically encoded here -- confirm
    # this is the intended evaluation for mixed-type data.
    silhouette_avg = silhouette_score(test, test_clusters)
    db_score = davies_bouldin_score(test, test_clusters)
    ch_score = calinski_harabasz_score(test, test_clusters)

    measures = pd.DataFrame({'silh': [silhouette_avg],
                             'dbi': [db_score],
                             'ch': [ch_score]})

    # Shift the default 0-based index so the single row is labelled 1 in the CSV.
    measures.index += 1

    measures.to_csv(f'kprototype_{opt_k}_{settype}.csv')

    return measures

In [13]:
def optimal_K_agglomerative(train, test, opt_k, settype):
    """Cluster ``test`` with agglomerative clustering and score the result.

    ``AgglomerativeClustering`` has no ``predict`` method, so it cannot assign
    unseen rows to clusters learned elsewhere. The clustering is therefore
    fitted directly on ``test``, and the three internal validity indices are
    computed on that fit. The scores are written to
    ``agglomerative_{opt_k}_{settype}.csv`` in the current working directory.

    Parameters
    ----------
    train
        Unused. Kept so the signature matches the other ``optimal_K_*``
        helpers in this notebook.
    test
        Feature matrix that is clustered and scored.
    opt_k : int
        Number of clusters.
    settype : str
        Label for the dataset variant; only used in the output file name.

    Returns
    -------
    pandas.DataFrame
        A single row (index label 1) with the columns ``silh`` (silhouette),
        ``dbi`` (Davies-Bouldin) and ``ch`` (Calinski-Harabasz).
    """
    agglo = AgglomerativeClustering(n_clusters=opt_k)

    # The earlier extra fit_predict(train) was dropped: its labels were never
    # read and the fit below replaces whatever it had learned.
    test_clusters = agglo.fit_predict(test)

    silhouette_avg = silhouette_score(test, test_clusters)
    db_score = davies_bouldin_score(test, test_clusters)
    ch_score = calinski_harabasz_score(test, test_clusters)

    measures = pd.DataFrame({'silh': [silhouette_avg],
                             'dbi': [db_score],
                             'ch': [ch_score]})

    # Shift the default 0-based index so the single row is labelled 1 in the CSV.
    measures.index += 1

    measures.to_csv(f'agglomerative_{opt_k}_{settype}.csv')

    return measures