In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.spatial import distance

In [2]:
def count_clustering_scores(X, cluster_num, model, score_fun):
    if isinstance(cluster_num, int):
        cluster_num_iter = [cluster_num]
    else:
        cluster_num_iter = cluster_num
        
    scores = []    
    for k in cluster_num_iter:
        model_instance = model(n_clusters=k)
        labels = model_instance.fit_predict(X)
        wcss = score_fun(X, labels)
        scores.append(wcss)
    
    if isinstance(cluster_num, int):
        return scores[0]
    else:
        return scores

def min_interclust_dist(X, label):
    clusters = set(label)
    global_min_dist = np.inf
    for cluster_i in clusters:
        cluster_i_idx = np.where(label == cluster_i)
        for cluster_j in clusters:
            if cluster_i != cluster_j:
                cluster_j_idx = np.where(label == cluster_j)
                interclust_min_dist = np.min(distance.cdist(X[cluster_i_idx], X[cluster_j_idx]))
                global_min_dist = np.min([global_min_dist, interclust_min_dist])
    return 1/global_min_dist

def _inclust_mean_dists(X, label):
    clusters = set(label)
    inclust_dist_list = []
    for cluster_i in clusters:
        cluster_i_idx = np.where(label == cluster_i)
        inclust_dist = np.mean(distance.pdist(X[cluster_i_idx]))
        inclust_dist_list.append(inclust_dist)
    return inclust_dist_list

def mean_inclust_dist(X, label):
    inclust_dist_list = _inclust_mean_dists(X, label)
    return 1/np.mean(inclust_dist_list)

def std_dev_of_inclust_dist(X, label):
    inclust_dist_list = _inclust_mean_dists(X, label)
    return np.std(inclust_dist_list)

def mean_dist_to_center(X, label):
    clusters = set(label)
    inclust_dist_list = []
    for cluster_i in clusters:
        cluster_i_idx = np.where(label == cluster_i)
        cluster_i_mean = np.mean(X[cluster_i_idx], axis=0, keepdims=True)
        inclust_dist = np.mean(distance.cdist(X[cluster_i_idx], cluster_i_mean))
        inclust_dist_list.append(inclust_dist)
    return 1/np.mean(inclust_dist_list)


In [3]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from transformer_pipeline import drop_rows, get_features_transformer


df = pd.read_csv("data/movie_statistic_dataset.csv")
df = drop_rows(df)

AG_transformer = make_pipeline(get_features_transformer(PowerTransformer(method="box-cox"),
                                          PowerTransformer(method="box-cox"),
                                          StandardScaler(),
                                          StandardScaler()), MinMaxScaler())
AG_X = AG_transformer.fit_transform(df)

# Hierarchcal clustering tuning

In [4]:
from sklearn.cluster import AgglomerativeClustering
from scipy.stats import uniform

In [28]:
model = AgglomerativeClustering()
param_dists = {"n_clusters": [None],
               "linkage": ["ward"],
               "distance_threshold": uniform(10,20)}


In [29]:
from sklearn.metrics import silhouette_score
def scorer(estimator, X):
    labels = estimator.fit_predict(X)
    return silhouette_score(X,labels)

In [32]:

from sklearn.model_selection import RandomizedSearchCV

tester = RandomizedSearchCV(estimator=model,
                            param_distributions= param_dists,
                            random_state=420, 
                            scoring= scorer,
                            n_iter= 20,
                            verbose=True) 

In [33]:
random_result = tester.fit(AG_X)
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


Traceback (most recent call last):
  File "/Users/krzysztof/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 969, in _score
    scores = scorer(estimator, X_test, **score_params)
  File "/var/folders/gp/n7ttdtg56pg0s6ydv9zj9wth0000gn/T/ipykernel_21379/3277683570.py", line 4, in scorer
    return silhouette_score(X,labels)
  File "/Users/krzysztof/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/Users/krzysztof/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/cluster/_unsupervised.py", line 141, in silhouette_score
    return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
  File "/Users/krzysztof/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 186, in wrapper
    return func(*args, **kwargs)
  File "/Users/krzysztof/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/cluster/_unsupervised.py"

Best: 0.175636 using {'distance_threshold': 14.742643994562748, 'linkage': 'ward', 'n_clusters': None}
