In [None]:


import pandas as pd
import numpy as np
import seaborn as sns

from itertools import combinations, permutations
from matplotlib import pyplot as plt
from scipy.stats import entropy
from scipy.special import kl_div
from utils import get_data_train



In [None]:
# Load the training data via the project helper.
df = get_data_train()
n_bins = 5 # number of histogram bins; a different value can be chosen here
            # (i.e. the number of intervals over which the divergence is computed)

In [None]:
# Distinct activity labels found in the data.
activities = np.unique(df['activity'])

# One slot per activity, initialised to None.
activities_dict = dict.fromkeys(activities)
activities_dict


In [None]:
def _density_histogram(column):
    """Return the density-normalised histogram heights of one column over [-1, 1]."""
    heights, _edges = np.histogram(column, bins=n_bins, density=True, range=(-1, 1))
    return heights


# For every activity, replace the placeholder with a frame that holds one
# histogram (n_bins rows) per numerical feature column.
for activity in activities_dict:
    activity_rows = df[df['activity'] == activity]
    numerical_features = activity_rows.drop(columns=['activity', 'subject'])
    activities_dict[activity] = numerical_features.apply(_density_histogram)

In [None]:


# Column labels "<activity A>-<activity B>" for every ordered pair of distinct activities.
combi_colnames = ['-'.join(pair) for pair in permutations(activities, 2)]

numerical_features = df.drop(['activity', 'subject'], axis='columns').columns

# Every row is labelled with a feature name and holds that same name in each
# column, so a column-wise apply can later look the feature up for each
# activity pair. The frame is built in one step: DataFrame.append was removed
# in pandas 2.0, and the previous hard-coded row width of 30 only matched the
# case of exactly six activities.
combi_df = pd.DataFrame(
    {colname: numerical_features.to_numpy() for colname in combi_colnames},
    index=numerical_features,
)



In [None]:
def replace_inf_and_sum(feature1, feature2, x):
    """Sum the element-wise ``kl_div`` terms of two activities, capping each term at 100.

    feature1, feature2 -- keys into ``activities_dict``; their entries are
        passed to ``kl_div`` as first and second argument respectively.
    x -- label selected from both entries before the divergence is computed.

    Every term greater than 100 (which includes ``inf``) contributes exactly 100.
    """
    first = activities_dict[feature1][x]
    second = activities_dict[feature2][x]
    capped_terms = (100 if term > 100 else term for term in kl_div(first, second))
    return sum(capped_terms)

def count_inf(feature1, feature2, x):
    """Count how many element-wise ``kl_div`` terms of two activities are infinite.

    feature1, feature2 -- keys into ``activities_dict``; their entries are
        passed to ``kl_div`` as first and second argument respectively.
    x -- label selected from both entries before the divergence is computed.
    """
    divergence = kl_div(activities_dict[feature1][x], activities_dict[feature2][x])
    infinite_mask = np.isinf(divergence)  # the infs are counted here
    return sum(infinite_mask)

def take_median(feature1, feature2, x):
    """Return the median of the element-wise ``kl_div`` terms of two activities.

    feature1, feature2 -- keys into ``activities_dict``; their entries are
        passed to ``kl_div`` as first and second argument respectively.
    x -- label selected from both entries before the divergence is computed.
    """
    first = activities_dict[feature1][x]
    second = activities_dict[feature2][x]
    divergence = kl_div(first, second)
    return np.median(divergence)

In [None]:
def apply_to_column(column):
    """Score every cell of a column whose name has the form "<first>-<second>".

    The column name is split on '-' into the two keys handed to the scoring
    function; each cell value is passed on as its third argument.
    """
    first_key, second_key = column.name.split('-')

    def score(x):
        # A different scoring function can be chosen here.
        return replace_inf_and_sum(first_key, second_key, x)

    return column.apply(score)


# Run apply_to_column on every column of combi_df; the outcome keeps combi_df's row and column labels.
result = combi_df.apply(apply_to_column)

In [None]:
# Rank rows by their single largest value and keep the 20 top row labels.
row_maxima = result.apply(lambda row: max(row), axis=1)
chosen_best = row_maxima.sort_values(ascending=False).head(20).index.to_list()

# Rank rows by the sum of their values and keep the 20 top row labels.
row_totals = result.apply(lambda row: sum(row), axis=1)
avg_best = row_totals.sort_values(ascending=False).head(20).index.to_list()

In [None]:
def plot_var(varname):
    """Draw a KDE plot of column ``varname`` of ``df``, coloured by 'activity', and show it."""
    sns.displot(data=df, x=varname, hue='activity', kind='kde')
    plt.show()

In [None]:
# One KDE plot per selected feature.
for varname in chosen_best: # or alternatively avg_best
    plot_var(varname)

In [None]:
# Display the labels selected by maximum score.
chosen_best

In [None]:
# Selected feature labels followed by the two non-numerical columns.
best_with_categorical = chosen_best + ['activity', 'subject']

In [None]:
# Restrict the data to the selected features plus 'activity' and 'subject'.
df_chosen = pd.DataFrame( df[ best_with_categorical])

In [None]:
# Display the reduced frame.
df_chosen

In [None]:
# One frame per activity (in the order of `activities`), holding only the
# selected feature columns. Iterating over `activities` directly avoids the
# previous hard-coded count of 6, which raised IndexError with fewer
# activities and silently ignored any beyond the sixth. `drop` returns a new
# frame, so df_chosen itself is left untouched.
df_chosen_sep_by_act = [
    df_chosen[df_chosen['activity'] == activity].drop(columns=['activity', 'subject'])
    for activity in activities
]

In [None]:
from scipy.cluster import hierarchy

# Average-linkage hierarchical clustering of the first per-activity frame, drawn as a dendrogram.
# NOTE(review): the previous comment said a smaller sample is generated for
# readability, but no subsampling happens here — the whole frame is linked.
Z = hierarchy.linkage( df_chosen_sep_by_act[0], method='average')
plt.figure(figsize=(10, 5), dpi= 200, facecolor='w', edgecolor='k')
hierarchy.dendrogram(Z)
plt.show()

In [None]:
# And in practice it looks like this:
def count_clustering_scores(X, cluster_num, model, score_fun):
    """Fit ``model`` for one or several cluster counts and score each labelling.

    The function is deliberately generic: ``model`` and ``score_fun`` are
    passed in as callables.

    Parameters
    ----------
    X
        Data handed unchanged to ``fit_predict`` and to ``score_fun``.
    cluster_num
        Either a single integer (Python ``int`` or NumPy integer) or an
        iterable of integers.
    model
        Callable invoked as ``model(n_clusters=k)``; the returned object must
        provide ``fit_predict(X)``.
    score_fun
        Callable invoked as ``score_fun(X, labels)``.

    Returns
    -------
    A single score when ``cluster_num`` is a single integer, otherwise a list
    with one score per requested cluster count, in iteration order.
    """
    # NumPy integers (e.g. an element of np.arange) are not ``int`` instances;
    # accept them as scalars too instead of failing when trying to iterate.
    single_value = isinstance(cluster_num, (int, np.integer))
    cluster_num_iter = [cluster_num] if single_value else cluster_num

    scores = []
    for k in cluster_num_iter:
        labels = model(n_clusters=k).fit_predict(X)
        scores.append(score_fun(X, labels))

    return scores[0] if single_value else scores

In [None]:
# Display the frame stored at index 1 of the per-activity list.
df_chosen_sep_by_act[1]

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Some metrics do not work when there is only one cluster, so start at 2.
cluster_num_seq = list(range(2, 11))

# Placeholder: one empty list per per-activity frame.
plots = [[] for _ in df_chosen_sep_by_act]

# For each per-activity frame: the KMeans silhouette score at every candidate cluster count.
silhouette_vec = [
    count_clustering_scores(activity_subset, cluster_num_seq, KMeans, silhouette_score)
    for activity_subset in df_chosen_sep_by_act
]

In [None]:
# One long-format frame per activity: silhouette score, cluster count, activity label.
frames = []
for activity, scores in zip(activities, silhouette_vec):
    # Stacking both sequences into one array gives a common dtype for the two columns.
    frame = pd.DataFrame(
        np.column_stack([scores, cluster_num_seq]),
        columns=["silhouette", "no of clusters"])
    frame["acitivity"] = activity
    frames.append(frame)

In [None]:
# Stack the per-activity frames vertically with a fresh 0..n-1 index.
stacked = pd.concat( frames, ignore_index=True)

In [None]:
import seaborn as sns
# Figure size applied through seaborn's global settings.
sns.set(rc={"figure.figsize": (12, 8)})

# Silhouette score against cluster count, one line per value of the "acitivity" column.
sns.lineplot(x="no of clusters", y="silhouette", hue="acitivity", data=stacked)



In [None]:
subjects = np.unique(df["subject"])
n_subjects = len(subjects)

df_by_subject = []             # per subject: feature rows plus the assigned "cluster" label
clusters = []                  # per subject: the fitted KMeans model
df_by_subject_by_cluster = []  # per subject: one NumPy array of feature rows per cluster label

for subject in subjects:
    subject_features = (
        df[df["subject"] == subject]
        .copy()
        .drop(columns=['activity', 'subject'])
    )

    # Cluster this subject's rows into 6 groups with a fixed seed.
    subject_model = KMeans(n_clusters=6, random_state=1618).fit(subject_features)
    subject_features["cluster"] = subject_model.labels_

    # Split the rows by assigned label and strip the label column again.
    rows_per_cluster = [
        subject_features[subject_features["cluster"] == label]
        .drop(columns=["cluster"])
        .to_numpy()
        for label in range(6)
    ]

    df_by_subject.append(subject_features)
    clusters.append(subject_model)
    df_by_subject_by_cluster.append(rows_per_cluster)

In [None]:
# Display the cluster centres of the model stored at index 0.
clusters[0].cluster_centers_

In [None]:
from scipy.spatial import distance
import statistics

# distances[i][j]: for every cluster centre of subject i take the Euclidean
# distance to the closest centre of subject j, then average those minima.
distances = []
for model_i in clusters:
    row = []
    for model_j in clusters:
        nearest = [
            min(
                distance.euclidean(center_i, center_j)
                for center_j in model_j.cluster_centers_
            )
            for center_i in model_i.cluster_centers_
        ]
        row.append(statistics.mean(nearest))
    distances.append(row)

In [None]:
from scipy.spatial import distance
import statistics

# distances2[i][j]: for every cluster centre of subject i take the Euclidean
# distance to the closest centre of subject j, then keep the largest of those minima.
distances2 = []
for model_i in clusters:
    row = []
    for model_j in clusters:
        nearest = [
            min(
                distance.euclidean(center_i, center_j)
                for center_j in model_j.cluster_centers_
            )
            for center_i in model_i.cluster_centers_
        ]
        row.append(max(nearest))
    distances2.append(row)

In [None]:
# Heatmap of the averaged nearest-centre distances between subjects.
sns.heatmap( distances)

In [None]:
# Heatmap of the maximal nearest-centre distances between subjects.
sns.heatmap( distances2)

## KMeans

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of KMeans on all numerical columns for k = 2..10.
cluster_num_seq = list(range(2, 11))
full_features = df.drop(columns=['subject', 'activity'])
silhouette_vec = count_clustering_scores(
    full_features, cluster_num_seq, KMeans, silhouette_score)

plt.plot(cluster_num_seq, silhouette_vec, 'bx-')
plt.title('Clustering on a full data set')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.show()

In [None]:
# Silhouette score of KMeans restricted to the `chosen_best` columns.
silhouette_vec_chosen = count_clustering_scores(df[chosen_best], cluster_num_seq, KMeans, silhouette_score)
# Plot the scores computed just above; previously the full-dataset scores
# (silhouette_vec) were drawn under this title by mistake.
plt.plot(cluster_num_seq, silhouette_vec_chosen, 'bx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Clustering on chosen best columns')
plt.show()

In [None]:
# Silhouette score of KMeans restricted to the `avg_best` columns.
silhouette_vec_avg_best = count_clustering_scores(df[avg_best], cluster_num_seq, KMeans, silhouette_score)
# Plot the scores computed just above; previously the full-dataset scores
# (silhouette_vec) were drawn under this title by mistake.
plt.plot(cluster_num_seq, silhouette_vec_avg_best, 'bx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Clustering on avarage best columns')
plt.show()

In [None]:
# Side-by-side table of the three silhouette curves.
silhouette_columns = {
    'Full dataset': silhouette_vec,
    '20 best chosen columns': silhouette_vec_chosen,
    '20 best in average columns': silhouette_vec_avg_best,
}
tmp = pd.DataFrame(silhouette_columns)
display(tmp)

# Append the cluster counts as the last column 'k' and plot all three curves against it.
tmp = tmp.assign(k=cluster_num_seq)
tmp.plot(x='k', style=['r-.', 'b-', 'g--'], title='K-Means clustering', ylabel='Silhuette score')

In [None]:
!pip install yellowbrick

In [None]:
from yellowbrick.cluster import (
    SilhouetteVisualizer,
    KElbowVisualizer)

In [None]:
# Elbow plot for KMeans on all numerical columns.
# NOTE(review): yellowbrick appears to treat a 2-tuple k as a half-open range,
# so this may stop at 9 clusters while the silhouette cells go up to 10 — confirm.
km = KMeans(random_state=123)
visualizer = KElbowVisualizer(km, k=(2,10))
visualizer.fit(df.drop(columns=['subject', 'activity']))
visualizer.show()

In [None]:
# Elbow plot on the avg_best columns; reuses the `km` estimator created in an earlier cell.
visualizer = KElbowVisualizer(km, k=(2,10))
visualizer.fit(df[avg_best])
visualizer.show()

In [None]:
# Elbow plot on the chosen_best columns; reuses the `km` estimator created in an earlier cell.
visualizer = KElbowVisualizer(km, k=(2,10))
visualizer.fit(df[chosen_best])
visualizer.show()

In [None]:
# Fit a 4-cluster KMeans on all numerical columns and draw its silhouette plot.
full_features = df.drop(columns=['subject', 'activity'])
all_cols_clusterer = KMeans(random_state=123, n_clusters=4)
all_cols_clusterer.fit(full_features)

# The visible import cell brings in only the SilhouetteVisualizer class; the
# lowercase silhouette_visualizer helper called here before was never imported
# and raised NameError. is_fitted=True tells the visualizer that the estimator
# has already been fitted above.
visualizer = SilhouetteVisualizer(all_cols_clusterer,
                                  colors='yellowbrick',
                                  is_fitted=True)
visualizer.fit(full_features)
visualizer.show()

In [None]:
fig, ax = plt.subplots(3, 1, figsize=(15,12), sharex=True)

# One silhouette panel per feature set, each with its own freshly seeded 4-cluster KMeans.
panels = [
    ('Full dataset', df.drop(columns=['subject', 'activity'])),
    ('20 best chosen columns', df[chosen_best]),
    ('20 best in average columns', df[avg_best]),
]

for panel_ax, (panel_title, panel_features) in zip(ax, panels):
    km = KMeans(n_clusters=4, max_iter=100, random_state=42)
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel_ax)
    visualizer.fit(panel_features)
    visualizer.finalize()
    # Set the title after finalize(), as in the original order of calls.
    panel_ax.set_title(panel_title)


## Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Silhouette score of agglomerative clustering on all numerical columns for k = 2..10.
cluster_num_seq = list(range(2, 11))
full_features = df.drop(columns=['subject', 'activity'])
silhouette_vec = count_clustering_scores(
    full_features, cluster_num_seq, AgglomerativeClustering, silhouette_score)

plt.plot(cluster_num_seq, silhouette_vec, 'bx-')
plt.title('Agglomerative Clustering on full dataset')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.show()

In [None]:
# Silhouette score of agglomerative clustering restricted to the `avg_best` columns.
silhouette_vec_best = count_clustering_scores(df[avg_best],
                                         cluster_num_seq, AgglomerativeClustering,
                                         silhouette_score)

plt.plot(cluster_num_seq, silhouette_vec_best, 'bx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
# The title previously claimed "full dataset" although only df[avg_best] is clustered here.
plt.title('Agglomerative Clustering on 20 best in average columns')
plt.show()

In [None]:
fig, ax = plt.subplots(5, 2, figsize=(25,24), sharex=True)
for i in range(2,12):
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
    q, mod = divmod(i, 2)

    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(df[chosen_best])