In [None]:
import numpy as np
%load_ext autoreload
%autoreload 2

In [None]:
#%load_ext memory_profiler

In [None]:
from importlib import reload

from hydra import initialize, compose

from src.unit_proccessing import *


In [None]:
import pandas as pd

In [None]:
def get_df_save_path(fc: UnitDataProcessing, fname: str):
    """Build the location of a processed-data file for the first configured survey.

    The result is
    ``<config.data.raw>/<config.surveys[0]>/<config.survey_version[0]>/processed_data/<fname>``;
    only the first entry of ``surveys`` and of ``survey_version`` is used.

    Args:
        fc: Processing object whose ``config`` attribute provides ``data.raw``,
            ``surveys`` and ``survey_version``.
        fname: Name of the file inside the ``processed_data`` directory.

    Returns:
        The joined path, as produced by ``os.path.join``.
    """
    # Imported locally: the notebook never imports `os` itself and otherwise
    # depends on it leaking in through `from src.unit_proccessing import *`.
    import os

    cfg = fc.config
    return os.path.join(cfg.data.raw, cfg.surveys[0],
                        cfg.survey_version[0], 'processed_data', fname)

def save_df(fc: UnitDataProcessing, df: pd.DataFrame, fname: str) -> None:
    """Pickle ``df`` to the processed-data path resolved for ``fname``.

    Args:
        fc: Processing object handed to ``get_df_save_path`` to resolve the path.
        df: Frame to serialise.
        fname: Target file name.
    """
    pd.to_pickle(df, get_df_save_path(fc, fname))

def load_df(fc: UnitDataProcessing, fname: str) -> pd.DataFrame:
    """Read back a pickled frame from the processed-data path resolved for ``fname``.

    Args:
        fc: Processing object handed to ``get_df_save_path`` to resolve the path.
        fname: Name of the pickle file to read.

    Returns:
        Whatever object ``pd.read_pickle`` restores from that file.
    """
    # Unpickling can execute arbitrary code: only point this at trusted files.
    pickle_location = get_df_save_path(fc, fname)
    restored = pd.read_pickle(pickle_location)
    return restored




In [None]:
# Compose the Hydra configuration `main.yaml` found under `../configuration`.
# The resulting `config` object is what the processing class is built from in
# the next cell.
with initialize(config_path='../configuration', version_base='1.1'):
    config = compose(config_name='main.yaml')

In [None]:
# Processing object built from the composed config. It is passed to `load_df`
# below, which reads its `config` attribute to resolve file locations.
features_class = UnitDataProcessing(config)

In [None]:
# Load the three previously pickled frames from the `processed_data` directory
# (the path is resolved by `get_df_save_path`). Only `df_unit_score` is used by
# the clustering cells shown below.
df_item = load_df(features_class, 'df_item.pkl')
df_unit = load_df(features_class, 'df_unit.pkl')
df_unit_score = load_df(features_class, 'df_unit_score.pkl')

In [None]:
import utils.alg_test_utils as alg_test_utils
from sklearn.model_selection import train_test_split


#reload(alg_test_utils)

In [None]:
# Build the feature matrix shared by the clustering experiments below.
# 1. `alg_test_utils.drop_all_na` filters the score frame
#    (presumably removing rows/columns that are entirely NA - TODO confirm).
# 2. Only columns whose name starts with 's__' are kept as features.
# 3. The selected values are scaled with `alg_test_utils.scale_data`.
df_for_clustering = alg_test_utils.drop_all_na(df_unit_score)
keep_cols = [col for col in df_for_clustering.columns if col.startswith('s__')]

# `data_for_clustering` is bound exactly once, to the scaled matrix, so the name
# can never refer to unscaled data after a partial run. `.to_numpy()` is the
# pandas-recommended replacement for `.values`.
data_for_clustering = alg_test_utils.scale_data(df_for_clustering[keep_cols].to_numpy())

In [181]:
# Experiment settings shared by the two experiment cells below.
ITERATIONS = 5    # number of repetitions of each experiment
TEST_SIZE = 0.15  # passed as `test_size` to train_test_split; the held-out part is discarded
# These two assignments overwrite attributes of the imported module itself.
# NOTE(review): presumably `fit_cluster` reads them as module globals - confirm.
# NOTE(review): with `%autoreload 2` active, a reload of alg_test_utils may reset
# them to the module's own values - confirm and re-run this cell if so.
alg_test_utils.N_CLUSTERS = 2
alg_test_utils.RANDOM_STATE = 321
# One stateful generator shared by every split/transform below: each call
# advances it, so results depend on the order in which cells are executed.
randgen = np.random.RandomState(alg_test_utils.RANDOM_STATE)




In [182]:
"""
This code checks the performance of the algorithms on different random subsets of the data. Performance is measured using classical metrics for clustering algorithms; see the alg_test_utils source file and the sklearn documentation to learn more about the metrics used. Since a different subset of the data is drawn at each iteration, algorithms can only be compared within the same iteration. Once the array of all pairwise scores is obtained, one should probably compute a mean value for each pair across iterations.
"""

# Experiment 1: agreement between clustering algorithms on random subsamples.
#
# Every iteration draws a fresh train split of `data_for_clustering`, fits each
# class listed in `alg_test_utils.CLUSTER_CLASSES` on it and stores the labels.
# Afterwards every pair of algorithms is scored within the same iteration using
# `alg_test_utils.calc_pair_metrics`.

n_algorithms = len(alg_test_utils.CLUSTER_CLASSES)
n_metrics = len(alg_test_utils.METRIC_SCORES)

# Probe split: only its row count is needed, to size the label buffer. The
# splits inside the loop use the same data and TEST_SIZE, so they are expected
# to have the same number of rows. Note that this call also advances `randgen`.
train_data, _ = train_test_split(data_for_clustering, test_size=TEST_SIZE, random_state=randgen)

# res_labels[i, j] -> labels assigned by algorithm j in iteration i.
res_labels = np.empty((ITERATIONS, n_algorithms, train_data.shape[0]), dtype=np.int32)

# todo add random label assignment to compare scores

# One record per iteration: the data used, the settings, and each algorithm's labels.
res = []
for i in range(ITERATIONS):
    train_data, _ = train_test_split(data_for_clustering, test_size=TEST_SIZE, random_state=randgen)
    d = dict(iteration=i, data=train_data, n_clusters=alg_test_utils.N_CLUSTERS, random_state=alg_test_utils.RANDOM_STATE)

    for j, cluster_class in enumerate(alg_test_utils.CLUSTER_CLASSES):
        cls = alg_test_utils.fit_cluster(train_data, cluster_class)

        labels = cls.labels_

        d[f'{cluster_class.__name__}_labels'] = labels
        res_labels[i, j] = labels
        #alg_test_utils.plot_data_pca(train_data, n_components=10,labels=labels, pc_start=0, pc_end=1)

    res.append(d)

# res_scores[i, j1, j2] -> metric vector comparing algorithm j1 with algorithm j2
# on the labels both produced in iteration i.
res_scores = np.empty((ITERATIONS, n_algorithms, n_algorithms, n_metrics), dtype=np.float64)
for i in range(ITERATIONS):
    for j1 in range(n_algorithms):
        labels_a = res_labels[i, j1]
        # j2 indexes the second *algorithm* axis (shape[2]). The previous bound,
        # shape[3], is the number of metrics and was only correct when both
        # lengths happened to coincide.
        for j2 in range(n_algorithms):
            labels_b = res_labels[i, j2]

            pair_scores = alg_test_utils.calc_pair_metrics(labels_a, labels_b)

            res_scores[i, j1, j2] = pair_scores

In [197]:
"""
This code checks the performance of the algorithms on the same data at every iteration, to which a transformation function is applied each time. Hence comparisons between iterations make sense here.
"""

# Experiment 2: every iteration clusters a transformed copy of the *full*
# `data_for_clustering`, so label vectors have one entry per original row and
# can be compared across iterations as well as across algorithms.

n_algos = len(alg_test_utils.CLUSTER_CLASSES)

# res_labels[run, algo] -> labels produced by that algorithm in that run.
res_labels = np.empty((ITERATIONS, n_algos, data_for_clustering.shape[0]), dtype=np.int32)

# todo add random label assignment to compare scores

# One record per run: the transformed data, the settings, and each algorithm's labels.
res = []
for run_idx in range(ITERATIONS):
    train_data = alg_test_utils.transform_data(data_for_clustering, random_state=randgen)
    run_record = {
        'iteration': run_idx,
        'data': train_data,
        'n_clusters': alg_test_utils.N_CLUSTERS,
        'random_state': alg_test_utils.RANDOM_STATE,
    }

    for algo_idx, cluster_class in enumerate(alg_test_utils.CLUSTER_CLASSES):
        labels = alg_test_utils.fit_cluster(train_data, cluster_class).labels_
        run_record[f'{cluster_class.__name__}_labels'] = labels
        res_labels[run_idx, algo_idx] = labels

    res.append(run_record)

# res_scores[i1, j1, i2, j2] -> metric vector comparing (run i1, algorithm j1)
# with (run i2, algorithm j2).
res_scores = np.empty(
    (ITERATIONS, n_algos, ITERATIONS, n_algos, len(alg_test_utils.METRIC_SCORES)),
    dtype=np.float64,
)
# np.ndindex walks the first four axes in the same order as four nested loops
# (last index varying fastest).
for i1, j1, i2, j2 in np.ndindex(*res_scores.shape[:4]):
    res_scores[i1, j1, i2, j2] = alg_test_utils.calc_pair_metrics(
        res_labels[i1, j1], res_labels[i2, j2]
    )

In [198]:
# Show the pairwise score array from the cell above. Axis order:
# (iteration_a, algorithm_a, iteration_b, algorithm_b, metric).
# NOTE(review): this prints the whole 5-D array; a summary such as
# `res_scores.shape` or a mean over the iteration axes would be easier to read.
res_scores

array([[[[[1.        , 0.35944515, 1.        , 1.        ],
          [0.99709302, 0.34867335, 0.97003215, 0.97808169],
          [1.        , 0.35944515, 1.        , 1.        ],
          [0.70969585, 0.00761056, 0.02117308, 0.03369797]],

         [[1.        , 0.35944515, 1.        , 1.        ],
          [0.99709302, 0.34867335, 0.97003215, 0.97808169],
          [1.        , 0.35944515, 1.        , 1.        ],
          [0.70969585, 0.00761056, 0.02117308, 0.03369797]],

         [[1.        , 0.35944515, 1.        , 1.        ],
          [0.99709302, 0.34867335, 0.97003215, 0.97808169],
          [1.        , 0.35944515, 1.        , 1.        ],
          [0.70969585, 0.00761056, 0.02117308, 0.03369797]],

         [[1.        , 0.35944515, 1.        , 1.        ],
          [0.99709302, 0.34867335, 0.97003215, 0.97808169],
          [1.        , 0.35944515, 1.        , 1.        ],
          [0.70969585, 0.00761056, 0.02117308, 0.03369797]],

         [[1.        , 0.3594451

In [195]:
# NOTE(review): `arr` is not defined in any cell shown here, and this cell's
# execution count (195) is lower than that of the cells above it, so it will
# fail on Restart & Run All. It looks like a leftover scratch check of column
# selection by index list (column 0 is selected twice) - define `arr` here or
# remove the cell.
arr[:,[0,1,2,0]]

array([[0, 1, 0, 0],
       [1, 0, 1, 1],
       [0, 0, 1, 0],
       [1, 1, 1, 1],
       [1, 1, 0, 1]])