# IMPORT STATEMENTS

In [1]:
# enter the directory containing CloMu.py
%cd CloMu-main

# import some useful libraries
import os
import numpy as np
import CloMu
import matplotlib.pyplot as plt
import random as rd
import time

# import the function that will be used to train an instance of the CloMu model
from CloMu import trainModel

/Users/paolobresolin/Desktop/current working directory/code/CloMu-main


# UTILITY FUNCTIONS

In [2]:
def remove_long_trees(max_length, dataset, path):
    """Keep only the patients whose trees are not longer than ``max_length`` and save them.

    The length of a patient's trees is read from its first tree only
    (``len(patient[0])``); all trees of a patient are expected to have the same length.

    max_length: maximum length of a tree, i.e., maximum number of different mutations a
                tree can contain.
    dataset: numpy array of patients. Each patient is a list of trees and each tree is a
             list of edges.
    path: where the filtered dataset is written as a .npy file, in the same format as
          dataset.

    Returns the filtered dataset as a numpy object array, with the retained patients in
    their original order.
    """
    # select the patients to retain, preserving their order
    kept_patients = [patient for patient in dataset if len(patient[0]) <= max_length]

    # object array with exactly one slot per retained patient
    # (any trailing dimensions are inherited from dataset)
    filtered = np.empty(shape=(len(kept_patients),) + dataset.shape[1:], dtype="object")
    for position, patient in enumerate(kept_patients):
        filtered[position] = patient

    # persist the result before handing it back
    np.save(path, filtered)
    return filtered

In [3]:
def shuffle_dataset(input_dataset, path, seed):
    """Return a reproducibly shuffled copy of ``input_dataset`` and save it to ``path``.

    input_dataset: the dataset to be shuffled (numpy array of patients).
    path: where the shuffled dataset is written as a .npy file, in the same format as
          input_dataset.
    seed: value passed to the global ``random`` generator so the permutation can be
          reproduced. Note that this reseeds the module-level generator.

    Returns the shuffled dataset as a numpy object array with the shape of input_dataset.
    """
    # seeded random permutation of the patient positions
    permutation = list(range(len(input_dataset)))
    rd.seed(seed)
    rd.shuffle(permutation)

    # slot `target` of the output receives the patient found at `source` in the input
    reordered = np.empty(shape=input_dataset.shape, dtype="object")
    for target, source in enumerate(permutation):
        reordered[target] = input_dataset[source]

    np.save(path, reordered)

    return reordered

In [4]:
def split_two_n(dataset, n, random_seed):
    """Split ``dataset`` into ``n`` randomly chosen patients and all the remaining ones.

    dataset: input dataset to be split (indexable collection of patients).
    n: number of patients to be included in the first set. ``random.sample`` raises
       ValueError if n is negative or larger than ``len(dataset)``.
    random_seed: seed for the global ``random`` generator, for the sake of
                 reproducibility. Note that this reseeds the module-level generator.

    Returns two 1-D numpy object arrays:
      ~ n_set contains n patients chosen uniformly at random (without replacement) from
        dataset, in the order in which they were sampled;
      ~ other_set contains all other patients in dataset, in their original order.
    Each array element is one patient object, even when all patients happen to have the
    same number of trees and edges.
    """

    def to_patient_array(patients):
        # np.array(patients, dtype=object) would turn equally-shaped patients into an
        # N-D array of strings; filling a pre-sized 1-D array keeps one patient per slot
        packed = np.empty(len(patients), dtype=object)
        for position, patient in enumerate(patients):
            packed[position] = patient
        return packed

    # choose uniformly at random n indices of patients in dataset without replacement
    rd.seed(random_seed)
    indices_n_set = rd.sample(range(len(dataset)), n)

    # set copy for constant-time membership tests; the list keeps the sampling order
    chosen = set(indices_n_set)

    n_set = [dataset[i] for i in indices_n_set]
    other_set = [dataset[i] for i in range(len(dataset)) if i not in chosen]

    return to_patient_array(n_set), to_patient_array(other_set)

In [5]:
def compute_n_trees(dataset):
    """Count the trees in a dataset of patients.

    dataset: iterable of patients, where each patient is a sized collection of trees.

    Returns the total number of trees, i.e. the sum of ``len(patient)`` over all patients
    (0 for an empty dataset).
    """
    return sum(len(patient) for patient in dataset)

In [6]:
def concatenate_sets(set_1, set_2):
    """Concatenate two collections of patients into one 1-D numpy object array.

    set_1: base collection of patients.
    set_2: collection of patients to be appended after set_1.

    Returns a 1-D numpy object array with the patients of set_1 followed by the patients
    of set_2. Each element is one patient object, even when all patients happen to have
    the same number of trees and edges.
    """
    patients = list(set_1) + list(set_2)

    # np.array(patients, dtype=object) would turn equally-shaped patients into an N-D
    # array of strings; filling a pre-sized 1-D array keeps one patient per slot
    concatenation = np.empty(len(patients), dtype=object)
    for position, patient in enumerate(patients):
        concatenation[position] = patient

    return concatenation

In [7]:
def create_folder(path):
    """Create the folder ``path`` unless something already exists at that location.

    path: path to the folder to create. Missing parent folders are created as well.

    Returns True if the folder has been created because it did not exist before and
    False otherwise. In both cases a message naming the path is printed.
    """
    if os.path.exists(path):
        print("Folder " + str(path) + " already exists.")
        return False

    # makedirs (rather than mkdir) so a missing parent folder is not an error
    os.makedirs(path)
    print("Folder " + str(path) + " created successfully.")
    return True

In [8]:
def train_on_clustering(clustering, test_cluster, dest, n_iter=1000, max_tree_length=10):
    """Train one CloMu model per cluster and collect the probabilities it writes out.

    For every cluster, the test set is appended after the cluster's patients, the
    combined dataset is saved, and ``trainModel`` is called with ``trainSize`` equal to
    the number of patients in the cluster, so that only those patients are used for
    training.

    clustering: list of clusters, each one a collection of patients.
    test_cluster: test set to be appended to each cluster during training.
    dest: path to the directory where all results will be stored. For the cluster at
          position i, a subfolder ``cluster_<i + 1>`` is created with ``datasets`` and
          ``training_results`` inside it.
    n_iter: number of iterations for each training.
    max_tree_length: value passed to ``trainModel`` as ``maxM``.

    Returns two lists with one entry per cluster:
      ~ the first holds the probabilities read from the probability file for the trees
        of the cluster itself;
      ~ the second holds the probabilities assigned by the same model to the trees of
        the test set.
    """
    # check whether the destination folder exists and create it otherwise
    create_folder(dest)

    clusters_prob = []
    test_prob = []

    # number of trailing entries of each probability file that belong to the test set
    n_trees_test_cl = compute_n_trees(test_cluster)

    for i in range(len(clustering)):

        # folders for the datasets and training results of the current cluster
        cluster_i_folder = os.path.join(dest, "cluster_" + str(i + 1))
        create_folder(cluster_i_folder)
        datasets_folder = os.path.join(cluster_i_folder, "datasets")
        create_folder(datasets_folder)
        training_folder = os.path.join(cluster_i_folder, "training_results")
        create_folder(training_folder)

        # NOTE(review): file names use the 0-based index i while the folder name uses
        # i + 1; kept as is so that existing result files keep their names
        file_prefix = "cluster_" + str(i)

        # append the test cluster to the current cluster and save the resulting dataset
        complete_cluster = concatenate_sets(clustering[i], test_cluster)
        complete_cluster_path = os.path.join(datasets_folder, file_prefix + "_and_test_set.npy")
        np.save(complete_cluster_path, complete_cluster)

        # output files of the training
        model_file = os.path.join(training_folder, file_prefix + "_model.pt")
        prob_file = os.path.join(training_folder, file_prefix + "_prob.npy")
        mutation_file = os.path.join(training_folder, file_prefix + "_mutations.npy")

        # train an instance of the CloMu model on the cluster considered in this iteration
        trainModel(
            [complete_cluster_path],
            model_file,
            prob_file,
            mutation_file,
            patientNames='',
            inputFormat="raw",
            infiniteSites=False,
            trainSize=len(clustering[i]),
            maxM=max_tree_length,
            regularizeFactor="default",
            iterations=n_iter,
            verbose=True,
        )

        # assumes the probability file holds one entry per tree, in dataset order, so
        # that the test-set entries are the last n_trees_test_cl ones — TODO confirm
        curr_prob = np.load(prob_file, allow_pickle=True)

        # explicit split position: slicing with -n_trees_test_cl would swap the two
        # parts when the test set contains no trees, because -0 == 0
        split_at = max(len(curr_prob) - n_trees_test_cl, 0)
        clusters_prob.append(curr_prob[:split_at])
        test_prob.append(curr_prob[split_at:])

    return clusters_prob, test_prob


In [9]:
def extract_cluster_indices(path):
    """Read the cluster index of every patient from a RECAP solution .txt file.

    Every line containing the word "cluster" contributes its first
    whitespace-separated word. The first such line is then discarded, because it is
    expected to hold the total number of clusters rather than a patient's index.

    path: path to the output .txt file from the application of RECAP.

    Returns a list of cluster indices as strings, where indices_list[i] is the cluster
    index for patient i in the original dataset given as input to RECAP.
    """
    indices_list = []

    with open(path, 'r') as file:
        for line in file:
            if "cluster" in line:
                # take the whole first word: a single character would truncate
                # indices with more than one digit
                indices_list.append(line.split()[0])

    # remove the first value in the list, because it is the total number of clusters
    return indices_list[1:]

In [10]:
def get_clusters(dataset, cluster_indices):
    """Group the patients of ``dataset`` according to their cluster index.

    dataset: input dataset of patients.
    cluster_indices: list of cluster indices for patients in dataset.
                     cluster_indices[i] is the cluster index for patient dataset[i].
                     Only the first len(cluster_indices) patients are considered.

    Returns a list of clusters, each one a list of patients. Clusters appear in the
    order in which their index is first met in cluster_indices, and patients keep their
    dataset order inside each cluster.
    """
    # cluster index -> patients assigned to it
    members_by_cluster = {}
    for position, label in enumerate(cluster_indices):
        members_by_cluster.setdefault(label, []).append(dataset[position])

    return list(members_by_cluster.values())

# PATHS AND FOLDERS CREATION

In [12]:
# path to the current directory
current_dir = os.getcwd()

# folder that collects everything produced by this notebook, next to the current directory
main_folder = os.path.join(current_dir, "..", "RECAP_clustering_AML")

# folder where to store the datasets
dataset_folder = os.path.join(main_folder, "datasets")

# folder for the clustering results, with one subfolder for each value of K
results_clusterings_folder = os.path.join(main_folder, "results_clusterings")
results_clusterings_k_2_folder = os.path.join(results_clusterings_folder, "K_2")
results_clusterings_k_3_folder = os.path.join(results_clusterings_folder, "K_3")
results_clusterings_k_4_folder = os.path.join(results_clusterings_folder, "K_4")

# folder for the training results, with one subfolder for each value of K
results_trainings_folder = os.path.join(main_folder, "results_trainings")
results_trainings_k_2_folder = os.path.join(results_trainings_folder, "K_2")
results_trainings_k_3_folder = os.path.join(results_trainings_folder, "K_3")
results_trainings_k_4_folder = os.path.join(results_trainings_folder, "K_4")

# create all folders, every parent before its children
for _folder in (
    main_folder,
    dataset_folder,
    results_clusterings_folder,
    results_clusterings_k_2_folder,
    results_clusterings_k_3_folder,
    results_clusterings_k_4_folder,
    results_trainings_folder,
    results_trainings_k_2_folder,
    results_trainings_k_3_folder,
    results_trainings_k_4_folder,
):
    _ = create_folder(_folder)

# path to the input file for training
input_file_AML = os.path.join(current_dir, "data", "realData", "AML.npy")

Folder path already exists.
Folder path already exists.
Folder path already exists.
Folder path already exists.
Folder path already exists.
Folder path already exists.
Folder path already exists.
Folder path already exists.
Folder path already exists.
Folder path already exists.


# DATASET

### Pre-processing

As a pre-processing step, we remove the patients whose trees are too long and then shuffle the resulting dataset.

In [13]:
# random seed for reproducibility of the shuffle
RANDOM_SEED = 27

# maximum number of mutations allowed for each tree, i.e., maximum tree length
maxM = 10

# files where the intermediate versions of the dataset are saved
processed_file = os.path.join(dataset_folder, "processed_AML_dataset.npy")
dataset_file = os.path.join(dataset_folder, "shuffled_AML_dataset.npy")

# load the original dataset, drop the patients with too long trees, then shuffle;
# each helper also saves its result as a .npy file
treeData = np.load(input_file_AML, allow_pickle=True)
treeData = remove_long_trees(maxM, treeData, processed_file)
treeData = shuffle_dataset(treeData, dataset_file, RANDOM_SEED)

# compute the number of trees in the dataset
n_trees_dataset = compute_n_trees(treeData)

# report the number of patients and the number of trees in the dataset
print("Number of patients in the dataset:", len(treeData))
print("Number of trees in the dataset:", n_trees_dataset)

Number of patients in the dataset: 75
Number of trees in the dataset: 105


### Split into training set and test set

In [14]:
# number of patients to be included in the training set: 2/3 of the overall number of patients
# in the dataset (the integer division by 3 is applied before the multiplication by 2)
n_patients_train = len(treeData) // 3 * 2
n_patients_test = len(treeData) - n_patients_train

# files where the two sets and their concatenation are saved
training_set_file = os.path.join(dataset_folder, "training_set_AML.npy")
test_set_file = os.path.join(dataset_folder, "test_set_AML.npy")
treeData_file = os.path.join(dataset_folder, "complete_final_treeData.npy")

# split the dataset into training set and test set and save both
training_set, test_set = split_two_n(treeData, n_patients_train, RANDOM_SEED)
np.save(training_set_file, training_set)
np.save(test_set_file, test_set)

# new complete dataset with test_set appended after training_set
# NOTE: treeData is overwritten here, so running this cell again splits the reordered dataset
treeData = np.concatenate((training_set, test_set))
np.save(treeData_file, treeData)

# compute the number of trees in each set
n_trees_train = compute_n_trees(training_set)
n_trees_test = compute_n_trees(test_set)

# report number of patients and trees in each set
print("Number of patients in the training set:", len(training_set))
print("Number of patients in the test set:", len(test_set))
print()
print("Number of trees in the training set:", n_trees_train)
print("Number of trees in the test set:", n_trees_test)

Number of patients in the training set: 50
Number of patients in the test set: 25

Number of trees in the training set: 76
Number of trees in the test set: 29


# CLUSTER PATIENTS WITH RECAP

## Input dataset

### Collapse mutations at gene level

#### Function

In [25]:
def collapse_gene_level(dataset):
    """Collapse the mutations appearing in the trees of ``dataset`` at gene level.

    Every mutation name is cut right before its first "_" or "." character; names
    containing neither character are left unchanged.

    dataset: input dataset of patients, where each patient is a collection of trees,
             each tree a collection of edges and each edge a collection of mutation
             names (strings).

    Returns a new dataset with the same nesting, built from plain Python lists, where
    mutations are collapsed at gene level. The input is not modified.
    """
    return [
        [
            [
                # cutting at the first "_" and then at the first "." of what is left
                # keeps exactly the characters that precede the earliest of the two
                [mutation.split("_", 1)[0].split(".", 1)[0] for mutation in edge]
                for edge in tree
            ]
            for tree in patient
        ]
        for patient in dataset
    ]

#### Apply the function

In [26]:
# collapse mutations in the training set at gene level
# (the result is a nested Python list, not a numpy array; training_set itself is left unchanged)
collapsed_training_set = collapse_gene_level(training_set)

### Dataset format conversion

#### Function

In [16]:
def convert_to_RECAP(dataset, path):
    """Write ``dataset`` to a .txt file following the format required for RECAP.

    The file starts with the number of patients. Each patient contributes a line with
    its number of trees, and each tree a line with its number of edges followed by one
    line per edge holding the two mutations of the edge separated by a space.

    dataset: patients stored as lists of trees, where each tree is a list of edges and
             each edge holds two mutation names (strings).
    path: path where to store the converted dataset.

    Returns nothing; prints a confirmation once the file has been written.
    """
    with open(path, 'w') as out_file:

        # header: how many patients follow
        out_file.write("{} # patients\n".format(len(dataset)))

        for patient_id, patient in enumerate(dataset):

            # how many trees the current patient has
            out_file.write("{} # trees patient {}\n".format(len(patient), patient_id))

            for tree_id, tree in enumerate(patient):

                # how many edges the current tree has
                out_file.write("{} # edges tree {}\n".format(len(tree), tree_id))

                # one line per edge: the two ordered mutations of the edge
                for edge in tree:
                    out_file.write(" ".join((edge[0], edge[1])) + "\n")

    # print that the file has been created and written
    print("The file has been correctly created and written.")


#### Apply the function to the training set

In [40]:
# write the gene-level version of the training set in RECAP format; this is the file the RECAP commands below read
path_RECAP_dataset = os.path.join(dataset_folder, "RECAP_input_dataset.txt")
convert_to_RECAP(collapsed_training_set, path_RECAP_dataset)
# NOTE: pass training_set instead of collapsed_training_set to cluster on the original (uncollapsed) mutations

The file has been correctly created and written.


## Cluster patients with RECAP

### 50 Restarts

#### K = 2

In [41]:
# clustering with K = 2 and 50 restarts
start_time = time.time()
!./../RECAP/build/recap -k 2 -p "./../RECAP_clustering_AML/results_clusterings/K_2/k2_50" -R "Root" "./../RECAP_clustering_AML/datasets/RECAP_input_dataset.txt"
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_hours = int(elapsed_time // (60 * 60))
elapsed_minutes = int((elapsed_time % (60 * 60)) // 60)
elapsed_seconds = int((elapsed_time % (60 * 60)) % 60)
string_time = str(elapsed_hours) + "h " + str(elapsed_minutes) + "min " + str(elapsed_seconds) + "sec"

print()
print("Time required for clustering with K = 2: " +  string_time)


Time required for clustering with K = 2: 0h 0min 0sec


#### K = 3

In [42]:
# clustering with K = 3 and 50 restarts
start_time = time.time()
!./../RECAP/build/recap -k 3 -p "./../RECAP_clustering_AML/results_clusterings/K_3/k3_50" -R "Root" "./../RECAP_clustering_AML/datasets/RECAP_input_dataset.txt"
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_hours = int(elapsed_time // (60 * 60))
elapsed_minutes = int((elapsed_time % (60 * 60)) // 60)
elapsed_seconds = int((elapsed_time % (60 * 60)) % 60)
string_time = str(elapsed_hours) + "h " + str(elapsed_minutes) + "min " + str(elapsed_seconds) + "sec"

print()
print("Time required for clustering with K = 3: " +  string_time)


Time required for clustering with K = 3: 0h 0min 0sec


#### K = 4

In [43]:
# clustering with K = 4 and 50 restarts
start_time = time.time()
!./../RECAP/build/recap -k 4 -p "./../RECAP_clustering_AML/results_clusterings/K_4/k4_50" -R "Root" "./../RECAP_clustering_AML/datasets/RECAP_input_dataset.txt"
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_hours = int(elapsed_time // (60 * 60))
elapsed_minutes = int((elapsed_time % (60 * 60)) // 60)
elapsed_seconds = int((elapsed_time % (60 * 60)) % 60)
string_time = str(elapsed_hours) + "h " + str(elapsed_minutes) + "min " + str(elapsed_seconds) + "sec"

print()
print("Time required for clustering with K = 4: " +  string_time)


Time required for clustering with K = 4: 0h 0min 0sec


### 1000 Restarts

#### K = 2

In [44]:
# clustering with K = 2 and 1000 restarts
start_time = time.time()
!./../RECAP/build/recap -k 2 -r 1000 -p "./../RECAP_clustering_AML/results_clusterings/K_2/k2" -R "Root" "./../RECAP_clustering_AML/datasets/RECAP_input_dataset.txt"
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_hours = int(elapsed_time // (60 * 60))
elapsed_minutes = int((elapsed_time % (60 * 60)) // 60)
elapsed_seconds = int((elapsed_time % (60 * 60)) % 60)
string_time = str(elapsed_hours) + "h " + str(elapsed_minutes) + "min " + str(elapsed_seconds) + "sec"

print()
print("Time required for clustering with K = 2: " +  string_time)


Time required for clustering with K = 2: 0h 0min 0sec


#### K = 3

In [45]:
# clustering with K = 3 and 1000 restarts
start_time = time.time()
!./../RECAP/build/recap -k 3 -r 1000 -p "./../RECAP_clustering_AML/results_clusterings/K_3/k3" -R "Root" "./../RECAP_clustering_AML/datasets/RECAP_input_dataset.txt"
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_hours = int(elapsed_time // (60 * 60))
elapsed_minutes = int((elapsed_time % (60 * 60)) // 60)
elapsed_seconds = int((elapsed_time % (60 * 60)) % 60)
string_time = str(elapsed_hours) + "h " + str(elapsed_minutes) + "min " + str(elapsed_seconds) + "sec"

print()
print("Time required for clustering with K = 3: " +  string_time)


Time required for clustering with K = 3: 0h 0min 0sec


#### K = 4

In [46]:
# clustering with K = 4 and 1000 restarts
start_time = time.time()
!./../RECAP/build/recap -k 4 -r 1000 -p "./../RECAP_clustering_AML/results_clusterings/K_4/k4" -R "Root" "./../RECAP_clustering_AML/datasets/RECAP_input_dataset.txt"
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_hours = int(elapsed_time // (60 * 60))
elapsed_minutes = int((elapsed_time % (60 * 60)) // 60)
elapsed_seconds = int((elapsed_time % (60 * 60)) % 60)
string_time = str(elapsed_hours) + "h " + str(elapsed_minutes) + "min " + str(elapsed_seconds) + "sec"

print()
print("Time required for clustering with K = 4: " +  string_time)


Time required for clustering with K = 4: 0h 0min 0sec


## Extract clusters

### 1000 restarts, K = 2

In [47]:
# path to the file containing the output from RECAP that provides a cluster id per patient in the dataset
# (presumably written by the 1000-restarts run above, whose output prefix is K_2/k2 — confirm)
clustering_path = os.path.join(results_clusterings_k_2_folder, "k2.solution.txt")

# extract indices for patients
cluster_indices = extract_cluster_indices(clustering_path)

# retrieve the actual clusters of patients
# labels are matched to training_set by position, so this assumes RECAP reports patients in the
# order in which they were written to its input file — TODO confirm
clustering_K_2 = get_clusters(training_set, cluster_indices)

### 1000 restarts, K = 3

In [50]:
# path to the file containing the output from RECAP that provides a cluster id per patient in the dataset
# (presumably written by the 1000-restarts run above, whose output prefix is K_3/k3 — confirm)
clustering_path = os.path.join(results_clusterings_k_3_folder, "k3.solution.txt")

# extract indices for patients
cluster_indices = extract_cluster_indices(clustering_path)

# retrieve the actual clusters of patients
# labels are matched to training_set by position, so this assumes RECAP reports patients in the
# order in which they were written to its input file — TODO confirm
clustering_K_3 = get_clusters(training_set, cluster_indices)

### 1000 restarts, K = 4

In [51]:
# path to the file containing the output from RECAP that provides a cluster id per patient in the dataset
# (presumably written by the 1000-restarts run above, whose output prefix is K_4/k4 — confirm)
clustering_path = os.path.join(results_clusterings_k_4_folder, "k4.solution.txt")

# extract indices for patients
cluster_indices = extract_cluster_indices(clustering_path)

# retrieve the actual clusters of patients
# labels are matched to training_set by position, so this assumes RECAP reports patients in the
# order in which they were written to its input file — TODO confirm
clustering_K_4 = get_clusters(training_set, cluster_indices)

# TRAIN ON CLUSTERS

## K = 2

In [60]:
# train one CloMu model per cluster of the K = 2 clustering and collect the probabilities
# assigned to the trees of each cluster and to the trees of the test set
start_time = time.time()
clusters_K_2_prob, test_K_2_prob = train_on_clustering(clustering_K_2, test_set, results_trainings_k_2_folder, n_iter=1000)
end_time = time.time()

# break the elapsed wall-clock time into whole hours, minutes and seconds
elapsed_time = end_time - start_time
whole_hours, leftover_time = divmod(elapsed_time, 60 * 60)
whole_minutes, whole_seconds = divmod(leftover_time, 60)
elapsed_hours = int(whole_hours)
elapsed_minutes = int(whole_minutes)
elapsed_seconds = int(whole_seconds)
string_time = "{}h {}min {}sec".format(elapsed_hours, elapsed_minutes, elapsed_seconds)

print()
print("Time required for training on clustering with K = 2: " + string_time)

Folder path already exists.
Folder path created successfully.
Folder path created successfully.
Folder path created successfully.
iteration 0 of 1000

Mean Probability:  1.7530657688781802e-05
Training Score:  -12.828952962495691 Testing Score: -12.164518917810467
Loss:  11.61777594737829
iteration 1 of 1000

Mean Probability:  3.491859671158163e-05
Training Score:  -12.547995148293877 Testing Score: -11.791066771110422
Loss:  7.648923281757793
iteration 2 of 1000

Mean Probability:  5.208783355085209e-05
Training Score:  -12.364058587513252 Testing Score: -11.557916059063695
Loss:  5.211794723540308
iteration 3 of 1000

Mean Probability:  6.829000011959357e-05
Training Score:  -12.228427873062879 Testing Score: -11.381426239758584
Loss:  4.52388772889513
iteration 4 of 1000

Mean Probability:  8.344654925909917e-05
Training Score:  -12.129800277504796 Testing Score: -11.255430216841807
Loss:  3.4829871448003287
iteration 5 of 1000

Mean Probability:  9.771069749944759e-05
Training Sco

## K = 3

In [61]:
# train one CloMu model per cluster of the K = 3 clustering and collect the probabilities
# assigned to the trees of each cluster and to the trees of the test set
start_time = time.time()
clusters_K_3_prob, test_K_3_prob = train_on_clustering(clustering_K_3, test_set, results_trainings_k_3_folder, n_iter=1000)
end_time = time.time()

# break the elapsed wall-clock time into whole hours, minutes and seconds
elapsed_time = end_time - start_time
elapsed_hours = int(elapsed_time // (60 * 60))
elapsed_minutes = int((elapsed_time % (60 * 60)) // 60)
elapsed_seconds = int((elapsed_time % (60 * 60)) % 60)
string_time = str(elapsed_hours) + "h " + str(elapsed_minutes) + "min " + str(elapsed_seconds) + "sec"

# the message names the value of K trained in this cell (it used to say K = 2)
print()
print("Time required for training on clustering with K = 3: " + string_time)

Folder path already exists.
Folder path created successfully.
Folder path created successfully.
Folder path created successfully.
iteration 0 of 1000

Mean Probability:  2.32812437503873e-05
Training Score:  -12.368721523398838 Testing Score: -12.085750221982865
Loss:  10.09184615222265
iteration 1 of 1000

Mean Probability:  4.640076843484822e-05
Training Score:  -11.999715866048817 Testing Score: -11.743957863185381
Loss:  6.618241554919215
iteration 2 of 1000

Mean Probability:  6.825136040098363e-05
Training Score:  -11.759195722742184 Testing Score: -11.51606913419218
Loss:  4.916902152333651
iteration 3 of 1000

Mean Probability:  8.939618027235794e-05
Training Score:  -11.584448425203716 Testing Score: -11.345962319769942
Loss:  3.546508655469763
iteration 4 of 1000

Mean Probability:  0.00010802572902744499
Training Score:  -11.461829741632492 Testing Score: -11.23069954812091
Loss:  3.118568850510953
iteration 5 of 1000

Mean Probability:  0.0001256352237251543
Training Score:

## K = 4

In [62]:
# train one CloMu model per cluster of the K = 4 clustering and collect the probabilities
# assigned to the trees of each cluster and to the trees of the test set
start_time = time.time()
clusters_K_4_prob, test_K_4_prob = train_on_clustering(clustering_K_4, test_set, results_trainings_k_4_folder, n_iter=1000)
end_time = time.time()

# break the elapsed wall-clock time into whole hours, minutes and seconds
elapsed_time = end_time - start_time
elapsed_hours = int(elapsed_time // (60 * 60))
elapsed_minutes = int((elapsed_time % (60 * 60)) // 60)
elapsed_seconds = int((elapsed_time % (60 * 60)) % 60)
string_time = str(elapsed_hours) + "h " + str(elapsed_minutes) + "min " + str(elapsed_seconds) + "sec"

# the message names the value of K trained in this cell (it used to say K = 2)
print()
print("Time required for training on clustering with K = 4: " + string_time)

Folder path already exists.
Folder path created successfully.
Folder path created successfully.
Folder path created successfully.
iteration 0 of 1000

Mean Probability:  3.0211658349317e-05
Training Score:  -11.919454283572614 Testing Score: -12.095665135869327
Loss:  9.148786977452295
iteration 1 of 1000

Mean Probability:  5.920693991482183e-05
Training Score:  -11.55398886093725 Testing Score: -11.70033545887358
Loss:  5.152843293978203
iteration 2 of 1000

Mean Probability:  8.617274002529685e-05
Training Score:  -11.306815039756343 Testing Score: -11.49908600795954
Loss:  4.513672310443447
iteration 3 of 1000

Mean Probability:  0.00011273045062713321
Training Score:  -11.114488107504416 Testing Score: -11.346695822235564
Loss:  3.708421756940048
iteration 4 of 1000

Mean Probability:  0.00013704171339274708
Training Score:  -10.975152994728901 Testing Score: -11.235607102090965
Loss:  3.1718048425764778
iteration 5 of 1000

Mean Probability:  0.00015963887672281507
Training Score