## Frequent Itemset Mining (FIM) on Health Data: Implementation

Contains three main functions:
- apply_fim: applies the method with the defined parameters to a randomly sampled population of the given size; creates a report with the generated clusters in the output_files folder
- generalize_timeline: uses the result of apply_fim to generalize the timeline of one person; stores a generalized_timeline value that can be used in the health_event_timelines.ipynb notebook
- test_fim: tests the method using multiple parameter values and iterations; creates multiple report files with the generated clusters in the output_files folder

Usage:
- uncomment the function calls you want to run
- set the parameter values
- run the cells

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpmax
from datetime import timedelta
import math
import random
import statistics

# Received from the corresponding OMOP format database tables
%store -r df_condition_occurrence # condition_occurrence
%store -r df_drug_exposure # drug_exposure
%store -r df_concept # concept
%store -r df_procedure # procedure_occurrence

# Received from test_cases.ipynb
%store -r use_case_list

In [None]:
# FIM parameters:
#   num_person  - number of persons randomly sampled for mining
#   min_support - minimum support threshold passed to the frequent itemset miner
#   num_cluster - number of KMeans clusters (only used when use_alt_clustering_method is False)
#   win_length  - window length, in days, used to group a person's events into item sets
num_person = 1
min_support = 0.001
num_cluster = 5
win_length = 30

# Alternative method selection:
#   use_alt_sliding_window_method - False selects sliding_window_method_1, True selects sliding_window_method_2
#   use_alt_clustering_method     - True selects create_clusters_alternative, False selects create_clusters (KMeans)
use_alt_sliding_window_method = False
use_alt_clustering_method = True

# Output file name: the report is written to ../output_files/<output_file>.txt
output_file = 'output'

# fim_result = apply_fim(num_person, min_support, win_length, num_cluster, use_alt_sliding_window_method, use_alt_clustering_method, output_file)

In [None]:
# ID Of Person To Be Generalized
person_id = 5

generalized_timeline = generalize_timeline(5, fim_result, win_length)
%store generalized_timeline

In [None]:
# Test FIM parameters: test_fim writes one report per (window length, cluster count)
# combination, named "<iteration>_<test index>_<support>.txt"
win_lengths = [15, 30]
num_clusters = [5, 10]

# Number of iterations; test_fim builds a new dataset for each iteration
iterations_per_test = 1

# test_fim(num_clusters, win_lengths, iterations_per_test, num_person, min_support, use_alt_sliding_window_method, use_alt_clustering_method)

In [None]:
def create_datasets(person_nr, person_id):
    """Build per-person event mappings from the stored OMOP tables.

    Reads the module-level frames df_condition_occurrence, df_drug_exposure and
    df_procedure, keeps the rows of the selected persons and stacks them into
    one frame with the columns person_id, concept_id and start_datetime.

    Args:
        person_nr: Number of persons to sample at random (from the person ids
            found in df_condition_occurrence); only used when person_id is None.
        person_id: A single person to select, or None to sample person_nr persons.

    Returns:
        A 4-tuple of
        - a list with one dict per selected person, mapping a day offset to the
          concept id strings recorded on that day, with keys in ascending order,
        - the list of all concept ids of the selected persons (before filtering),
        - a dict mapping each concept id to its number of occurrences
          (before filtering),
        - the earliest start_datetime left after filtering.
    """
    person_ids = list(set(df_condition_occurrence["person_id"].values.tolist()))
    if person_id is None:
        selected_person_ids = random.sample(person_ids, person_nr)
    else:
        selected_person_ids = [person_id]

    shared_columns = ['person_id', 'concept_id', 'start_datetime']

    def select_and_rename(table, concept_column, date_column):
        # Restrict a source table to the selected persons and give it the shared column names.
        rows = table[table['person_id'].isin(selected_person_ids)]
        rows = rows.rename(columns={concept_column: 'concept_id', date_column: 'start_datetime'})
        return rows[shared_columns]

    medical_data_df = pd.concat([
        select_and_rename(df_drug_exposure, 'drug_concept_id', 'drug_exposure_start_date'),
        select_and_rename(df_procedure, 'procedure_concept_id', 'procedure_date'),
        select_and_rename(df_condition_occurrence, 'condition_concept_id', 'condition_start_date'),
    ], ignore_index=True)

    # Collected before any filtering, so they describe the complete selection.
    all_concept_ids = medical_data_df['concept_id'].tolist()
    all_concept_ids_counts = medical_data_df['concept_id'].value_counts()

    # For larger random samples, concepts with fewer than 10 occurrences are dropped.
    if person_id is None and len(selected_person_ids) > 100:
        frequent_enough = medical_data_df['concept_id'].map(all_concept_ids_counts) >= 10
        medical_data_df = medical_data_df[frequent_enough]
    # Rows with concept id 0 are always dropped.
    medical_data_df = medical_data_df[medical_data_df['concept_id'] != 0]

    event_mappings = []
    for selected_id in selected_person_ids:
        person_rows = medical_data_df[medical_data_df['person_id'] == selected_id]
        codes_by_day = format_dataset_to_dict(person_rows)
        event_mappings.append({day: codes_by_day[day] for day in sorted(codes_by_day)})

    earliest_start = medical_data_df['start_datetime'].min()
    return event_mappings, all_concept_ids, all_concept_ids_counts.to_dict(), earliest_start

In [None]:
def assign_integer_to_datetime(datetime, start_date, end_date, total):
    """Convert a date into an integer day offset.

    Args:
        datetime: The date to convert.
        start_date: Reference date that offsets are counted from.
        end_date: Dates on or after this one are mapped to ``total``.
        total: Value returned for dates on or after ``end_date``.

    Returns:
        ``total`` when ``datetime >= end_date``, otherwise the number of whole
        days between ``start_date`` and ``datetime``.
    """
    return total if datetime >= end_date else (datetime - start_date).days

In [None]:
def sliding_window_method_1(events_dict, max_days_difference):
    """Group events into consecutive, non-overlapping windows.

    Every day offset ``d`` is placed into the window ``int(d / max_days_difference)``
    and the events of all days in a window are merged.

    Args:
        events_dict: Mapping from day offset to the list of events on that day.
            It is not modified.
        max_days_difference: Length of one window in days.

    Returns:
        A list with one item set (list of distinct events, in no particular
        order) per non-empty window, ordered by window.
    """
    windows = {}
    for day in sorted(events_dict):
        window_index = int(day / max_days_difference)
        # setdefault starts every window with a fresh list; storing the caller's
        # list and extending it would silently grow the lists inside events_dict.
        windows.setdefault(window_index, []).extend(events_dict[day])
    return [list(set(events)) for events in windows.values()]

In [None]:
def sliding_window_method_2(events_dict, window_length):
    """Build one forward-looking window per recorded day.

    For every day offset ``d`` in ``events_dict`` the window contains the events
    of ``d`` and of all later days up to and including ``d + window_length``.

    Args:
        events_dict: Mapping from day offset to the list of events on that day.
            It is not modified.
        window_length: Window length in days.

    Returns:
        A list with one item set (list of distinct events, in no particular
        order) per day offset, ordered by ascending day offset.
    """
    # The early break below is only valid for ascending days, so sort explicitly
    # instead of relying on the insertion order of the caller's dict.
    days = sorted(events_dict)
    day_count = len(days)

    items_list = []
    for start, first_day in enumerate(days):
        window_items = set(events_dict[first_day])
        last_day = first_day + window_length
        for position in range(start + 1, day_count):
            if days[position] > last_day:
                break
            window_items.update(events_dict[days[position]])
        items_list.append(list(window_items))

    return items_list

In [None]:
def generate_itemsets_with_time(events_dict, window_length):
    """Group events into consecutive windows while keeping the window index.

    Args:
        events_dict: Mapping from day offset to the list of events on that day.
            It is not modified.
        window_length: Length of one window in days.

    Returns:
        A dict mapping the window index ``int(day / window_length)`` to the
        concatenated events of all days in that window (duplicates are kept).
    """
    windows = {}
    for day in sorted(events_dict):
        window_index = int(day / window_length)
        # Each window gets its own list; reusing the caller's list here would
        # make the later extend() calls modify events_dict itself.
        windows.setdefault(window_index, []).extend(events_dict[day])
    return windows

In [None]:
def format_dataset_to_dict(dataset):
    """Turn event rows into a mapping from day offset to concept id strings.

    Args:
        dataset: Frame with a 'start_datetime' column. Rows are read by
            position: index 1 is used as the concept id and index 2 as the date
            (the callers in this file pass the columns person_id, concept_id,
            start_datetime in that order).

    Returns:
        A dict mapping each day offset (relative to the earliest date in
        ``dataset``) to the list of distinct concept ids, as strings, seen on
        that day, in order of first appearance.
    """
    first_date = dataset['start_datetime'].min()
    last_date = dataset['start_datetime'].max()
    span = last_date - first_date

    codes_by_day = {}
    for row in dataset.values:
        day = assign_integer_to_datetime(row[2], first_date, last_date, span.days)
        code = str(row[1])
        day_codes = codes_by_day.setdefault(day, [])
        if code not in day_codes:
            day_codes.append(code)

    return codes_by_day

In [None]:
def generate_item_sets(event_dict_list, window_length, sliding_window_method):
    """Create the item sets of all persons with the chosen windowing method.

    Args:
        event_dict_list: One day-offset-to-events dict per person.
        window_length: Window length in days handed to the windowing method.
        sliding_window_method: Falsy selects sliding_window_method_1, truthy
            selects sliding_window_method_2.

    Returns:
        One flat list holding the item sets of every person.
    """
    result_list = []
    for event_dict in event_dict_list:
        if sliding_window_method:
            person_item_sets = sliding_window_method_2(event_dict, window_length)
        else:
            person_item_sets = sliding_window_method_1(event_dict, window_length)
        result_list += person_item_sets
    return result_list

In [None]:
def calculate_tf_idf_all_clusters(clusters):
    """Score every cluster against all clusters with calculate_tf_idf.

    Args:
        clusters: List of clusters, each a list of items.

    Returns:
        A list with one dict per cluster mapping each item to its tf-idf score,
        in the order returned by calculate_tf_idf.
    """
    cluster_count = len(clusters)
    return [dict(calculate_tf_idf(cluster, clusters, cluster_count)) for cluster in clusters]

In [None]:
def merge_dicts_with_highest_score(list_of_dicts):
    """Merge dicts that share the same first key into one member list.

    The first key of each dict is treated as its top-scored item (the callers
    in this file pass dicts built from score-sorted pairs).

    Args:
        list_of_dicts: Dicts whose keys are items; empty dicts are skipped
            because they have no first key.

    Returns:
        A dict mapping each distinct first key to the list of all keys of the
        dicts that start with it. The first key itself comes first; the other
        members follow in order of first appearance, without duplicates.
    """
    merged = {}
    for scores in list_of_dicts:
        if not scores:
            continue
        top_key = next(iter(scores))
        # A dict used as an insertion-ordered set: it removes duplicates while
        # keeping a reproducible member order (a plain set would not).
        merged.setdefault(top_key, {}).update(dict.fromkeys(scores))
    return {top_key: list(members) for top_key, members in merged.items()}

In [None]:
def calculate_tf_idf(cluster_items, concat_clusters, n_clusters):
    """Compute a tf-idf score for every distinct item of one cluster.

    tf is the share of ``cluster_items`` taken up by the item; idf is
    ``log10(n_clusters / number of clusters in concat_clusters containing the item)``.

    Args:
        cluster_items: Items of the cluster to score (duplicates count for tf).
        concat_clusters: All clusters, each an iterable of hashable items.
        n_clusters: Cluster count used as the idf numerator.

    Returns:
        A list of ``(item, score)`` pairs with scores rounded to 3 decimals,
        sorted by descending score. Items with equal scores keep the order of
        their first appearance in ``cluster_items``.

    Raises:
        ZeroDivisionError: If an item occurs in none of ``concat_clusters``.
    """
    # Counting in a dict gives the distinct items in first-appearance order, so
    # ties are broken reproducibly instead of by set iteration order.
    occurrences = {}
    for item in cluster_items:
        occurrences[item] = occurrences.get(item, 0) + 1

    # Built once so that each membership test below is a hash lookup.
    cluster_sets = [set(cluster) for cluster in concat_clusters]
    total_items = len(cluster_items)

    scores = []
    for feature, occurrence_count in occurrences.items():
        tf = occurrence_count / total_items
        containing_clusters = sum(1 for members in cluster_sets if feature in members)
        idf = math.log(n_clusters / containing_clusters, 10)
        scores.append((feature, round(tf * idf, 3)))

    # sorted() is stable, also with reverse=True.
    return sorted(scores, key=lambda pair: pair[1], reverse=True)

In [None]:
def get_medical_label_by_concept_id(concept_id):
    """Look up the name of a concept in the module-level df_concept frame.

    Args:
        concept_id: Concept id; converted with ``int()`` before the lookup.

    Returns:
        The 'concept_name' of the first matching row, or "" if no row matches.
    """
    matches = df_concept.loc[df_concept['concept_id'] == int(concept_id), 'concept_name'].values
    if len(matches) == 0:
        return ""
    return matches[0]

In [None]:
def calculate_clusters_per_use_case(cluster_concept_ids, use_cases):
    """Measure over how many clusters each use case is spread.

    A use case counts as present in a cluster when the two share at least one
    concept id.

    Args:
        cluster_concept_ids: List of clusters, each an iterable of concept ids.
        use_cases: List of use cases, each an iterable of concept ids.

    Returns:
        A tuple ``(mean, standard deviation)`` of the number of clusters per
        use case, both rounded to 5 decimals. The standard deviation is 0.0
        for a single use case (the sample standard deviation needs two values),
        and ``(0, 0)`` is returned when there are no use cases.
    """
    if not use_cases:
        return 0, 0

    cluster_sets = [set(cluster) for cluster in cluster_concept_ids]
    counts = []
    for use_case in use_cases:
        wanted = set(use_case)
        counts.append(sum(1 for members in cluster_sets if wanted & members))

    mean = round(sum(counts) / len(counts), 5)
    deviation = round(statistics.stdev(counts), 5) if len(counts) > 1 else 0.0
    return mean, deviation

In [None]:
def calculate_use_cases_per_cluster(cluster_concept_ids, use_cases):
    """Measure how many use cases each cluster touches.

    A cluster touches a use case when the two share at least one concept id.
    Clusters that touch no use case are left out of the statistics.

    Args:
        cluster_concept_ids: List of clusters, each an iterable of concept ids.
        use_cases: List of use cases, each an iterable of concept ids.

    Returns:
        A tuple ``(mean, standard deviation)`` of the number of use cases per
        touching cluster, both rounded to 5 decimals. The standard deviation is
        0.0 when only one cluster touches a use case (the sample standard
        deviation needs two values), and ``(0, 0)`` is returned when none does.
    """
    use_case_sets = [set(use_case) for use_case in use_cases]
    counts = []
    for cluster in cluster_concept_ids:
        members = set(cluster)
        counts.append(sum(1 for wanted in use_case_sets if wanted & members))

    filtered_counts = [count for count in counts if count != 0]
    if not filtered_counts:
        return 0, 0

    mean = round(sum(filtered_counts) / len(filtered_counts), 5)
    deviation = round(statistics.stdev(filtered_counts), 5) if len(filtered_counts) > 1 else 0.0
    return mean, deviation

In [None]:
def calculate_score_of_use_cases_in_item_sets(cluster_concept_ids):
    """Return the share of use case concept ids that occur in any cluster.

    The use cases are read from the module-level ``use_case_list``.

    Args:
        cluster_concept_ids: List of clusters, each an iterable of concept ids.

    Returns:
        The fraction of distinct use case concept ids found in the clusters,
        rounded to 5 decimals, or 0.0 when ``use_case_list`` holds no ids.
    """
    # Set comprehensions flatten in linear time; sum(lists, []) copies the
    # growing result for every cluster.
    clustered_ids = {concept_id for cluster in cluster_concept_ids for concept_id in cluster}
    use_case_ids = {concept_id for use_case in use_case_list for concept_id in use_case}
    if not use_case_ids:
        return 0.0
    # NOTE(review): ids are matched by equality, so both sides must use the same
    # type (the clusters built in this file hold strings) - confirm for use_case_list.
    return round(len(use_case_ids & clustered_ids) / len(use_case_ids), 5)

In [None]:
def calculate_frequency_score_of_missing_concept_ids(cluster_concept_ids, all_concept_ids, all_concept_ids_counts):
    """Return the share of all concept occurrences that the clusters cover.

    Despite the name, the score sums the occurrence counts of the concept ids
    that ARE part of a cluster and divides by the total number of occurrences.

    Args:
        cluster_concept_ids: List of clusters, each an iterable of concept ids
            (converted with ``int()`` for the count lookup).
        all_concept_ids: List of every concept id occurrence in the dataset.
        all_concept_ids_counts: Dict mapping an integer concept id to its
            number of occurrences.

    Returns:
        The covered fraction rounded to 5 decimals, or 0.0 when
        ``all_concept_ids`` is empty.

    Raises:
        KeyError: If a clustered concept id is missing from
            ``all_concept_ids_counts``.
    """
    concept_id_count = len(all_concept_ids)
    if concept_id_count == 0:
        return 0.0
    # A set comprehension flattens in linear time, unlike sum(lists, []).
    clustered_ids = {concept_id for cluster in cluster_concept_ids for concept_id in cluster}
    covered_occurrences = sum(all_concept_ids_counts[int(concept_id)] for concept_id in clustered_ids)
    return round(covered_occurrences / concept_id_count, 5)

In [None]:
def create_clusters_alternative(frequent_item_sets_list):
    """Cluster frequent item sets by their top tf-idf item instead of KMeans.

    Args:
        frequent_item_sets_list: List of frequent item sets (lists of items).

    Returns:
        The member lists produced by merge_dicts_with_highest_score from the
        tf-idf scores of all item sets.
    """
    scored_item_sets = calculate_tf_idf_all_clusters(frequent_item_sets_list)
    clusters_by_top_item = merge_dicts_with_highest_score(scored_item_sets)
    return [*clusters_by_top_item.values()]

In [None]:
def apply_fim_to_dataset(support_threshold, n_cluster, window_length, event_mappings, all_concept_ids, all_concept_ids_counts, output_file, sliding_window_method, use_alt_clustering_method):
    """Run the FIM pipeline on a prepared dataset and write the report.

    Steps: build item sets, mine frequent item sets, cluster them, then score
    the clusters and write the report via validate_and_score_clusters.

    Args:
        support_threshold: Minimum support for the frequent item set mining.
        n_cluster: Number of KMeans clusters; ignored by the alternative
            clustering method.
        window_length: Window length in days for the item set creation.
        event_mappings: One day-offset-to-events dict per person.
        all_concept_ids: All concept id occurrences of the dataset.
        all_concept_ids_counts: Occurrence count per concept id.
        output_file: Report name; also used as prefix of the progress messages.
        sliding_window_method: Truthy selects the alternative windowing method.
        use_alt_clustering_method: Truthy selects create_clusters_alternative,
            otherwise create_clusters is used.

    Returns:
        The list of per-cluster tf-idf dicts returned by
        validate_and_score_clusters.
    """
    item_sets = generate_item_sets(event_mappings, window_length, sliding_window_method)
    print(f"{output_file} - item sets created")
    frequent_item_sets_list = generate_frequent_item_sets(item_sets, support_threshold)
    print(f"{output_file} - frequent item sets created")

    if use_alt_clustering_method:
        clusters = create_clusters_alternative(frequent_item_sets_list)
    else:
        clusters = create_clusters(n_cluster, frequent_item_sets_list)
    # Reported only once the clusters really exist, for both methods.
    print(f"{output_file} - clusters created")

    return validate_and_score_clusters(clusters, all_concept_ids_counts, all_concept_ids, output_file, window_length, support_threshold, bool(use_alt_clustering_method))

In [None]:
def generate_frequent_item_sets(item_sets, support_threshold):
    """Mine frequent item sets with fpmax and keep those with several items.

    Args:
        item_sets: List of transactions, each a list of items.
        support_threshold: Value passed to fpmax as ``min_support``.

    Returns:
        The item sets found by fpmax as lists, without the single-item sets.
    """
    encoder = TransactionEncoder()
    encoded = encoder.fit(item_sets).transform(item_sets)
    transactions = pd.DataFrame(encoded, columns=encoder.columns_)

    mined = fpmax(transactions, min_support=support_threshold, use_colnames=True)
    mined_as_lists = (list(itemset) for itemset in mined['itemsets'])

    # Single-item sets are discarded.
    return [items for items in mined_as_lists if len(items) > 1]

In [None]:
def create_clusters(n_cluster, frequent_item_sets):
    """Cluster frequent item sets with KMeans on a binary bag-of-items matrix.

    Args:
        n_cluster: Number of KMeans clusters.
        frequent_item_sets: List of item sets, each a list of item strings.

    Returns:
        One list per cluster, ordered by cluster label, holding the
        concatenated items of all item sets assigned to that cluster
        (duplicates are kept).
    """
    # The default token pattern only accepts tokens of two or more characters,
    # which would silently drop one-character items; accept every token length.
    vectorizer = CountVectorizer(binary=True, token_pattern=r"(?u)\b\w+\b")
    documents = [' '.join(item_set) for item_set in frequent_item_sets]
    bow = vectorizer.fit_transform(documents).toarray()

    # Fixed random_state keeps the clustering reproducible.
    kmeans = KMeans(n_clusters=n_cluster, random_state=42, n_init='auto')
    labels = kmeans.fit_predict(bow)

    items_by_label = {}
    for label, item_set in zip(labels, frequent_item_sets):
        items_by_label.setdefault(label, []).extend(item_set)

    return [items_by_label[label] for label in sorted(items_by_label)]

In [None]:
def validate_and_score_clusters(clusters, all_concept_ids_counts, all_concept_ids, output_file, window_length, support,
                                use_alt_clustering_method):
    """Score the clusters and write the report to ../output_files/<output_file>.txt.

    The report starts with the run parameters and six scores and then lists
    every cluster. With the alternative clustering method each line shows a
    concept id and its label; otherwise each line also shows the tf-idf score
    and the lines are ordered by that score.

    Args:
        clusters: List of clusters, each a list of concept ids.
        all_concept_ids_counts: Occurrence count per concept id.
        all_concept_ids: All concept id occurrences of the dataset.
        output_file: Report file name without extension.
        window_length: Window length, only written to the report.
        support: Minimum support, only written to the report.
        use_alt_clustering_method: Selects the cluster listing format.

    Returns:
        A list with one dict per cluster mapping concept id to tf-idf score.
    """
    separator = "---------------------------"
    report_name = output_file + ".txt"

    with open("../output_files/" + report_name, "w") as f:

        def emit(text):
            # Every report line goes through print so line endings stay unchanged.
            print(text, file=f)

        emit(report_name)
        emit(separator)
        emit(f"Number of clusters: {len(clusters)}")
        emit(f"Window length: {window_length}")
        emit(f"Minimum support: {support}")
        emit(f"Number of use cases: {len(use_case_list)}")
        emit(separator)

        clusters_per_use_case = calculate_clusters_per_use_case(clusters, use_case_list)
        use_cases_per_cluster = calculate_use_cases_per_cluster(clusters, use_case_list)
        emit(f"1) Average number of clusters per use case: {clusters_per_use_case[0]}")
        emit(f"2) SD of clusters per use case: {clusters_per_use_case[1]}")
        emit(f"3) Average number of use cases per cluster: {use_cases_per_cluster[0]}")
        emit(f"4) SD of use cases per cluster: {use_cases_per_cluster[1]}")
        emit(f"5) Use case concept ids in frequent item sets: {calculate_score_of_use_cases_in_item_sets(clusters)}")
        emit(f"6) Concept ids included in clusters with frequency: {calculate_frequency_score_of_missing_concept_ids(clusters, all_concept_ids, all_concept_ids_counts)}")
        emit(separator)
        emit(" ")

        list_of_cluster_dicts = []
        cluster_count = len(clusters)
        for index, cluster in enumerate(clusters):
            scored_concepts = calculate_tf_idf(cluster, clusters, cluster_count)
            list_of_cluster_dicts.append(dict(scored_concepts))
            emit(f"Cluster {index}:")
            if use_alt_clustering_method:
                for concept in cluster:
                    emit(f"{concept}; {get_medical_label_by_concept_id(concept)}")
            else:
                for concept, score in scored_concepts:
                    emit(f"{concept}; {score} {get_medical_label_by_concept_id(concept)}")
            emit(" ")
        return list_of_cluster_dicts

In [None]:
def assign_cluster_to_event_method_1(event, clusters):
    """Pick the cluster in which an event has its highest positive score.

    Args:
        event: The event (concept id) to assign.
        clusters: List of dicts mapping events to their tf-idf score.

    Returns:
        The index of the cluster holding the highest score above 0 for
        ``event`` (the first one on ties), or -1 if no cluster scores the
        event above 0.
    """
    best_index, best_score = -1, 0
    for position, scores in enumerate(clusters):
        if event not in scores:
            continue
        if scores[event] > best_score:
            best_index, best_score = position, scores[event]
    return best_index

In [None]:
def convert_months_to_date(start_date, months, window_length):
    """Convert a window index into the date on which that window starts.

    Args:
        start_date: Start of the timeline; must provide ``to_pydatetime()``.
        months: Window index (number of windows since ``start_date``).
        window_length: Length of one window in days.

    Returns:
        The date ``start_date + months * window_length`` days as "YYYY-MM-DD".
    """
    offset = timedelta(days=months * window_length)
    window_start = start_date.to_pydatetime() + offset
    return window_start.strftime("%Y-%m-%d")

In [None]:
def assign_clusters_to_events(events, clusters):
    """Return the distinct cluster indexes assigned to a group of events.

    Args:
        events: Iterable of events (concept ids).
        clusters: List of dicts mapping events to their tf-idf score.

    Returns:
        A list of the distinct indexes returned by
        assign_cluster_to_event_method_1 for the events; -1 is included when an
        event could not be assigned.
    """
    distinct_indexes = set()
    for event in events:
        distinct_indexes.add(assign_cluster_to_event_method_1(event, clusters))
    return list(distinct_indexes)

In [None]:
def assign_itemsets_to_clusters(itemsets, clusters, start_date, window_length):
    """Translate windowed item sets into parallel lists of dates and clusters.

    Args:
        itemsets: Dict mapping a window index to the events in that window.
        clusters: List of dicts mapping events to their tf-idf score.
        start_date: Start of the timeline, used to date the windows.
        window_length: Length of one window in days.

    Returns:
        A two-element list ``[dates, cluster_indexes]`` of equal length: every
        cluster index assigned to a window is paired with that window's date
        string.
    """
    window_dates = []
    cluster_indexes = []
    for window_index, events in itemsets.items():
        assigned = assign_clusters_to_events(events, clusters)
        # One date entry per assigned cluster keeps both lists aligned.
        window_dates.extend(
            convert_months_to_date(start_date, window_index, window_length) for _ in assigned
        )
        cluster_indexes.extend(assigned)
    return [window_dates, cluster_indexes]

In [None]:
def generalize_timeline(person_id, clusters, window_length):
    """Generalize one person's timeline to cluster labels.

    Args:
        person_id: Person whose events are loaded with create_datasets.
        clusters: List of per-cluster tf-idf dicts (the result of apply_fim).
        window_length: Window length in days.

    Returns:
        A two-element list ``[dates, labels]``: the window dates and, for each
        of them, the label of the assigned cluster cut to 30 characters, or
        "Nan" where no cluster was assigned.
    """
    event_mappings, _, _, first_event_date = create_datasets(None, person_id)
    itemsets = generate_itemsets_with_time(event_mappings[0], window_length)
    generalized_data = assign_itemsets_to_clusters(itemsets, clusters, first_event_date, window_length)
    cluster_labels = create_labels_for_clusters(clusters)

    # Swap the cluster indexes for readable labels.
    generalized_data[1] = [
        "Nan" if cluster_id == -1 else cluster_labels[cluster_id][:30]
        for cluster_id in generalized_data[1]
    ]
    return generalized_data

In [None]:
def create_labels_for_clusters(clusters):
    """Label every cluster with the concept name of its first key.

    Args:
        clusters: List of non-empty dicts keyed by concept id.

    Returns:
        A list with the concept name looked up for the first key of each
        cluster.
    """
    return [get_medical_label_by_concept_id(next(iter(cluster))) for cluster in clusters]

In [None]:
def test_fim(num_of_clusters, window_lengths, iterations, n_person, support, use_alt_sliding_window_method, use_alt_clustering_method):
    """Run FIM for several parameter combinations and write one report each.

    For every iteration a new dataset is created; then every window length is
    combined with every cluster count. Reports are named
    "<iteration>_<test index>_<support>", with the test index counting the
    combinations of one iteration from 1.

    Args:
        num_of_clusters: Cluster counts to test (ignored by the alternative
            clustering method, which still writes one report per entry).
        window_lengths: Window lengths in days to test.
        iterations: Number of iterations, each on a newly sampled dataset.
        n_person: Number of persons sampled per iteration.
        support: Minimum support for the frequent item set mining.
        use_alt_sliding_window_method: Truthy selects the alternative windowing
            method.
        use_alt_clustering_method: Truthy selects create_clusters_alternative,
            otherwise create_clusters is used.
    """
    for i in range(iterations):
        event_mappings, all_concept_ids, all_concept_ids_counts, _ = create_datasets(person_nr=n_person, person_id=None)
        print(f"{i} - dataset created")
        test_index = 1
        for j, window_length in enumerate(window_lengths):
            item_sets = generate_item_sets(event_mappings, window_length, use_alt_sliding_window_method)
            item_set_idx = f"{i}_{j}"
            print(f"{item_set_idx} - item sets created")
            frequent_item_sets_list = generate_frequent_item_sets(item_sets, support)
            print(f"{item_set_idx} - frequent item sets created")
            for n_cluster in num_of_clusters:
                output_file = f"{i}_{test_index}_{support}"
                if use_alt_clustering_method:
                    clusters_result = create_clusters_alternative(frequent_item_sets_list)
                else:
                    clusters_result = create_clusters(n_cluster, frequent_item_sets_list)
                print(f"{output_file} - clusters created")
                # Pass the real method flag (as apply_fim_to_dataset does) so that
                # KMeans reports keep their tf-idf scores instead of always using
                # the alternative listing format.
                validate_and_score_clusters(clusters_result, all_concept_ids_counts, all_concept_ids, output_file, window_length, support, bool(use_alt_clustering_method))
                test_index += 1

In [None]:
def apply_fim(num_person, min_support, win_length, num_cluster, use_alt_sliding_window_method, use_alt_clustering_method, output_file):
    """Sample a dataset and run the FIM pipeline on it.

    Args:
        num_person: Number of persons to sample at random.
        min_support: Minimum support for the frequent item set mining.
        win_length: Window length in days.
        num_cluster: Number of KMeans clusters.
        use_alt_sliding_window_method: Truthy selects the alternative windowing
            method.
        use_alt_clustering_method: Truthy selects the alternative clustering
            method.
        output_file: Report file name without extension.

    Returns:
        The result of apply_fim_to_dataset for the sampled dataset.
    """
    event_mappings, all_concept_ids, all_concept_ids_counts, _first_event_date = create_datasets(
        person_nr=num_person, person_id=None
    )
    return apply_fim_to_dataset(
        support_threshold=min_support,
        n_cluster=num_cluster,
        window_length=win_length,
        event_mappings=event_mappings,
        all_concept_ids=all_concept_ids,
        all_concept_ids_counts=all_concept_ids_counts,
        output_file=output_file,
        sliding_window_method=use_alt_sliding_window_method,
        use_alt_clustering_method=use_alt_clustering_method,
    )