In [2]:
import networkx as nx
import astropy.stats
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
# import itertools
# from palettable import wesanderson
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
# import warnings
import scipy
# from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity


## Data preparation

In [3]:
# less_important_features = ['race_other', 'mood_or_anx', 'prev_study_oth_calc', 'race_asian', 'mood_anx', 
#                                  'multiple_birth', 'race_native_amer', 'gen_test_oth_calc', 
#                                  'gen_dx_oth_calc_self_report', 'growth_oth_calc', 'race_native_hawaiian']

In [4]:
# age_cols = ['age_at_eval_months', 'age_at_eval_years', 'age_at_registration_months']
# Race / ethnicity indicator columns; these are dropped from the feature table below.
race_cols = [
    'race_asian',
    'race_african_amer',
    'race_native_amer',
    'race_native_hawaiian',
    'race_white',
    'race_other',
    'hispanic',
]

In [5]:
# Identifier columns are removed right after loading; 'asd' (the target) is kept.
id_cols = ['subject_sp_id', 'respondent_sp_id', 'family_id', 'biomother_id', 'biofather_id']
spark_df = pd.read_csv('./spark_cleaned_data_more_features.csv').drop(columns=id_cols)

# Working feature table: every remaining column except the race/ethnicity indicators.
spark_df_selected_feat = spark_df.drop(columns=race_cols)
selected_features = spark_df_selected_feat.columns

In [6]:
# spark_df_selected_feat.columns


In [7]:
# Load the saved feature ranking ("RF" in the file name presumably means random forest - confirm).
# An older ranking lives at './more_features(mf)/sorted_feature_importances_RF_mf.pkl'.
importances_df = pd.read_pickle('./more_features(mf)/sorted_feature_importances_RF_mf_newest.pkl')

features_sorted_by_importance = importances_df['feature'].values
feat_importances = importances_df['importance'].values

# Put the target first, then the features in the order stored in the ranking, so that
# column position i (after 'asd') lines up with feat_importances[i].
ordered_columns = ['asd', *features_sorted_by_importance]
spark_df_selected_feat = spark_df_selected_feat[ordered_columns]

In [8]:

def get_sample_datasets(df, sample_size, n_samples=5):
    """
    Draw `n_samples` stratified random samples of `sample_size` rows from `df`.

    Each sample keeps (up to integer truncation) the same share of ASD rows
    (`df['asd'] == 1`) as the full frame; the remainder is filled with
    non-ASD rows (`df['asd'] == 0`).

    df : pd.DataFrame with a binary 'asd' column
    sample_size : total number of rows per sample
    n_samples : number of samples to draw; sample i uses random_state=i

    returns : list of `n_samples` DataFrames. Each one has a fresh 0..sample_size-1
    index, and the original row labels are preserved in an 'index' column.
    ASD rows come first, followed by the non-ASD rows.
    """
    # Stratum sizes depend only on df and sample_size, so compute them once.
    asd_proportion = df['asd'].sum() / len(df)
    asd_sample_size = int(asd_proportion * sample_size)
    non_asd_sample_size = sample_size - asd_sample_size

    # Split the frame once instead of re-filtering it for every sample.
    asd_rows = df[df['asd'] == 1]
    non_asd_rows = df[df['asd'] == 0]

    sample_dfs = []
    for seed in range(n_samples):
        stratified = pd.concat([
            asd_rows.sample(n=asd_sample_size, random_state=seed),
            non_asd_rows.sample(n=non_asd_sample_size, random_state=seed),
        ])
        # drop=False keeps the original row labels as an 'index' column
        sample_dfs.append(stratified.reset_index(drop=False))

    return sample_dfs



In [9]:
# Column order after the reordering step: 'asd' first, then the ranked features.
spark_df_selected_feat.columns

Index(['asd', 'dev_lang', 'dev_lang_dis', 'sex', 'attn_behav', 'dev_speech',
       'dev_soc_prag', 'birth_oth_calc', 'dev_motor', 'behav_adhd', 'dev_ld',
       'neuro_oth_calc', 'psych_oth_calc', 'behav_odd', 'mood_dep',
       'mood_soc_anx', 'mood_anx', 'mood_ocd', 'mood_sep_anx', 'mood_dmd',
       'mood_bipol', 'multiple_birth', 'mood_or_anx',
       'gen_dx_oth_calc_self_report', 'mood_hoard', 'behav_intermitt_explos',
       'behav_conduct', 'dev_mutism'],
      dtype='object')

In [10]:

# selected_features was captured before the columns were reordered by importance,
# so it still lists 'asd' among the columns and is in the original file order.
print(selected_features)
print(selected_features.shape)

Index(['mood_hoard', 'dev_motor', 'mood_sep_anx', 'mood_soc_anx',
       'birth_oth_calc', 'behav_intermitt_explos', 'behav_adhd', 'dev_lang',
       'attn_behav', 'dev_soc_prag', 'dev_ld', 'mood_dmd', 'psych_oth_calc',
       'dev_speech', 'mood_anx', 'mood_or_anx', 'asd', 'neuro_oth_calc', 'sex',
       'dev_mutism', 'mood_ocd', 'mood_bipol', 'behav_odd', 'dev_lang_dis',
       'gen_dx_oth_calc_self_report', 'mood_dep', 'behav_conduct',
       'multiple_birth'],
      dtype='object')
(28,)


In [11]:
# Display the importance-ordered frame (the rendered output is truncated by pandas).
spark_df_selected_feat
# spark_df_asd

Unnamed: 0,asd,dev_lang,dev_lang_dis,sex,attn_behav,dev_speech,dev_soc_prag,birth_oth_calc,dev_motor,behav_adhd,...,mood_sep_anx,mood_dmd,mood_bipol,multiple_birth,mood_or_anx,gen_dx_oth_calc_self_report,mood_hoard,behav_intermitt_explos,behav_conduct,dev_mutism
0,1,1.0,1.0,1,1.0,0.0,0.0,0,0.0,1.0,...,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,1.0,...,0.0,0.0,1.0,0,1.0,0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,1,1.0,0.0,0.0,0,0.0,1.0,...,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131806,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
131807,1,1.0,1.0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
131808,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
131809,1,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,1.0,0,1.0,0,0.0,0.0,0.0,0.0


## Calculating similarity matrix

In [12]:

# # Calculate cosine similarity
# cosine_sim_matrix = cosine_similarity(spark_df_selected_feat)

# # Convert the cosine similarity matrix to a DataFrame
# cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=spark_df_selected_feat.index, columns=spark_df_selected_feat.index)

In [13]:
# spark_df_selected_feat = spark_df_selected_feat.astype(int)

In [14]:
# import sys

# # Assuming spark_df_selected_feat is your object
# memory_usage = sys.getsizeof(spark_df_selected_feat)

# print(f"Memory usage: {memory_usage} bytes")

In [15]:
def pairwise_hamming_similarity(df):
    """
    Compute the pairwise (unweighted) Hamming similarity between the rows of `df`.

    The similarity of two rows is 1 minus their Hamming distance, i.e. the
    fraction of columns on which the two rows agree.

    df : DataFrame of features, one row per patient

    returns : square DataFrame indexed (rows and columns) by `df.index`, with
    1.0 on the diagonal (every row is identical to itself).
    """
    # Condensed vector of pairwise Hamming distances (fraction of differing columns)
    pairwise_dist = pdist(df, metric='hamming')

    # Expand to a square matrix *before* converting to similarity: squareform
    # always writes 0 on the diagonal, which is correct for a distance and
    # becomes the correct self-similarity of 1 after the subtraction.
    similarity_matrix = 1 - squareform(pairwise_dist)

    return pd.DataFrame(similarity_matrix, index=df.index, columns=df.index)

# hamming_sim_df_50k = pairwise_hamming_similarity(spark_df_selected_feat.iloc[:50000])
# 1.5 mins

In [16]:

def pairwise_weighted_hamming_similarity(df, weights):
    '''
    Compute pairwise weighted Hamming similarity between rows of a DataFrame.

    The similarity of two rows is the sum of the weights of the columns on
    which the two rows hold equal values.

    df: DataFrame containing binary features
    weights: 1D array-like of normalised weights for each feature, given in
             the same order as the columns of df (weights must sum to 1)

    returns: square DataFrame indexed (rows and columns) by df.index; the
             diagonal is forced to 1.

    Note: the matrix is accumulated one feature at a time with vectorised
    numpy operations rather than one Python call per row pair, so values can
    differ from a per-pair np.sum at floating-point rounding level.
    '''
    # Accept lists / Series as well as ndarrays
    weights = np.asarray(weights, dtype=float)

    # Ensure weights array matches the number of columns in df
    assert len(weights) == df.shape[1], "Weights length must match number of features"

    # Convert DataFrame to numpy array for efficiency
    data_array = df.to_numpy()
    n_rows = data_array.shape[0]

    similarity_matrix = np.zeros((n_rows, n_rows), dtype=float)
    for col_idx, weight in enumerate(weights):
        column = data_array[:, col_idx]
        # agree[i, j] is True where rows i and j hold the same value in this column
        agree = column[:, None] == column[None, :]
        # Add this feature's weight only at the agreeing pairs, in place
        np.add(similarity_matrix, weight, out=similarity_matrix, where=agree)

    # Self-similarity is 1 for normalised weights; set it explicitly
    np.fill_diagonal(similarity_matrix, 1)

    return pd.DataFrame(similarity_matrix, index=df.index, columns=df.index)

In [17]:
# hamming_sim_df_80k = pairwise_hamming_similarity(spark_df_selected_feat.iloc[:80000])
# 4 mins

In [18]:
# hamming_sim_df_90k = pairwise_hamming_similarity(spark_df_selected_feat.iloc[:20000])
# 5 mins


## Building Patient Similarity Network from similarity matrix

In [19]:
def print_graph_info(G):
    """
    Print a short summary of a graph: node and edge counts, a sample of up to
    ten nodes and ten edges (with their data), and any self-loops.

    Parameters:
    G (nx.Graph): The NetworkX graph.
    """
    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}")
    # Only the first ten entries are shown to keep the output short
    print("Sample nodes:", list(G.nodes)[:10])
    print("Sample edges:", list(G.edges(data=True))[:10])

    # Report self-loops, if there are any
    loops = list(nx.selfloop_edges(G))
    if not loops:
        print("No self-loops in the graph.")
        return
    print(f"Number of self-loops: {len(loops)}")
    print("Self-loops:", loops)

def calculate_average_clustering(G):
    """
    Print the average clustering coefficient of the graph (nothing is returned).

    Parameters:
    G (nx.Graph): The NetworkX graph.
    """
    print(f"Average clustering coefficient: {nx.average_clustering(G)}")

### KNN

#### KNN graph building algorithm

In [20]:
def get_k_neighbors(sim_matrix_df, k_neighbors):
    '''
    Helper function to get the k-nearest neighbors for each node in the graph.

    Rows of the similarity matrix are treated as feature vectors: the Euclidean
    distance between every pair of rows is computed, and each node's neighbors
    are the k nodes with the smallest distance to it.

    sim_matrix_df : pd.DataFrame containing the similarity matrix of patient data
    k_neighbors : number of nearest neighbors to consider

    returns : dictionary where keys are the indices of the nodes/patients and values are
    lists of the indices of the k-nearest neighbors (never containing the node itself,
    and never more than k_neighbors entries)
    e.g. {0: [3, 1, 4], 1: [0, 3, 2], 2: [3, 1, 0], 3: [0, 1, 2], 4: [0, 3, 1]}
    '''
    # Local import so the helper does not depend on scipy.spatial having been
    # loaded as a side effect of another import.
    from scipy.spatial import distance_matrix

    print('Using Eucledian distance')
    # Compute euclidean distance matrix between rows of the similarity matrix
    dist_mtx = pd.DataFrame(
        distance_matrix(sim_matrix_df.values, sim_matrix_df.values),
        index=sim_matrix_df.index,
        columns=sim_matrix_df.index,
    )

    k_neighbors_dict = {}
    for node in dist_mtx.index:
        # Ask for one extra candidate because the node itself (distance 0)
        # is normally among the closest entries.
        candidates = dist_mtx.loc[node].nsmallest(k_neighbors + 1).index.tolist()
        # Drop the node itself. If ties pushed it out of the candidate list
        # there is nothing to drop, so truncate to keep exactly k neighbors.
        k_neighbors_dict[node] = [c for c in candidates if c != node][:k_neighbors]

    return k_neighbors_dict

def get_k_neighbors_directly(sim_matrix_df, k_neighbors):
    '''
    Helper function to get the k-nearest neighbors for each node in the graph,
    ranking directly by similarity (largest similarity = nearest neighbor).

    sim_matrix_df : pd.DataFrame containing the similarity matrix of patient data
    k_neighbors : number of nearest neighbors to consider

    returns : dictionary where keys are the indices of the nodes/patients and values are
    lists of the indices of the k-nearest neighbors (never containing the node itself,
    and never more than k_neighbors entries)
    e.g. {0: [3, 1, 4], 1: [0, 3, 2], 2: [3, 1, 0], 3: [0, 1, 2], 4: [0, 3, 1]}
    '''
    k_neighbors_dict = {}
    for node in sim_matrix_df.index:
        # Ask for one extra candidate because the node itself is normally
        # its own most similar entry.
        candidates = sim_matrix_df.loc[node].nlargest(k_neighbors + 1).index.tolist()
        # Drop the node itself. If ties (or a zero diagonal) kept it out of the
        # candidate list there is nothing to drop, so truncate to exactly k.
        k_neighbors_dict[node] = [c for c in candidates if c != node][:k_neighbors]

    return k_neighbors_dict

def build_network_knn(sim_matrix_df , labels,  k_neighbors = 20, compute_knn_directly = True) : 
    '''
    Build an undirected k-nearest-neighbor graph from a similarity matrix.

    sim_matrix_df : pd.DataFrame containing the similarity matrix of patient data
    labels : pd.Series indicating asd status for each patient
    k_neighbors : number of nearest neighbors to consider
    compute_knn_directly : if True, neighbors are ranked by similarity
        (get_k_neighbors_directly); otherwise by Euclidean distance between
        rows of the similarity matrix (get_k_neighbors)

    returns : nx.Graph with one node per row of sim_matrix_df. Nodes carry the
    attributes 'asd' (from labels) and 'idx' (row position); edges carry
    'weight', the similarity between the two endpoints.
    '''

    if compute_knn_directly:
        k_neighbors_dict = get_k_neighbors_directly(sim_matrix_df, k_neighbors)
    else:
        k_neighbors_dict = get_k_neighbors(sim_matrix_df, k_neighbors)

    # Create a NetworkX graph
    G = nx.Graph()

    # Add nodes to the graph
    G.add_nodes_from(sim_matrix_df.index)

    nx.set_node_attributes(G , labels , 'asd')
    nx.set_node_attributes(G , pd.Series(np.arange(len(sim_matrix_df.index)) , index=sim_matrix_df.index) , 'idx')

    # Add edges based on the k-nearest neighbors
    for node, neighbors in k_neighbors_dict.items():
        for neighbor in neighbors:
            if node == neighbor:
                print('Node {} is its own neighbor'.format(node))

            # node / neighbor are index *labels*, so look the weight up by label.
            # Positional lookup (.iloc) is only correct for a 0..n-1 index.
            weight = sim_matrix_df.at[node, neighbor]
            G.add_edge(node, neighbor, weight=weight)

    return G 


def plot_network(G , label_colours = ['skyblue', 'orange'] , node_size = 50, 
                 title = 'Patient Similarity Network', with_labels = False) :
    '''
    Draw the graph with a spring layout, colouring nodes by ASD status.

    G : nx.Graph whose nodes carry the status (0 = non-ASD, 1 = ASD) in a
        'label' or an 'asd' node attribute
    label_colours : two colours, [non-ASD colour, ASD colour]
    node_size : marker size passed to networkx
    title : plot title
    with_labels : if True, draw each node's 'idx' attribute as its text label
    '''

    def node_status(attrs):
        # The graph builders in this notebook store the status under 'asd';
        # 'label' is still honoured first for graphs built with the older name.
        return attrs['label'] if 'label' in attrs else attrs['asd']

    # Define color map based on labels, 0 is non-ASD, 1 is ASD
    label_colour_map = {0: label_colours[0] , 1 : label_colours[1]}
    node_colours = [label_colour_map[node_status(G.nodes[node])] for node in G]

    pos = nx.spring_layout(G, seed=42)  # seed ensures layout is the same each run

    nx.draw_networkx_nodes(G, pos, node_color=node_colours, node_size=node_size, alpha=0.7)
    nx.draw_networkx_edges(G, pos, alpha=0.2)

    if with_labels:
        node_labels = nx.get_node_attributes(G, 'idx')  
        nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=8)

    # Create a patch list for the legend
    label_names = ['No ASD', 'ASD']
    legend_patches = [mpatches.Patch(color=colour, label=label) for label, colour in zip(label_names, label_colours)]

    plt.legend(handles=legend_patches)
    plt.title(title)

    plt.show()
    

#### Example graph with KNN

In [21]:
# n = 500
# k = 20
# cosine_sim_df_small = hamming_sim_df_90k.iloc[:n, :n]

# # G_knn_small = build_network_knn(cosine_sim_df_small, spark_df_asd.iloc[:n], k_neighbors=k, compute_knn_directly=False)
# # title = f"Patient Similarity Network with KNN (n={n}, k={k})"
# # print_graph_info(G_knn_small)
# # plot_network(G_knn_small, node_size=10, title = title)

# G_knn_small = build_network_knn(cosine_sim_df_small, spark_df_asd.iloc[:n], k_neighbors=k, compute_knn_directly=True)
# title = f"Patient Similarity Network with KNN (n={n}, k={k})"
# print_graph_info(G_knn_small)
# plot_network(G_knn_small, node_size=10, title = title)

### Thresholding

In [22]:
def build_network_threshold(sim_matrix_df, labels, threshold=0.8, features_df = None):
    """
    Creates a graph from a similarity matrix using a specified threshold.

    Every row of the matrix becomes a node; an edge (i, j) with j > i is added
    when |similarity[i, j]| >= threshold. The diagonal and the lower triangle
    are ignored.

    Parameters:
    sim_matrix_df (pd.DataFrame): DataFrame containing the similarity matrix.
    labels (pd.Series): Series containing the labels for each patient; stored
        on the nodes as the 'asd' attribute.
    threshold (float): Threshold for including edges based on similarity value.
    features_df: Unused; accepted only so existing calls keep working.

    Returns:
    G (nx.Graph): Graph created from the similarity matrix, with the
        similarity stored as the 'weight' attribute of each edge.
    """
    G = nx.Graph()

    # Add nodes to the graph
    G.add_nodes_from(sim_matrix_df.index)
    nx.set_node_attributes(G , labels , 'asd')

    # Select qualifying pairs in one vectorised pass instead of an
    # element-by-element .iloc lookup inside a Python double loop.
    sim_values = sim_matrix_df.to_numpy()
    # k=1 keeps only the strict upper triangle (j > i): no diagonal, no duplicates
    above_threshold = np.triu(np.abs(sim_values) >= threshold, k=1)
    row_pos, col_pos = np.nonzero(above_threshold)

    G.add_weighted_edges_from(zip(
        sim_matrix_df.index[row_pos],
        sim_matrix_df.columns[col_pos],
        sim_values[row_pos, col_pos],
    ))

    return G



In [23]:
# n = 10000
# n = 100
# hamming_sim_df = pairwise_hamming_similarity(spark_df_selected_feat.iloc[:n])
# hamming_sim_df = pairwise_weighted_hamming_similarity(spark_df_selected_feat.iloc[:n], feat_importances)


In [24]:
# n = 10000
# n = 100
# NOTE - new best parameter used
# t = 0.6

# cosine_sim_df_small = hamming_sim_df.iloc[:n, :n]
# G_small_threshold = build_network_threshold(cosine_sim_df_small, spark_df_asd.iloc[:n] ,threshold=t, features_df=spark_df_selected_feat)
# G_small_threshold = build_network_threshold(cosine_sim_df_small, spark_df_asd.iloc[:n] ,threshold=t)
# print_graph_info(G_small_threshold)
# calculate_average_clustering(G_small_threshold)

# print(G_small_threshold.nodes(data=True))
# title = f"Patient Similarity Network with Threshold (n={n}, threshold={t})"
# plot_network(G_small_threshold, node_size=10, title = title)

# 17 mins for 10k patients

## Cleaning the graph

In [25]:
def clean_graph(G, degree_threshold=1, keep_largest_component=True):
    """
    Cleans the graph by performing several cleaning steps, in this order:
    - Removes self-loops
    - Removes unconnected nodes (isolates)
    - Removes nodes with a degree below a specified threshold (single pass:
      nodes whose degree drops below the threshold as a result are kept)
    - Keeps only the largest connected component (optional)

    The removal steps modify the graph passed in. When
    keep_largest_component is True and the graph is not empty, the returned
    graph is a new, independent copy of the largest component; otherwise the
    (modified) input graph itself is returned.

    Parameters:
    G (nx.Graph): The NetworkX graph to clean.
    degree_threshold (int): Minimum degree for nodes to keep.
    keep_largest_component (bool): Whether to keep only the largest connected component.

    Returns:
    G (nx.Graph): Cleaned graph.
    """
    # Remove self-loops (materialise the list first: edges are being deleted
    # from the same graph the iterator walks over)
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    # Remove nodes with no edges (isolates)
    G.remove_nodes_from(list(nx.isolates(G)))

    # Remove nodes with degree below the threshold
    low_degree_nodes = [node for node, degree in G.degree() if degree < degree_threshold]
    G.remove_nodes_from(low_degree_nodes)

    # Keep only the largest connected component. Skipped for an empty graph,
    # where max() over zero components would raise ValueError.
    if keep_largest_component and G.number_of_nodes() > 0:
        largest_cc = max(nx.connected_components(G), key=len)
        # .copy() turns the read-only subgraph view into a regular graph, so
        # the result can be modified (or cleaned again) like any other graph.
        G = G.subgraph(largest_cc).copy()

    return G

In [26]:
# # Clean the graph by removing unconnected nodes
# G_small_threshold_cleaned = clean_graph(G_small_threshold,
#                                     degree_threshold=5,
#                                     keep_largest_component=False)
# plot_network(G_small_threshold_cleaned, node_size=10, title=f'Cleaned Patient Similarity Network (n={n}, threshold={t})'.format(n,t))

## Louvain Clustering

In [27]:
# from sklearn.cluster import SpectralClustering
# from sklearn.metrics import adjusted_rand_score, rand_score

In [28]:

# communities = nx.community.louvain_communities(G)

def plot_network_clusters(G , communities, node_size = 50, 
                          title = 'Patient Clusters (Louvain clustering)') :
    
    '''
    Draw the graph with a spring layout, colouring every node by its community.

    G : nx.Graph
    communities : list of sets containing the nodes in each community; every
                  node of G must appear in one of them
    node_size : marker size passed to networkx
    title : plot title
    '''

    # Map every node to the position of its community in `communities`
    cluster_map = {}
    for i, community in enumerate(communities):
        for node in community:
            cluster_map[node] = i  # Assign a unique cluster label based on community

    cluster_labels = [cluster_map[node] for node in G.nodes()]

    pos = nx.spring_layout(G, seed=42)  # seed ensures layout is the same each run

    # Scale labels into [0, 1] for the colormap. The divisor is clamped to 1 so
    # a single community (largest label 0) does not divide by zero, and an
    # empty graph does not call max() on an empty array.
    unique_cluster_labels = np.unique(cluster_labels)
    if len(unique_cluster_labels) > 0:
        colour_scale = max(int(unique_cluster_labels.max()), 1)
    else:
        colour_scale = 1
    node_colours = [plt.cm.jet(label / colour_scale) for label in cluster_labels]

    nx.draw_networkx_nodes(G, pos, node_color=node_colours, node_size=node_size, alpha=0.7)
    nx.draw_networkx_edges(G, pos, alpha=0.2)

    # Create a patch list for the legend (clusters are numbered from 1 in the legend)
    legend_handles = [mpatches.Patch(color=plt.cm.jet(i / colour_scale), label=f'Cluster {i+1}') for i in unique_cluster_labels]

    plt.legend(handles=legend_handles)
    plt.title(title)

    plt.show()



In [29]:
import os
import pickle
import time

import gc
import psutil

def print_memory_usage():
    """
    Print the resident memory of the current process (in GB) and the
    system-wide memory usage percentage, framed by separator lines.
    """
    separator = "---------------------------------"
    rss_gb = psutil.Process().memory_info().rss / (1024 ** 3)
    system_percent = psutil.virtual_memory().percent

    print(separator)
    print(f"Process Memory Usage: {rss_gb:.2f} GB")
    print(f"Memory Percentage: {system_percent}%")
    print(separator)

In [35]:
# For every sample size: draw 5 stratified samples, build a weighted-Hamming KNN
# graph for each, cluster it with Louvain, and pickle the sample, the
# communities and the graph under sampling_louvain_results_knn/<size>/<sample>/.
sample_sizes = [1000, 2000, 4000, 6000, 8000]
for sample_size in sample_sizes:
    
    # Each patient is linked to a number of neighbours equal to 10% of the sample
    k = sample_size // 10
    print(f"Using sample size: {sample_size}")
    print(f"Using k: {k}")
    sample_dfs = get_sample_datasets(spark_df_selected_feat, sample_size, 5)

    for i, sample_df in enumerate(sample_dfs):

        print(f"Processing sample {i+1}...")
        start_time = time.time()

        # 'index' holds the original row labels added by reset_index; it is not a feature
        x = sample_df.drop(['asd', 'index'], axis=1)
        y = sample_df['asd']

        # The weights are positional, so the feature columns must be in the
        # same order as the importance ranking they were loaded with.
        assert list(x.columns) == list(features_sorted_by_importance), \
            "Feature columns are not in the same order as feat_importances"

        sim_df = pairwise_weighted_hamming_similarity(x, feat_importances)
        # G = build_network_threshold(sim_df, labels=y ,threshold=t, features_df=x)
        G = build_network_knn(sim_df , labels=y,  k_neighbors = k, compute_knn_directly = True)
        # Louvain is randomised; seed it (with the sample number, matching the
        # random_state used for sampling) so a re-run yields the same clusters.
        communities = nx.community.louvain_communities(G, seed=i)
        # plot_network_clusters(G, communities, title='Patient Similarity Network with Clusters')
        
        # node id -> position of its community in `communities`
        cluster_membership = {}
        for idx, community in enumerate(communities):
            for node_id in community:
                cluster_membership[node_id] = idx

        # Graph nodes are the 0..n-1 row labels of sample_df, so map via the index
        sample_df['cluster'] = sample_df.index.map(cluster_membership)

        print("Saving results...")
        folder_path = f"sampling_louvain_results_knn/{sample_size}/{i}"
        os.makedirs(folder_path, exist_ok=True)

        # Save sample_df
        sample_df_path = os.path.join(folder_path, 'sample_df.pkl')
        with open(sample_df_path, 'wb') as file:
            pickle.dump(sample_df, file)

        # Save communities
        communities_path = os.path.join(folder_path, 'communities.pkl')
        with open(communities_path, 'wb') as file:
            pickle.dump(communities, file)

        # Save G
        g_path = os.path.join(folder_path, 'G.gpickle')
        with open(g_path, 'wb') as file:
            pickle.dump(G, file)

        end_time = time.time()
        elapsed_time_mins = (end_time - start_time)/60
        print(f"Total time taken: {elapsed_time_mins:.2f} minutes.")
        print()

        # Clear variables to free up memory
        # print_memory_usage()
        del G, x, y, sim_df, communities, cluster_membership
        gc.collect()
        # print("garbage collected")
        # print_memory_usage()
        # print()



Using sample size: 1000
Using k: 100
Processing sample 1...
Saving results...
Total time taken: 0.08 minutes.

Processing sample 2...
Saving results...
Total time taken: 0.08 minutes.

Processing sample 3...
Saving results...
Total time taken: 0.08 minutes.

Processing sample 4...
Saving results...
Total time taken: 0.08 minutes.

Processing sample 5...
Saving results...
Total time taken: 0.08 minutes.

Using sample size: 2000
Using k: 200
Processing sample 1...
Saving results...
Total time taken: 0.31 minutes.

Processing sample 2...
Saving results...
Total time taken: 0.31 minutes.

Processing sample 3...
Saving results...
Total time taken: 0.30 minutes.

Processing sample 4...
Saving results...
Total time taken: 0.31 minutes.

Processing sample 5...
Saving results...
Total time taken: 0.30 minutes.

Using sample size: 4000
Using k: 400
Processing sample 1...
Saving results...
Total time taken: 1.24 minutes.

Processing sample 2...
Saving results...
Total time taken: 1.23 minutes.

P

In [34]:

# sample_size=20000
# k = int(sample_size/10)
# print(f"Using sample size: {sample_size}")
# print(f"Using k: {k}")
# sample_dfs = get_sample_datasets(spark_df_selected_feat, sample_size, 5)

# for i, sample_df in enumerate(sample_dfs):

#     print(f"Processing sample {i+1}...")
#     start_time = time.time()

#     x = sample_df.drop(['asd', 'index'], axis=1)
#     y = sample_df['asd']
#     sim_df = pairwise_weighted_hamming_similarity(x, feat_importances)
#     # G = build_network_threshold(sim_df, labels=y ,threshold=t, features_df=x)
#     G = build_network_knn(sim_df , labels=y,  k_neighbors = k, compute_knn_directly = True)
#     communities = nx.community.louvain_communities(G)
#     # plot_network_clusters(G, communities, title='Patient Similarity Network with Clusters')
    
#     cluster_membership = {}
#     for idx, community in enumerate(communities):
#         for node_id in community:
#             cluster_membership[node_id] = idx

#     sample_df['cluster'] = sample_df.index.map(cluster_membership)

#     print("Saving results...")
#     folder_path = f"sampling_louvain_results_knn/{sample_size}/{i}"
#     os.makedirs(folder_path, exist_ok=True)

#     # Save sample_df
#     sample_df_path = os.path.join(folder_path, 'sample_df.pkl')
#     with open(sample_df_path, 'wb') as file:
#         pickle.dump(sample_df, file)

#     # Save communities
#     communities_path = os.path.join(folder_path, 'communities.pkl')
#     with open(communities_path, 'wb') as file:
#         pickle.dump(communities, file)

#     # Save G
#     g_path = os.path.join(folder_path, 'G.gpickle')
#     with open(g_path, 'wb') as file:
#         pickle.dump(G, file)

#     end_time = time.time()
#     elapsed_time_mins = (end_time - start_time)/60
#     print(f"Total time taken: {elapsed_time_mins:.2f} minutes.")
#     print()

#     # Clear variables to free up memory
#     # print_memory_usage()
#     del G, x, y, sim_df, communities, cluster_membership
#     gc.collect()
#     # print("garbage collected")
#     # print_memory_usage()
#     # print()

# # 8k - 20 mins/sample; 5 mins/sample for knn
# # 6k - 10 mins/sample; 3 mins/sample for knn
# # 4k - 5 mins/sample;    1.5 mins/sample for knn
# # 2k - 1 mins/sample; 0.5 mins/sample for knn
# # 1k - 0.5 mins/sample
# # 20k - 130 mins/sample; 33 mins/sample for knn
# # 10k - 40ish mins/sample ; 8 mins/sample for knn

Using sample size: 20000
Using k: 2000


In [33]:
# Completion marker printed once the sampling runs have finished.
print("All samples processed.")

All samples processed.


In [32]:
# # import pandas as pd
# # import numpy as np
# import seaborn as sns
# # import matplotlib.pyplot as plt
# # import networkx as nx

# # Assuming 'G' is your graph and 'communities' is a list of sets, where each set contains the nodes in a community
# # Step 1: Extract Node Attributes and Cluster Membership
# node_attributes = {node: G.nodes[node] for node in G.nodes()}
# # print(node_attributes)
# cluster_membership = {node: idx for idx, community in enumerate(communities) for node in community}

# # Add cluster membership as an attribute
# nx.set_node_attributes(G, cluster_membership, 'cluster')

# # Step 2: Aggregate Attributes by Cluster
# # Initialize a list to hold aggregated data
# aggregated_data = []

# # Iterate over each community to aggregate attributes
# for idx, community in enumerate(communities):
#     # print(idx, community)
#     community_attributes = [node_attributes[node] for node in community]
#     df = pd.DataFrame(community_attributes)
#     df['cluster'] = idx + 1
#     aggregated_data.append(df)




# # Concatenate all community data
# all_data = pd.concat(aggregated_data)

# # all_data = sample_df

# # Calculate mean and standard deviation for each attribute by cluster
# mean_values = all_data.groupby('cluster').mean()
# std_values = all_data.groupby('cluster').std()

# print(mean_values)

# # Step 3: Prepare Data for Heatmap
# # For simplicity, focusing on mean values; you can similarly plot std_values
# heatmap_data = mean_values

# # Step 4: Plot Heatmap
# plt.figure(figsize=(8, 5))
# sns.heatmap(heatmap_data, annot=False, cmap='viridis')
# plt.title('Mean Values of Node Attributes by Cluster (10k Patients)')
# plt.ylabel('Cluster')
# plt.xlabel('Attribute')
# plt.show()

In [94]:
# # Specify the number of decimal places
# decimal_places = 2

# # Round mean_values and std_values
# rounded_mean_values = mean_values.round(decimal_places)
# rounded_std_values = std_values.round(decimal_places)

# # Initialize an empty DataFrame with the same index as mean_values or std_values
# formatted_table = pd.DataFrame(index=rounded_mean_values.index)

# # Iterate over columns in rounded_mean_values to format "mean (std)" strings
# for col in rounded_mean_values.columns:
#     formatted_table[col] = rounded_mean_values[col].astype(str) + " (" + rounded_std_values[col].astype(str) + ")"

# # Display the formatted table
# print(formatted_table)

In [95]:
# print(formatted_table.iloc[:,:14])
# print()
# print(formatted_table.iloc[:,14:])

In [96]:
# pickle_folder = "louvain_pickles/mf_10k_weighted/"

# mean_values.to_pickle(pickle_folder + "mean_values.pkl")
# std_values.to_pickle(pickle_folder + "std_values.pkl")
# formatted_table.to_pickle(pickle_folder + "formatted_table.pkl")
# pd.to_pickle(all_data, pickle_folder + 'all_data.pkl')


In [97]:
# import pickle

# with open(pickle_folder + 'communities.pkl', 'wb') as file:
#     pickle.dump(communities, file)

In [98]:
# with open(pickle_folder + 'G.pkl', 'wb') as file:
#     pickle.dump(G, file)

In [99]:
# formatted_table