In [None]:
import pandas as pd 
import numpy as np 
import networkx as nx 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
#import nx_altair as nxa
#import snf
sns.set_style("darkgrid")


#### Functions:

In [None]:
# Participant IDs to keep, in the row order wanted for the analysis.
# It is passed to sort_df_IDs below, which both filters the clinical table to these IDs
# and orders the rows by their position in this list. Left empty here; fill in before running.
matched_IDs = [] # list of participant IDs

In [None]:
def sort_df_IDs(df, ID_list):
    """
    Takes in a df with a 'participant_id' column and a list of IDs
    keeps only the rows whose participant_id is in ID_list
    orders the rows by the position of their ID in ID_list
    returns the filtered and ordered df with a fresh 0..n-1 index
    """
    # Position of every ID in the requested order (used as the sort key).
    rank_of_id = {pid: rank for rank, pid in enumerate(ID_list)}

    keep_rows = df['participant_id'].isin(ID_list)
    selected = df[keep_rows]

    ordered = selected.sort_values(
        by=['participant_id'],
        key=lambda ids: ids.map(rank_of_id),
    )
    return ordered.reset_index(drop=True)


In [None]:
def sex_to_binary(df_):
    """
    Takes in a df with a 'sex' column
    recodes 'male' -> 0 and 'female' -> 1; any other entry (including NaN) is left unchanged
    returns a recoded copy, the input df is not modified
    """
    codes = {'male': 0, 'female': 1}
    df = df_.copy()
    # Recode the whole column at once. Element-wise chained assignment (df['sex'][i] = ...)
    # writes through a temporary Series and looks rows up by label, so it can silently
    # do nothing (copy-on-write, non-default index).
    df['sex'] = df['sex'].map(lambda entry: codes.get(entry, entry))
    return df 

In [None]:
def handedness_to_binary(df_):
    """
    Takes in a df with a 'handedness' column
    recodes 'right' -> 1, 'left' -> 0 and 'ambidex' -> 2; any other entry (including NaN) is left unchanged
    returns a recoded copy, the input df is not modified
    """
    codes = {'right': 1, 'left': 0, 'ambidex': 2}
    df = df_.copy()
    # Recode the whole column at once. Element-wise chained assignment (df['handedness'][i] = ...)
    # writes through a temporary Series and looks rows up by label, so it can silently
    # do nothing (copy-on-write, non-default index).
    df['handedness'] = df['handedness'].map(lambda entry: codes.get(entry, entry))
    return df

In [None]:
def dep_variable(df_): 
    """
    Takes in a df with columns 'dep_scale' and 'dep_scale_score'
    creates a variable dep_tot (0 or 1) based on the dep scores from different scales madrs and gds:
        madrs: score >= 12 -> 1, otherwise 0
        gds:   score > 5   -> 1, otherwise 0
        any other scale, or a missing/non-numeric score -> 0
    returns a copy of the df with the new dep_tot column, the input df is not modified
    """

    df = df_.copy()

    scale = df['dep_scale']
    # Non-numeric scores become NaN; every comparison with NaN is False, so such rows get 0.
    score = pd.to_numeric(df['dep_scale_score'], errors='coerce')

    madrs_depressed = (scale == 'madrs') & (score >= 12)
    gds_depressed = (scale == 'gds') & (score > 5)

    # Each row is decided on its own data. A row-by-row loop with a carried-over variable
    # would reuse the previous row's result whenever no branch matched.
    df['dep_tot'] = (madrs_depressed | gds_depressed).astype(int)

    return df

In [None]:
def bmi(df_):
    """
    Takes in a df with columns 'height_cm' and 'weight_kg'
    computes body mass index = weight_kg / (height in metres)^2 for every row
    returns a copy of the df with a new float column 'bmi', the input df is not modified
    """

    df = df_.copy()

    # Vectorised over the whole column: this avoids chained assignment into an
    # integer-initialised column and does not depend on the index labels.
    height_m = df['height_cm'] * 0.01
    df['bmi'] = df['weight_kg'] / (height_m ** 2)

    return df
    

In [None]:
def scale_df(df_, cols, sc):
    '''
    Takes in a dataframe (can contain participant and session id), a list of variables for column names, name of scaler 
    scales the given columns with standardscaling (sc='standard') or minmax (sc='minmax')
    all other columns (e.g. participant and session id) are left untouched
    returns a scaled copy of the df, the input df is not modified

    raises ValueError if sc is not 'standard' or 'minmax'
    '''
    if sc == 'standard':
        scaler = StandardScaler()
    elif sc == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("sc must be 'standard' or 'minmax', got " + repr(sc))

    df = df_.copy()
    cols = list(cols)

    scaled_values = scaler.fit_transform(df[cols])
    # Reuse the frame's own index so the assignment is row-for-row; a fresh 0..n-1 index
    # would be aligned by label and turn rows into NaN on a non-default index.
    df[cols] = pd.DataFrame(scaled_values, columns=cols, index=df.index)

    return df

In [None]:
def scale_adjMatrix(adjM, sc):
    '''
    Takes in a square adjacency matrix and the name of a scaler ('standard' or 'minmax')
    scales the values above the diagonal (strict upper triangle) as one pooled sample
    the diagonal and the lower triangle are copied unchanged
    returns the scaled matrix as a new float array, the input is not modified

    raises ValueError if sc is not 'standard' or 'minmax'
    '''

    if sc == 'standard':
        scaler = StandardScaler()
    elif sc == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("sc must be 'standard' or 'minmax', got " + repr(sc))

    # Float copy so scaled values are not truncated when the input has an integer dtype.
    adjM_sc = np.array(adjM, dtype=float)

    mask = np.triu(np.ones_like(adjM_sc, dtype=bool), k=1) # True only above the diagonal: these are the entries that get scaled
    if not mask.any():
        # 0x0 or 1x1 matrix: nothing above the diagonal to fit a scaler on.
        return adjM_sc

    adjM_sc[mask] = scaler.fit_transform(adjM_sc[mask].reshape(-1, 1)).flatten()

    return adjM_sc

In [None]:
def makeAdjMatrix(df, sim_metric, save_adjM): #'Eucl', 'Cos', 'Pears'
    '''
    Takes in a df with one row per participant session and only numeric feature columns,
    a similarity metric name and a flag for saving the result

    sim_metric:
        'Eucl'  : similarity = 1 / (1 + euclidean distance)
        'Cos'   : cosine similarity
        'Pears' : pearson correlation between the rows

    returns an n x n array with the pairwise similarities in the upper triangle,
    the self-similarity on the diagonal and zeros below the diagonal
    if save_adjM is truthy the matrix is also written to 'adjM_clinical_<sim_metric>.csv'

    raises ValueError for an unknown sim_metric
    '''

    if sim_metric not in ('Eucl', 'Cos', 'Pears'):
        raise ValueError("sim_metric must be 'Eucl', 'Cos' or 'Pears', got " + repr(sim_metric))

    n = len(df) # find number of rows (number of participant sessions)

    if sim_metric == 'Eucl':
        # Imported here because the notebook's import cell does not provide these names.
        from scipy.spatial.distance import pdist, squareform

        distances = pdist(df.values, metric='euclidean') # Compute pairwise Euclidean distances between participants in a condensed array
        sq_distances = squareform(distances) # Convert the condensed distance matrix into a square distance matrix
        # Keep the similarities above the diagonal and put 1s along the diagonal:
        adj_matrix = np.triu(1 / (1 + sq_distances), k=1) + np.eye(n)

    elif sim_metric == 'Cos':
        similarities = cosine_similarity(df.values)
        # Keep the similarities above the diagonal and put 1s along the diagonal:
        adj_matrix = np.triu(similarities, k=1) + np.eye(n)

    else: # 'Pears'
        df_trans = df.transpose() # Transpose the df so that participants are along the columns (necessary for the correlation function)
        sim_matrix_p = df_trans.corr() # Compute the similarity matrix with pearson correlation
        adj_matrix = np.triu(sim_matrix_p.values) # Extract the upper triangular part of the array (including the diagonal)

    if save_adjM:
        np.savetxt(f'adjM_clinical_{sim_metric}.csv', X=adj_matrix, delimiter=',')

    return adj_matrix

In [1]:
def create_PSN(adj, phi, s, title): 
    '''
    Builds and draws a similarity network from adjacency matrix adj.

    adj   : square numpy array, only the entries above the diagonal are read
    phi   : cutoff, a pair (i, j) gets an edge only if adj[i, j] > phi
    s     : seed for the spring layout
    title : figure title, the cutoff is appended to it

    returns the network (one node per row of adj, edges weighted by similarity)
    and the list of all upper-triangle similarities in row-major order
    '''

    full_graph = nx.from_numpy_array(adj)
    node_total = len(full_graph.nodes)

    PSN = nx.Graph()
    PSN.add_nodes_from(full_graph.nodes)

    # Every unordered pair (row < col), visited row by row.
    pairs = [(row, col) for row in range(node_total) for col in range(row + 1, node_total)]
    similarities = [adj[row, col] for row, col in pairs]

    # Only pairs strictly above the cutoff become edges.
    for (row, col), pair_sim in zip(pairs, similarities):
        if pair_sim > phi:
            PSN.add_edge(row, col, weight = pair_sim)

    plt.figure(figsize=(15,10))
    plt.title(title + ', phi= ' + str(phi))
    layout = nx.spring_layout(PSN, seed = s, k=1/3)
    nx.draw_networkx_nodes(PSN, pos = layout, node_size = 80, alpha=0.7)
    nx.draw_networkx_edges(PSN, pos = layout, width=2, alpha=1/10)

    # Plain white canvas without the seaborn grid.
    plt.grid(False)
    axes = plt.gca()
    axes.set_facecolor('white')

    return PSN, similarities

In [None]:
def plot_similarities(sim, title):
    '''
    Takes in a list/array of similarities and a figure title
    plots a histogram (100 bins) with two dashed vertical lines:
        thin line  : mean similarity
        thick line : cutoff for the top 10 percent highest similarities
    prints the mean and the top 10 percent cutoff

    NaN similarities are ignored for the mean and for the cutoff
    raises ValueError if there is no non-NaN similarity
    '''

    sim_values = np.asarray(sim, dtype=float)
    valid_sim = sim_values[~np.isnan(sim_values)]
    if valid_sim.size == 0:
        raise ValueError('plot_similarities needs at least one non-NaN similarity')

    # One mean for both the drawn line and the printed value.
    mean_sim = np.nanmean(sim)

    # NaNs are removed before sorting; with NaNs present the sort order, and therefore
    # the cutoff, would be unreliable.
    sorted_sim = np.sort(valid_sim)[::-1]
    top_10_index = int(0.1 *len(sorted_sim))
    top_10_cutoff = sorted_sim[top_10_index]

    plt.figure(figsize=(13,8))
    plt.title(title)
    plt.xlabel('Similarities')

    sns.set(font_scale=1.2) 
    sns.histplot(sim, bins=100)

    plt.axvline(mean_sim, color='k', linestyle='dashed', linewidth=1)
    plt.axvline(top_10_cutoff, color = 'k', linestyle='dashed', linewidth=2)

    plt.show()

    print('mean: ' + str(mean_sim))
    print('Top 10 percent similarities: ' + str(top_10_cutoff))
    

In [None]:
# Columns selected from the clinical table. 'participant_id' is first on purpose:
# later cells use picked_some_var[1:] as the list of feature columns.
# NOTE: the pre-processing cell appends to / removes from this list in place
# ('bmi' and 'dep_tot' are added, the height/weight and dep_scale columns are removed),
# so re-run this cell before re-running the loading cell.
picked_some_var= ['participant_id',
 'age',
 'sex',
 'education',
 'handedness',
 'children_num',
 'cog_scale',
 'cog_scale_score',
 'cdr_gs',
 'dep_scale',
 'dep_scale_score',
 'alc_scale_score',
 'height_cm',
 'weight_kg',
 'dem_num_firstdeg',
 'smoke',
 'physact_freq',
 'heart',
 'hypertension',
 'vascular',
 'liver',
 'cpns',
 'cancer'
 ]

#### Pre-processing:

In [None]:
# Read the clinical table from sheet 'Sheet2' of the Excel file.
clinical_df = pd.read_excel('..........xlsx', sheet_name='Sheet2')
# Keep only the selected columns. This needs picked_some_var in its original state:
# after the pre-processing cell has run the list contains derived columns ('bmi', 'dep_tot')
# that do not come from this selection.
picked_df = clinical_df[picked_some_var]
# Keep only the participants in matched_IDs, ordered as in that list, with a fresh 0..n-1 index.
picked_df = sort_df_IDs(picked_df.copy(), matched_IDs)

In [None]:
# setting some nan-values and extreme values to valid numbers:
# Rows are addressed with .loc instead of chained indexing (picked_df[col][row] = ...),
# which assigns through a temporary object and may not reach picked_df.
# The row labels are the positions produced by sort_df_IDs, which resets the index.

picked_df.loc[6, 'dem_num_firstdeg'] = 0 # setting an extreme value (60) to 0

# two participants had height 1m and weight 20kg. setting to around normal measures.
picked_df.loc[5, 'height_cm'] = 175
picked_df.loc[5, 'weight_kg'] = 80
picked_df.loc[117, 'height_cm'] = 165
picked_df.loc[117, 'weight_kg'] = 60

# missing values get a fixed replacement per column:
picked_df = picked_df.fillna({'dem_num_firstdeg': 0, 'smoke': 0, 'physact_freq': 4})

# changing variable values for 'sex' ('male', 'female') to (0, 1):
picked_df = sex_to_binary(picked_df)
# changing variable values for 'handedness' ('right', 'left', 'ambidex') to (1, 0, 2):
picked_df = handedness_to_binary(picked_df)
# height and weight are replaced by the derived 'bmi' column:
picked_df = bmi(picked_df)

picked_some_var.append('bmi')
picked_some_var.remove('height_cm')
picked_some_var.remove('weight_kg')

#picked_df = smoke_variable(picked_df)
#picked_some_var.append('smoke_tot')
#picked_some_var.remove('smoke')

# scale name and raw score are replaced by the binary 'dep_tot' column:
picked_df = dep_variable(picked_df)
picked_some_var.append('dep_tot')
picked_some_var.remove('dep_scale')
picked_some_var.remove('dep_scale_score')

# NOTE: picked_some_var was changed in place above, so this cell can only be run once
# per fresh definition of picked_some_var and picked_df.
picked_df = picked_df[picked_some_var]


In [None]:
# Min-max scale every selected column except the first entry of picked_some_var ('participant_id').
df_clin_sc = scale_df(picked_df, picked_some_var[1:], 'minmax')
df_clin_sc

#### PSN implementation:

In [None]:
# Participant IDs in the row order of the scaled table; the displayed length is the number of rows.
IDs_clin = list(df_clin_sc['participant_id'])
len(IDs_clin)


In [None]:
# Cosine-similarity adjacency matrix over the scaled feature columns (participant_id excluded).
# save_adjM=True also writes the matrix to 'adjM_clinical_Cos.csv' in the working directory.
adjM = makeAdjMatrix(df_clin_sc[picked_some_var[1:]], 'Cos', True)

In [None]:
# Standard-scaled version of the upper-triangle similarities.
# NOTE(review): adjM_sc is not used by the later cells shown here; create_PSN is called with the unscaled adjM. Confirm this is intended.
adjM_sc = scale_adjMatrix(adjM, 'standard')

In [None]:
# Build and draw the network from the unscaled cosine similarities: cutoff phi=0.772, layout seed 100.
# n in the title is read from the matrix so it always matches the data that is plotted.
clin_PSN, clin_sim = create_PSN(adjM, 0.772, 100, f'PSN clinical data (n={adjM.shape[0]})') # adj, phi, s, title

In [None]:
# Histogram of all pairwise similarities returned by create_PSN, with the mean and the top-10-percent cutoff marked.
plot_similarities(clin_sim, 'clinical similarities')