In [None]:
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import heapq
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform
sns.set_style("darkgrid")
from networkx.algorithms.community.quality import modularity

#### Functions: 

In [None]:
# Participant IDs to include in the analysis. The data-loading cell further down
# builds relevant_ids from this list (minus the hold-out IDs), so while it is
# empty no participants are selected.
matched_IDs = [] # participant ids

In [None]:
def read_cantab_file(file, enc):
    """
    Load a CANTAB CSV export into a dataframe.

    file: path (or file-like object) handed to pandas.read_csv
    enc:  text encoding used to decode the file, e.g. 'ISO-8859-1'

    returns the dataframe
    """
    return pd.read_csv(file, encoding=enc)

In [None]:
def remove_vars(all_var, rem_var):
    '''
    Drops the variables that were decided a priori to be left out of the
    framework built on the CANTAB data.

    A name is dropped when it is listed in rem_var, or when it contains the
    substring 'SD' (standard deviation variables) or 'MOT' (the MOT task).

    returns the reduced list of variables, in their original order
    '''

    def keep(name):
        # explicit removals first, then the two substring rules
        if name in rem_var:
            return False
        return 'SD' not in name and 'MOT' not in name

    return list(filter(keep, all_var))

In [None]:
# With an empty removal list only the 'SD' / 'MOT' substring filters of remove_vars apply.
# NOTE(review): cols_var is first assigned further down (by the preprocess_df call in the
# data-loading cell), so unless it is defined elsewhere this cell raises NameError when
# the notebook is run top to bottom on a fresh kernel.
test = remove_vars(cols_var, [])

In [None]:
def preprocess_df(df, visit, site):
    """
    Reduce the raw CANTAB frame to one visit (and optionally one site) and to
    the ID columns plus the variables that are kept for the analysis.

    df:    raw CANTAB dataframe
    visit: value of 'Visit Identifier' to keep
    site:  one of 1-4 keeps only that 'Site'; any other value keeps all sites

    Relies on the notebook-level lists var_to_remove and var_na.

    returns (reduced dataframe, list of kept variables,
             the same list preceded by 'Participant ID' and 'session_id')
    """
    known_sites = [1, 2, 3, 4]

    # keep only the sessions of the requested visit
    subset = df[df['Visit Identifier'] == visit]

    # restrict to a single site only when a known site number is given
    if site in known_sites:
        subset = subset[subset['Site'] == site]

    # the variable columns come after this marker column; the very last column is left out
    columns = list(subset.columns)
    first_var = columns.index('SWM Recommended Standard 2.0 Extended Observation') + 1
    candidate_vars = columns[first_var:-1]

    # drop the variables that are not to be included
    variables = remove_vars(candidate_vars, var_to_remove + var_na)
    print('variablene: ' + str(len(variables)))

    # same variables, with the two ID columns in front
    var_with_ids = ['Participant ID', 'session_id'] + variables

    subset = subset[var_with_ids].reset_index(drop=True)

    return subset, variables, var_with_ids
    
    

In [None]:
def in_both_dfs(df1, df2):
    """
    Find the participants that are present in both dataframes.

    Prints the length of each 'Participant ID' column, then returns a list
    with the IDs that occur in both (in no particular order).
    """
    first_ids, second_ids = df1['Participant ID'], df2['Participant ID']

    for ids in (first_ids, second_ids):
        print(len(ids))

    return list(set(first_ids).intersection(second_ids))

In [None]:
def scale_df(df_, cols, sc):
    '''
    Scales selected columns of a dataframe and returns the result as a new dataframe.

    df_:  dataframe (can contain participant and session id); it is not modified
    cols: list of column names to scale
    sc:   name of the scaler, 'standard' (StandardScaler) or 'minmax' (MinMaxScaler)

    returns a copy of df_ where the columns in cols hold the scaled values and
    all other columns (e.g. participant and session id) are kept unchanged

    raises ValueError if sc is not one of the supported scaler names
    '''
    # Reject unknown names up front instead of failing later on an unbound `scaler`.
    if sc not in ('standard', 'minmax'):
        raise ValueError(f"Unknown scaler {sc!r}; expected 'standard' or 'minmax'")

    scaler = StandardScaler() if sc == 'standard' else MinMaxScaler()

    df = df_.copy()
    scaled = scaler.fit_transform(df[cols])

    # Reuse the index of df: the assignment below aligns on index labels, so a
    # default 0..n-1 index would misplace rows (or insert NaN) whenever df_ was
    # filtered without a following reset_index.
    df[cols] = pd.DataFrame(scaled, columns=cols, index=df.index)

    return df

In [None]:
# Variables that are decided not to be included: bad measures, irrelevant measures
# or wrong representation/calculation. Passed (together with var_na) to remove_vars
# inside preprocess_df. Entries are grouped by name prefix.
var_to_remove = [
    # DMS* variables
    'DMSCC',
    'DMSML',
    'DMSML0',
    'DMSML12',
    'DMSML4',
    'DMSMLAD',
    'DMSMLS',
    'DMSPEGC',
    'DMSPEGE',
    'DMSTC0',
    'DMSTC12',
    'DMSTC4',
    'DMSTCAD',
    'DMSTCS',
    'DMSTEAD',
    'DMSTEC',
    'DMSTECAD',
    'DMSTED',
    'DMSTEDAD',
    'DMSTEP',
    'DMSTEPAD',

    # PAL* variables
    'PALTA12',
    'PALTA2',
    'PALTA28',
    'PALTA4',
    'PALTA6',
    'PALTA8',

    # PRM* variables
    'PRMMCLD',
    'PRMMCLI',

    # RVP* variables
    'RVPML',

    # SWM* variables
    'SWMDE4',
    'SWMDE6',
    'SWMDE8',
    'SWMPR',
    'SWMS',
    'SWMS6',
    'SWMSX',
    'SWMTE4',
    'SWMTE6',
    'SWMTE8',
    'SWMWE4',
    'SWMWE6',
    'SWMWE8'
]

In [None]:
# Variables removed because their number of NaN values was very high.
# Concatenated with var_to_remove inside preprocess_df; 'DMSML12' is listed in
# both lists, which is harmless for the membership test in remove_vars.
var_na = ['DMSMDL12',
 'DMSML12',
 'PALMETS28',
 'SWMBE12',
 'SWMDE12',
 'SWMTE12',
 'SWMWE12']

In [None]:
def makeAdjMatrix(df, sim_metric, visit, save_adjM):
    '''
    Builds an upper-triangular similarity (adjacency) matrix between the rows of df.

    df:         dataframe with one row per participant session and one column per variable
    sim_metric: 'Eucl'  -> 1 / (1 + euclidean distance)
                'Cos'   -> cosine similarity
                'Pears' -> (pearson correlation + 1) / 2
    visit:      label that is only used in the name of the saved file
    save_adjM:  if true, the matrix is written to 'adjM_{visit}_{sim_metric}.csv'

    returns an n x n numpy array with the pairwise similarities above the
    diagonal and zeros below it. The diagonal is 1 for 'Eucl' and 'Cos'; for
    'Pears' it holds the rescaled correlation of each row with itself.

    raises ValueError if sim_metric is not one of the names above
    '''
    # An unknown name used to fall through every branch and silently return the
    # identity matrix; fail loudly instead.
    if sim_metric not in ('Eucl', 'Cos', 'Pears'):
        raise ValueError(
            f"Unknown sim_metric {sim_metric!r}; expected 'Eucl', 'Cos' or 'Pears'"
        )

    n = len(df) # number of rows (number of participant sessions)

    if sim_metric == 'Pears':
        # DataFrame.corr correlates columns, so transpose to get participants along the columns
        matrix_p = df.transpose().corr()
        sim_matrix_p = (matrix_p + 1) / 2 # map correlations from [-1, 1] to [0, 1]
        adj_matrix = np.triu(sim_matrix_p.values) # upper triangle including the diagonal
    else:
        if sim_metric == 'Eucl':
            # condensed pairwise distances -> square matrix -> similarity in (0, 1]
            sq_distances = squareform(pdist(df.values, metric='euclidean'))
            similarities = 1 / (1 + sq_distances)
        else:
            similarities = cosine_similarity(df.values)

        # 1s along the diagonal, similarities strictly above it, 0s below
        adj_matrix = np.eye(n)
        upper = np.triu_indices(n, k=1)
        adj_matrix[upper] = similarities[upper]

    if save_adjM:
        np.savetxt(f'adjM_{visit}_{sim_metric}.csv', X=adj_matrix, delimiter=',')

    return adj_matrix

In [None]:
def create_PSN(adj, phi, s, title): 
    '''
    Builds a similarity network from an adjacency matrix and draws it with a spring layout.

    adj:   adjacency matrix; only the entries above the diagonal are read
    phi:   cutoff, two nodes are connected only when their similarity is > phi
    s:     seed for the spring layout
    title: plot title, the cutoff is appended to it

    returns network and similarity list (one value per node pair, row by row)
    '''

    # the full graph is only used to find out how many nodes there are
    node_count = nx.from_numpy_array(adj).number_of_nodes()

    PSN = nx.Graph()
    PSN.add_nodes_from(range(node_count))

    # every pair (i, j) with i < j, in row-major order
    pairs = [(i, j) for i in range(node_count) for j in range(i + 1, node_count)]
    similarities = [adj[i, j] for i, j in pairs]

    # keep an edge only for the pairs that are more similar than the cutoff
    PSN.add_weighted_edges_from(
        (i, j, sim) for (i, j), sim in zip(pairs, similarities) if sim > phi
    )

    plt.figure(figsize=(10, 10))
    plt.title(title + ', cutoff= ' + str(phi))
    layout = nx.spring_layout(PSN, seed=s, k=0.5)
    nx.draw_networkx_nodes(PSN, pos=layout, node_size=80, alpha=0.7)
    nx.draw_networkx_edges(PSN, pos=layout, width=2, alpha=0.1)

    # plain white background without grid lines
    plt.grid(False)
    plt.gca().set_facecolor('white')

    return PSN, similarities

In [None]:
def plot_similarities(sim, title):
    '''
    Plots a histogram of the similarities and marks the cutoff of the highest 10 percent.

    sim:   list of pairwise similarity values (e.g. the list returned by create_PSN)
    title: plot title

    Shows the figure and prints the mean (ignoring NaN) and the cutoff value.
    returns nothing
    '''

    plt.figure(figsize=(10,10))
    plt.title(title)
    plt.xlabel('Similarities')

    # NOTE(review): sns.set changes seaborn's global settings, so it presumably also
    # affects plots made after this call — confirm that this is intended.
    sns.set(font_scale=2) 
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    sns.histplot(sim, bins=100)

    #plt.axvline(np.mean(sim), color='k', linestyle='dashed', linewidth=1)

    # Cutoff = value at position int(0.1 * len) of the descending sort, i.e. roughly
    # 10 percent of the values are larger than it. Raises IndexError for an empty list.
    sorted_sim = sorted(sim, reverse=True)
    top_10_index = int(0.1 *len(sorted_sim))
    top_10_cutoff = sorted_sim[top_10_index]

    # dashed vertical line at the cutoff, labelled with the rounded value;
    # the label's y position (500) is a fixed value in data coordinates
    plt.axvline(top_10_cutoff, color = 'k', linestyle='dashed', linewidth=2)
    plt.text(top_10_cutoff+0.01, 500, str(round(top_10_cutoff, 3)),  va='bottom', ha='left')

    plt.show()

    print('mean: ' + str(np.nanmean(sim)))
    print('Top 10 percent similarities: ' + str(top_10_cutoff))
    

In [None]:
# Load the APOE, hold-out and CANTAB files, reduce the CANTAB data to visit 1,
# keep only the relevant participants and fill a few missing values with fixed numbers.

apoe_df = pd.read_csv('Genotype_formatted_results.csv') #reads the APOE result file 
APOE_ids = list(apoe_df['Participant'])

holdout_df = pd.read_excel('Hold_out.xlsx') # reads the hold out file with participant IDs not to be included in analyses
holdout = list(holdout_df['participant_id'])

df = read_cantab_file("CANTAB-ALL.csv", 'ISO-8859-1') # reads the cantab file

# process the cantab df ('all' is not a site number, so no site filter is applied):
df_1, cols_var, cols_var_ids = preprocess_df(df, 1, 'all') # df, visit, site


# filter the ids common in all datasets:
#cantab_ids = in_both_dfs(df_1, df_3) #finds the participant IDs in both visit dfs
cantab_ids = list(df_1['Participant ID'])
cantab_apoe_ids = list(set(cantab_ids).intersection(set(APOE_ids))) #finds common IDs with APOE file 
print(len(cantab_apoe_ids))

# The participants actually used come from matched_IDs (defined near the top of the
# notebook) minus the hold-out IDs; cantab_apoe_ids is only printed above.
relevant_ids = [id for id in matched_IDs if id not in holdout]
#relevant_ids = [id for id in cantab_apoe_ids if id not in holdout]

df_1 = df_1[df_1['Participant ID'].isin(relevant_ids)]
df_1 = df_1.reset_index(drop=True)


apoe_df = apoe_df[apoe_df['Participant'].isin(relevant_ids)]
apoe_df = apoe_df.reset_index(drop=True)

# setting some nan-values to valid numbers:

# DMSMDL0: missing values are set to 13000 (original note: had one na)

missing_DMSMDL0 = df_1['DMSMDL0'].isnull()
missing_ind_DMSMDL0 = missing_DMSMDL0[missing_DMSMDL0].index
df_1.loc[missing_ind_DMSMDL0, 'DMSMDL0'] = 13000

# DMSMDL4: missing values are set to 20000 (original note: had one na)

missing_DMSMDL4 = df_1['DMSMDL4'].isnull()
missing_ind_DMSMDL4 = missing_DMSMDL4[missing_DMSMDL4].index
df_1.loc[missing_ind_DMSMDL4, 'DMSMDL4'] = 20000

# RVPA: missing values are set to 0.5 (original note: had 8 na)
# NOTE(review): the comment here previously said 0.6 while the code writes 0.5 — confirm the intended value.
missing_RVPA = df_1['RVPA'].isnull()
missing_ind_RVPA = missing_RVPA[missing_RVPA].index
df_1.loc[missing_ind_RVPA, 'RVPA'] = 0.5

# RVPMDL: missing values are set to 1300 (original note: had 8 na)
# NOTE(review): the comment here previously said 1500 while the code writes 1300 — confirm the intended value.
missing_RVPMDL = df_1['RVPMDL'].isnull()
missing_ind_RVPMDL = missing_RVPMDL[missing_RVPMDL].index
df_1.loc[missing_ind_RVPMDL, 'RVPMDL'] = 1300


# number of participants selected for the analysis
len(relevant_ids)

In [None]:
# Number of missing values per column.
# NOTE(review): df_removed1 is not assigned in any visible cell — presumably left over
# from an earlier version of the notebook; confirm, or point this at the frame that
# should be checked (e.g. df_1).
df_removed1.isna().sum()

### Implementation of the PSN:
- Scale the data
- Calculate the similarities and build the adjacency matrix
- Implement the PSN


In [None]:
# Passed as `s` to create_PSN, which forwards it to nx.spring_layout(seed=s),
# so every network below is drawn with the same layout seed.
s = 100 # seed for spring layout algorithm

In [None]:
# Min-max scaled version of the visit-1 data; only the columns in cols_var are scaled.
# (scale_df works on its own copy of the input, so the extra .copy() is only defensive.)
df_1_sc = scale_df(df_1.copy(), cols_var, 'minmax')
#df_1_sc = scale_df(df_1.copy(), cols_var, 'standard') 

In [None]:
# Standard-scaled version of the visit-1 data; only the columns in cols_var are scaled.
df_1_stsc = scale_df(df_1.copy(), cols_var, 'standard')

#### Euclidean similarity:

In [None]:
# Euclidean similarity matrix for the min-max scaled data (not written to disk).
adjM_V1_eucl = makeAdjMatrix(df_1_sc[cols_var], 'Eucl', 'V1', save_adjM=False)
#adjM_V3_eucl = makeAdjMatrix(df_3_sc[cols_var], 'Eucl', 'V3', save_adjM=True) 

In [None]:
# Euclidean similarity matrix for the standard-scaled data.
# Written to 'adjM_V1_Eucl.csv'; the file name does not show which scaling was used.
adjM_V1_eucl_st = makeAdjMatrix(df_1_stsc[cols_var], 'Eucl', 'V1', save_adjM=True)

In [None]:
# Network for the min-max scaled Euclidean similarities.
# The participant count in the title is taken from the matrix itself so it cannot
# drift from the data (the hard-coded counts in this notebook disagreed: 342 vs 346).
PSN_euclV1, similaritiesV1 = create_PSN(adjM_V1_eucl, 0.465, s, f'CANTAB: PSN of scores (v1) for all (n={len(adjM_V1_eucl)}), 45 variables, eucl.dist')

In [None]:
# Histogram of the pairwise similarities returned by create_PSN for the min-max scaled Euclidean network.
plot_similarities(similaritiesV1, 'Distribution of euclidean similarities for 45 variables')

In [None]:
# Network for the standard-scaled Euclidean similarities. The matrix is
# adjM_V1_eucl_st (computed a few cells up); the previously used name adjM_TEST
# is not defined anywhere in the visible notebook.
# The participant count in the title is taken from the matrix itself.
# NOTE: this reuses the name PSN_euclV1 and therefore replaces the min-max scaled network.
PSN_euclV1, similaritiesV1_st = create_PSN(adjM_V1_eucl_st, 0.141, s, f'CANTAB: PSN of scores (v1) for all (n={len(adjM_V1_eucl_st)}), 45 variables, eucl.dist, standardscaled')

In [None]:
# Histogram of the pairwise similarities returned by create_PSN for the standard-scaled Euclidean network
# (the title does not mention the scaling).
plot_similarities(similaritiesV1_st, 'Distribution of euclidean similarities for 45 variables')

#### Cosine similarity:

##### V1:

In [None]:
# Cosine similarity matrix for the standard-scaled data (df_1_stsc). The min-max
# frame df_1_sc was passed here before, which made this matrix identical to
# adjM_V1_cos despite the 'st.scaled' label.
adjM_V1_cos_st = makeAdjMatrix(df_1_stsc[cols_var], 'Cos', 'V1 (st.scaled)', save_adjM=True)

In [None]:
# Cosine similarity matrix for the min-max scaled data; written to 'adjM_V1_Cos.csv'.
adjM_V1_cos = makeAdjMatrix(df_1_sc[cols_var], 'Cos', 'V1', save_adjM=True)
#adjM_V3_cos = makeAdjMatrix(df_3_sc[cols_var], 'Cos', 'V3', save_adjM=True)

In [None]:
# Network for the min-max scaled cosine similarities.
# The participant count in the title is taken from the matrix itself so it cannot
# drift from the data (the hard-coded counts in this notebook disagreed: 342 vs 346).
PSN_cosV1, sim_cosV1 = create_PSN(adjM_V1_cos, 0.944, s, f'CANTAB: PSN of scores (v1) for all (n={len(adjM_V1_cos)}), 45 variables, cos.sim')

In [None]:
# Histogram of the pairwise similarities returned by create_PSN for the min-max scaled cosine network.
plot_similarities(sim_cosV1, 'Distribution of cosine similarities for 45 variables')

Standard-scaled V1 df:

In [None]:
# Number of rows (participant sessions) in the visit-1 data after filtering.
len(df_1)

In [None]:
# Network built from adjM_V1_cos_st (the cosine matrix labelled as standard scaled).
# The participant count in the title is taken from the matrix itself so it cannot
# drift from the data (the hard-coded counts in this notebook disagreed: 342 vs 346).
PSN_cos_st, sim_cos_st = create_PSN(adjM_V1_cos_st, 0.416, s, f'Cantab: PSN for V1 for all (n={len(adjM_V1_cos_st)}), 45 var, cos.sim, standard scaled')

In [None]:
# Histogram of the pairwise similarities returned by create_PSN for the cosine network labelled as standard scaled.
plot_similarities(sim_cos_st, 'Distribution of cosine similarities for 45 variables, st scaled')

#### Pearson correlation:

##### V1:

In [None]:
# Pearson-based similarity matrix ((correlation + 1) / 2) for the min-max scaled data;
# written to 'adjM_V1_Pears.csv'.
adjM_V1_pears = makeAdjMatrix(df_1_sc[cols_var], 'Pears', 'V1', save_adjM=True)
#adjM_V3_pears = makeAdjMatrix(df_3_sc[cols_var], 'Pears', 'V3', save_adjM=True)


In [None]:
# Network for the Pearson-based similarities.
# The participant count in the title is taken from the matrix itself so it cannot
# drift from the data (the hard-coded counts in this notebook disagreed: 342 vs 346).
PSN_pearsV1, sim_pearsV1 = create_PSN(adjM_V1_pears, 0.933, s, f'CANTAB: PSN of scores (v1) for all (n={len(adjM_V1_pears)}), 45 variables, pears.sim')

In [None]:
# Histogram of the pairwise similarities returned by create_PSN for the Pearson-based network.
plot_similarities(sim_pearsV1, 'Distribution of pearson corr. similarities for 45 variables')