In [None]:
import pandas as pd 
import numpy as np 
import networkx as nx 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import altair as alt
import nx_altair as nxa
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#import snf
#from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform
sns.set_style("darkgrid")


### Functions:

In [None]:
def sort_df_IDs(df_, ID_list):
    '''
    Keeps only the rows whose 'participant_id' is in ID_list and orders them like ID_list.

    df_: dataframe with a 'participant_id' column (left unmodified)
    ID_list: participant IDs in the wanted row order; if an ID appears more
             than once, its last position in the list is used for ordering

    returns a new dataframe with the matching rows, sorted by the position of
    their ID in ID_list, with a fresh 0..n-1 index. The result is empty (same
    columns) when ID_list is empty or nothing matches.
    '''

    # Build the ID -> position lookup once. The filter + sort is a single pass;
    # it does not depend on any individual ID, so no loop over ID_list is needed.
    position = {pid: pos for pos, pid in enumerate(ID_list)}

    matched = df_[df_['participant_id'].isin(ID_list)]
    if matched.empty:
        # Nothing to order (also covers an empty ID_list).
        return matched.reset_index(drop=True)

    sorted_df = matched.sort_values(by=['participant_id'], key=lambda ids: ids.map(position))

    return sorted_df.reset_index(drop=True)


In [None]:
def remove_outliers(df_, nr_std, col='ptau217'):
    '''
    Splits a dataframe into rows within and outside mean +/- nr_std * std of one column.

    df_: dataframe with a 'participant_id' column and the value column `col`
    nr_std: number of standard deviations defining the accepted band
    col: name of the value column to threshold (defaults to 'ptau217')

    Prints the number of outlier rows and the two thresholds.

    returns (filtered_df, outliers_dict):
      filtered_df   - rows strictly inside the band, index reset to 0..n-1
      outliers_dict - {participant_id: value} for rows on or outside the band

    Rows whose value is NaN satisfy neither comparison, so they appear in
    neither filtered_df nor outliers_dict.
    '''

    values = df_[col]
    mean_conc = np.mean(values)
    std = np.std(values)

    threshold_upper = mean_conc + nr_std * std #defining thresholds
    threshold_lower = mean_conc - nr_std * std

    # Two explicit masks (rather than one mask and its negation) so that NaN
    # values are excluded from both outputs.
    inside = (values > threshold_lower) & (values < threshold_upper)
    outside = (values <= threshold_lower) | (values >= threshold_upper)

    outliers = df_[outside]
    print(len(outliers))
    outliers_dict = dict(zip(outliers['participant_id'], outliers[col]))

    # Boolean indexing already yields a new frame, so the input is not modified.
    filtered_df = df_[inside].reset_index(drop=True)

    print(f'lower: {threshold_lower}, upper: {threshold_upper}')

    return filtered_df, outliers_dict

In [None]:
def scale_df(df_, cols, sc):
    '''
    Takes in a dataframe (can contain participant and session id), a list of column names to scale, and the name of the scaler
    scales the given columns with standard scaling ('standard') or min-max scaling ('minmax')
    returns a copy of the dataframe in which only the given columns are replaced by their scaled values;
    all other columns and the row index are kept as they are

    raises ValueError if sc is neither 'standard' nor 'minmax'
    '''

    if sc == 'standard':
        scaler = StandardScaler()
    elif sc == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError(f"unknown scaler {sc!r}; expected 'standard' or 'minmax'")

    df = df_.copy()

    scaled = scaler.fit_transform(df[cols])
    # Give the scaled frame the same index as df. Column assignment aligns on
    # the index, so a default 0..n-1 index would write NaN / misplaced values
    # whenever df's index is not 0..n-1 (e.g. after dropna without a reset).
    df[cols] = pd.DataFrame(scaled, columns=cols, index=df.index)

    return df

In [None]:
def makeAdjMatrix_pTau(df, sim_metric, save_adjM): #'Eucl', 'Cos', 'Pears'
    '''
    Builds an upper-triangular similarity (adjacency) matrix between the rows of df.

    df: dataframe with one row per observation and only numeric feature columns
    sim_metric: 'Eucl'  -> 1 / (1 + euclidean distance)
                'Cos'   -> cosine similarity
                'Pears' -> pearson correlation between rows
    save_adjM: if truthy, the matrix is also written to 'adjM_pTau_<sim_metric>.csv'

    returns an (n, n) numpy array, n = len(df). Entries below the diagonal are 0.
    For 'Eucl' and 'Cos' the diagonal is 1; for 'Pears' the diagonal is whatever
    the correlation of a row with itself evaluates to.

    raises ValueError for any other sim_metric
    '''

    if sim_metric not in ('Eucl', 'Cos', 'Pears'):
        # Fail loudly instead of handing back a bare identity matrix.
        raise ValueError(f"unknown sim_metric {sim_metric!r}; expected 'Eucl', 'Cos' or 'Pears'")

    n = len(df) # find number of rows (number of participant sessions)
    upper = np.triu_indices(n, k=1) # index pairs (i, j) with i < j, row-major

    if sim_metric == 'Eucl':
        # Condensed pairwise distances -> square matrix -> similarity in (0, 1]
        sq_distances = squareform(pdist(df.values, metric='euclidean'))
        adj_matrix = np.eye(n)
        adj_matrix[upper] = 1 / (1 + sq_distances[upper])

    elif sim_metric == 'Cos':
        similarities = cosine_similarity(df.values)
        adj_matrix = np.eye(n)
        adj_matrix[upper] = similarities[upper]

    else: # 'Pears'
        # corr() correlates columns, so transpose to put the rows of df along the columns
        sim_matrix_p = df.transpose().corr()
        adj_matrix = np.triu(sim_matrix_p.values) # upper triangle including the diagonal

    if save_adjM:
        np.savetxt(f'adjM_pTau_{sim_metric}.csv', X=adj_matrix, delimiter=',')

    return adj_matrix

In [None]:
def scale_adjMatrix(adjM, sc):
    '''
    Rescales the entries above the diagonal of an adjacency matrix.

    adjM: square 2D array; only the strict upper triangle (i < j) is rescaled
    sc: 'standard' for standard scaling or 'minmax' for min-max scaling

    All upper-triangle entries are scaled together as one feature. The diagonal
    and the lower triangle are copied through unchanged.

    returns a new float array of the same shape; adjM itself is not modified
    raises ValueError if sc is neither 'standard' nor 'minmax'
    '''

    if sc not in ('standard', 'minmax'):
        raise ValueError(f"unknown scaler {sc!r}; expected 'standard' or 'minmax'")

    # Float copy, so scaled values are not truncated when adjM holds integers.
    adjM_sc = np.array(adjM, dtype=float)

    mask = np.triu(np.ones_like(adjM_sc, dtype=bool), k=1) # True only above the diagonal
    if not mask.any():
        # 0x0 or 1x1 matrix: there is nothing to scale and the scaler would
        # reject an empty sample.
        return adjM_sc

    scaler = StandardScaler() if sc == 'standard' else MinMaxScaler()
    adjM_sc[mask] = scaler.fit_transform(adjM_sc[mask].reshape(-1, 1)).ravel()

    return adjM_sc

In [None]:
def create_PSN(adj, phi, s, title, save_path='ptau_psn.svg'):
    '''
    Takes in adjacency matrix adj, cutoff phi, seed s and plot title
    Creates and draws network graph with spring layout

    adj: square 2D array; only the entries above the diagonal (i < j) are read
    phi: an edge i-j with weight adj[i, j] is added only when adj[i, j] > phi
    s: seed passed to the spring layout
    title: plot title (the value of phi is appended)
    save_path: file the figure is saved to; pass None to skip saving

    returns network and similarity list (all entries above the diagonal,
    row by row, regardless of phi)

    raises ValueError if adj is not a square 2D array
    '''

    adj = np.asarray(adj)
    if adj.ndim != 2 or adj.shape[0] != adj.shape[1]:
        raise ValueError(f'adj must be a square 2D array, got shape {adj.shape}')

    # One node per row of adj; no intermediate graph is needed just to count them.
    n = adj.shape[0]
    PSN = nx.Graph()
    PSN.add_nodes_from(range(n))

    similarities = []

    for i in range(n):
        for j in range(i+1,n):
            sim = adj[i,j]
            similarities.append(sim)
            if sim > phi:
                PSN.add_edge(i, j, weight = sim)


    plt.figure(figsize=(12,12))
    plt.title(title + ', phi= ' + str(phi))
    pos = nx.spring_layout(PSN, seed = s, k = 1/2)
    nx.draw_networkx_nodes(PSN, pos=pos, node_size=80, alpha=0.7, node_color='#2d6b87')
    nx.draw_networkx_edges(PSN, pos=pos, width=2, alpha=1/10)

    plt.grid(False)
    plt.gca().set_facecolor('white')

    if save_path is not None:
        plt.savefig(save_path)

    return PSN, similarities

In [None]:
def plot_similarities(sim, title):
    '''
    Plots a histogram of similarity values with two dashed vertical lines:
    a thin one at the mean and a thicker one at the top-10-percent cutoff.

    sim: list/array of similarity values; NaN entries are ignored
    title: plot title

    The cutoff is the value at position int(0.1 * n) of the values sorted in
    descending order. Prints the mean and the cutoff; returns nothing.

    raises ValueError if sim contains no non-NaN value
    '''

    # Drop NaNs once up front so the drawn mean line, the printed mean and the
    # cutoff are all computed from the same values.
    values = np.asarray(sim, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        raise ValueError('plot_similarities needs at least one non-NaN similarity')

    mean_sim = values.mean()
    sorted_sim = np.sort(values)[::-1] # descending
    top_10_index = int(0.1 *len(sorted_sim))
    top_10_cutoff = sorted_sim[top_10_index]

    plt.figure(figsize=(13,8))
    plt.title(title)
    plt.xlabel('Similarities')

    sns.set(font_scale=1.2)
    sns.histplot(values, bins=100)

    plt.axvline(mean_sim, color='k', linestyle='dashed', linewidth=1)
    plt.axvline(top_10_cutoff, color = 'k', linestyle='dashed', linewidth=2)

    plt.show()

    print('mean: ' + str(mean_sim))
    print('Top 10 percent similarities: ' + str(top_10_cutoff))

In [None]:
# Participant IDs to keep, in the wanted row order (passed to sort_df_IDs below).
# NOTE(review): the list is empty here, so it cannot match any participant —
# presumably the real IDs were stripped before sharing; TODO confirm and fill in.
matched_IDs = []

### Pre-processing:

In [None]:
# Columns used in this analysis: the participant identifier and the pTau217 value.
relevant_cols = ['participant_id', 'ptau217']

# Read the workbook sheet and keep only those two columns.
df = pd.read_excel(
    'scd_crf_apoe_ptau217_data_medsci_PTAU181.xlsx',
    sheet_name='Sheet2',
)
df_ptau = df.loc[:, relevant_cols]
df_ptau

In [None]:
plt.figure(figsize=(8,8))
#sns.boxplot(df_ptau['ptau217'])
# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# is the replacement documented by seaborn for the same picture.
sns.histplot(df_ptau['ptau217'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Keep only the participants listed in matched_IDs, ordered as in that list.
df_ptau = sort_df_IDs(df_ptau, matched_IDs)
df_ptau

In [None]:
# Count missing values per column before the incomplete rows are dropped.
df_ptau.isna().sum()

In [None]:
# Drop rows with missing values and renumber the index 0..n-1.
# reset_index returns a new frame, so its result has to be assigned — calling
# it on its own line leaves the gaps from dropna in the index.
df_ptau = df_ptau.dropna().reset_index(drop=True)
df_ptau

#### filtering outliers:
One participant ID (ptau217 concentration 78.10975…) was removed from the `matched_IDs` list.

In [None]:
# Split df_ptau into rows within 5 standard deviations of the mean ptau217
# (filtered_df) and a {participant_id: value} dict of the rest (outliers).
# NOTE(review): in the cells visible here filtered_df is only inspected via
# .shape; the adjacency matrix and plots use df_ptau — confirm this is intended.
filtered_df, outliers = remove_outliers(df_ptau, 5)

### PSN implementation

In [None]:
# Min-max scale every column after 'participant_id' (here: 'ptau217').
df_ptau_sc = scale_df(
    df_=df_ptau[relevant_cols].copy(),
    cols=relevant_cols[1:],
    sc='minmax',
)

In [None]:
# Display the scaled frame returned by scale_df.
df_ptau_sc

In [None]:
# Pairwise similarity 1 / (1 + euclidean distance) on the single 'ptau217' column; not saved to disk.
# NOTE(review): this uses df_ptau, not the min-max scaled df_ptau_sc nor the
# outlier-filtered filtered_df — confirm which frame is intended.
adjMatrix = makeAdjMatrix_pTau(df_ptau[['ptau217']], 'Eucl', False) # df, sim_metric, save_adjM

In [None]:
# Shape of the adjacency matrix returned by makeAdjMatrix_pTau.
adjMatrix.shape

In [None]:
# Min-max scale the above-diagonal similarities.
# NOTE(review): the create_PSN call in the cells visible here is given
# adjMatrix, not adjMatrix_sc — confirm whether the scaled matrix is still needed.
adjMatrix_sc = scale_adjMatrix(adjMatrix, 'minmax')

In [None]:
# Build and draw the network from the unscaled adjacency matrix: cutoff phi = 0.719, layout seed = 100.
# NOTE(review): 0.0036759127940981127 in the trailing comment is presumably an
# earlier phi candidate — TODO confirm or remove.
tau_PSN, tau_sims = create_PSN(adjMatrix, 0.719, 100, 'ptau, nonscaled') # args: (adj, phi, s, title); 0.0036759127940981127

In [None]:
# Histogram of the above-diagonal similarities returned by create_PSN,
# with dashed lines at the mean and at the top-10-percent cutoff.
plot_similarities(tau_sims, 'similarities tau')

### Plotting of variables to find correlation:


In [None]:
n = len(df_ptau)
# displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an extra blank figure and does
# not size the plot. Size it through height/aspect instead (8 in tall, 10 in wide).
sns.displot(df_ptau['ptau217'], kde=True, height=8, aspect=1.25)
plt.title(f'pTau concentration distribution (n= {n})')
plt.xlabel('pTau concentration')
# NOTE(review): the file name says "filtered" but the data plotted is df_ptau,
# not filtered_df — confirm which frame this figure should show.
plt.savefig('ptau217_dist_filtered.svg')
plt.show()

In [None]:
# Shape of the outlier-filtered frame returned by remove_outliers.
filtered_df.shape

In [None]:
plt.figure(figsize=(10,8))
# sns.distplot is deprecated; a density-normalised histplot with a KDE overlay
# is the replacement documented by seaborn for the same picture.
sns.histplot(df_ptau['ptau217'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('pTau concentration distribution')
#plt.savefig('ptau217_dist_filtered.svg')
plt.show()