In [None]:
import pandas as pd 
import numpy as np 
import networkx as nx 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import altair as alt
import nx_altair as nxa
#import snf
sns.set_style("darkgrid")


#### Functions:

In [None]:
def read_apoe_file(file, site):
    '''
    Read the APOE genotype CSV and optionally keep a single site's rows.

    file: path (or buffer) handed to pandas.read_csv; the table must have a
          'Participant' column.
    site: when it is one of 1, 2, 3 or 4, only the rows whose participant ID
          starts with that digit are kept; any other value (e.g. 'all')
          leaves the table unfiltered.

    returns the (possibly filtered) dataframe and the list of its
    participant IDs, in row order
    '''
    sites = [1, 2, 3, 4]

    df = pd.read_csv(file)

    if site in sites:
        # Build a per-row mask. The previous code compared only the first
        # row's ID with `site`, which yields a single bool rather than a
        # row selector.
        # NOTE(review): assumes the site is encoded as the leading character
        # of the participant ID — confirm against the ID scheme.
        leading_char = df['Participant'].astype(str).str[0]
        df = df.loc[leading_char == str(int(site))]

    participants = list(df['Participant'])

    return df, participants

In [None]:
def sort_df_IDs(df, ID_list):
    '''
    Keep the rows whose 'Participant' value is in ID_list and order them
    the way the IDs appear in ID_list.

    returns a new dataframe with a fresh 0..n-1 index; df is not modified
    '''
    # Position of each wanted ID inside ID_list (a repeated ID keeps its
    # last position).
    rank_of = {}
    for position, participant in enumerate(ID_list):
        rank_of[participant] = position

    is_wanted = df['Participant'].isin(ID_list)
    wanted_rows = df[is_wanted]

    ordered_rows = wanted_rows.sort_values(
        by=['Participant'],
        key=lambda column: column.map(rank_of),
    )
    return ordered_rows.reset_index(drop=True)


In [None]:
def preprocess_apoe_df(df, ID_list):
    '''
    Restrict the genotype table to ID_list and turn every genotype string
    into a set of its two alleles.

    df: dataframe with 'Participant' and 'APOE allele combination' columns
    ID_list: participant IDs to keep, in the order the rows should have

    returns (df_sets, node_list):
      df_sets   - the reduced table whose 'APOE allele combination' column
                  holds sets built from characters [0:2] and [2:4] of the
                  genotype string (e.g. 'E3E4' -> {'E3', 'E4'})
      node_list - the original genotype strings, in the same row order
    '''
    genotype_col = 'APOE allele combination'

    # Sort the df on the ID list, dropping IDs excluded from further analyses.
    df_reduced = sort_df_IDs(df.copy(), ID_list)

    node_list = df_reduced[genotype_col].tolist()

    # Replace the whole column in one assignment. Writing cell by cell with
    # df[col][row] = ... is chained assignment: it warns and does not update
    # the frame when pandas Copy-on-Write is active.
    df_sets = df_reduced.copy()
    df_sets[genotype_col] = df_reduced[genotype_col].map(
        lambda gtype: {gtype[:2], gtype[2:4]}
    )

    return df_sets, node_list

In [None]:
def makeAdjMatrix_apoe(df, sim_metric, save_adjM):
    '''
    Creates a symmetric adjacency (similarity) matrix from the genotype data.

    df: pandas object with one entry per participant; each df.iloc[i] is
        passed to jaccard_sim, so entries are expected to be allele sets
    sim_metric: only 'Jacc' is implemented; for any other value the identity
        matrix is returned unchanged
    save_adjM: when truthy, the matrix is written to
        'adjM_APOE_<sim_metric>.csv' in the working directory

    returns the n x n adjacency matrix (1.0 on the diagonal, similarities
    rounded to 3 decimals elsewhere)
    '''
    n = df.shape[0]  # number of participants
    adjM = np.eye(n)  # identity: each participant is fully similar to itself

    if sim_metric == 'Jacc':
        for i in range(n):
            first = df.iloc[i]
            for j in range(i + 1, n):
                second = df.iloc[j]
                similarity = round(jaccard_sim(first, second), 3)
                # Similarity is symmetric, so mirror the value; filling only
                # the upper triangle left the matrix asymmetric.
                adjM[i, j] = similarity
                adjM[j, i] = similarity

    if save_adjM:
        np.savetxt(f'adjM_APOE_{sim_metric}.csv', X=adjM, delimiter=',')

    return adjM
    

In [None]:
def create_PSN_apoe(adj, nodelist, phi, s, title, savefig, fig_path='psn_apoe.png'):
    '''
    Builds the similarity network from an adjacency matrix and draws it
    with a spring layout.

    adj: square numpy array of pairwise similarities; only the entries
         above the diagonal are read
    nodelist: genotype label per node, in matrix row order (node i gets
         nodelist[i]); every label must be a key of the colour table below
    phi: cutoff; an edge i-j is added only when adj[i, j] > phi
    s: seed for the spring layout
    title: figure title; ', phi= <phi>' is appended
    savefig: when truthy, the figure is saved to fig_path
    fig_path: output file for the figure (defaults to 'psn_apoe.png')

    returns the network and the list of all upper-triangle similarities
    '''
    node_cols = {'E2E2': '#177245', 'E2E3': '#6aa84f', 'E3E3': '#0b5394',
                 'E2E4': '#f2a85a', 'E4E4': '#cc0000', 'E3E4': '#800080'}

    # One node per matrix row; there is no need to build a full weighted
    # graph from the matrix just to enumerate them.
    n = adj.shape[0]
    PSN = nx.Graph()
    PSN.add_nodes_from(range(n))

    similarities = []
    for i in range(n):
        for j in range(i + 1, n):
            sim = adj[i, j]
            similarities.append(sim)
            if sim > phi:
                PSN.add_edge(i, j, weight=sim)

    # Attach the genotype to every node and resolve the colours before any
    # figure is opened, so a bad label fails early.
    nx.set_node_attributes(PSN, dict(enumerate(nodelist)), name='gtype')
    drawn_nodes = list(PSN.nodes())
    node_colors = [node_cols[PSN.nodes[node]['gtype']] for node in drawn_nodes]

    plt.figure(figsize=(20, 15))
    plt.title(title + ', phi= ' + str(phi))

    pos = nx.spring_layout(PSN, seed=s, iterations=120)

    # Edges are drawn almost transparent so dense regions do not hide nodes.
    nx.draw_networkx_edges(PSN, pos=pos, width=1, alpha=1/70)
    # A single call draws every node instead of one call per node.
    nx.draw_networkx_nodes(PSN, pos=pos, nodelist=drawn_nodes, node_size=130,
                           node_color=node_colors)

    plt.axis('off')  # Turn off axis
    if savefig:
        plt.savefig(fig_path)  # saved before show(), which may clear the figure
    plt.show()

    return PSN, similarities



In [None]:
def plot_similarities_apoe(sim, title):
    '''
    Shows a 4-bin histogram of the similarity values in sim on a new
    15 x 10 figure titled with title.
    '''
    ax = plt.figure(figsize=(15, 10)).gca()
    ax.set_title(title)
    # The x label is set before plotting, as the pyplot version did.
    ax.set_xlabel('Similarities')
    sns.histplot(sim, bins=4, ax=ax)

    plt.show()

In [None]:
# Participant IDs to keep; the pre-processing cell passes this list to
# preprocess_apoe_df, which filters and orders the genotype table by it.
# NOTE(review): the list is empty here, so every row would be filtered out —
# presumably the real IDs are filled in before running; confirm.
matched_IDs = [] #participant IDs

In [None]:
# Number of participant IDs selected for the analysis.
len(matched_IDs)

In [None]:
def jaccard_sim(set_a,set_b):
    '''
    Jaccard similarity of two participants' allele sets:
    |A intersect B| / |A union B|.

    returns 0 when both sets are empty (avoids dividing by zero)
    '''
    shared = set_a.intersection(set_b)
    combined = set_a.union(set_b)

    if len(combined) == 0:
        return 0

    return len(shared) / len(combined)

In [None]:
# Worked example: the two genotypes share one of three distinct alleles.
a = {'E4', 'E3'}
b = {'E3', 'E2'}
jacc = jaccard_sim(a, b)
jacc

#### Pre-processing:

In [None]:
# Load the genotype table; 'all' is not one of the site numbers, so no site
# filter is applied.
apoe_df, apoe_ids = read_apoe_file('Genotype_formatted_results.csv', 'all')
# Keep only the matched participants and convert genotypes to allele sets;
# node_list keeps the original genotype strings in the same row order.
# Note that apoe_df is rebound here from the raw to the processed table.
apoe_df, node_list = preprocess_apoe_df(apoe_df, matched_IDs)

apoe_df
    

#### PSN implementation:

In [None]:
# Pairwise Jaccard similarity between the participants' allele sets; the
# third argument (True) also writes the matrix to 'adjM_APOE_Jacc.csv'.
AdjM_Apoe = makeAdjMatrix_apoe(apoe_df['APOE allele combination'], 'Jacc', True)
AdjM_Apoe.shape

In [None]:
# Cutoff phi = 0.1 and layout seed 100. The participant count in the title is
# taken from the data instead of being hard-coded, so it cannot go stale.
psn, sims = create_PSN_apoe(AdjM_Apoe, node_list, 0.1, 100, f'APOE PSN, n={len(node_list)}', savefig=True)

In [None]:
# Histogram of all pairwise similarities returned by create_PSN_apoe.
plot_similarities_apoe(sims, 'apoe')

In [None]:
# Colour per APOE genotype (the same mapping create_PSN_apoe uses for nodes).
node_cols= {'E2E2':'#177245', 'E2E3':'#6aa84f', 'E3E3':'#0b5394', 'E2E4':'#f2a85a', 'E4E4':'#cc0000', 'E3E4':'#800080'}
# Per-participant colour list; the count plot below uses node_cols directly.
# NOTE(review): palette looks unused in this cell — confirm before removing.
palette = [node_cols[gtype] for gtype in node_list]
plt.figure(figsize=(8,8))
sns.set(font_scale=1.2)
sns.countplot(x=node_list, palette=node_cols)
# NOTE(review): these two calls run after the bars are drawn, so they
# probably only affect later figures — confirm the intended order.
sns.set_context("notebook")
sns.set_style('white')
plt.ylabel('Count')
# The sample size comes from the data so the title always matches the plot.
plt.title(f'Distribution of APOE genotypes (n={len(node_list)})')
plt.grid(False)
plt.show()