In [None]:
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform
sns.set_style("white")
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway

## Functions:

## Filter data:

In [None]:
apoe_df = pd.read_csv('data_dfs/apoe_df.csv')

In [None]:
cantab_df = pd.read_csv('data_dfs/cantab_df.csv')

In [None]:
clinical_df = pd.read_csv('data_dfs/clinical_df.csv')

In [None]:
ptau_df = pd.read_csv('data_dfs/ptau_df.csv')

### APOE:

In [None]:
# Keep only the participant id and the genotype. .copy() gives an independent
# frame, so adding a column below does not write into a slice of the loaded data.
apoe_df = apoe_df[['Participant', 'APOE allele combination']].copy()

# risk = 1 when the genotype string contains an E4 allele, otherwise 0.
# Vectorised replacement for the row loop that used chained assignment
# (apoe_df['risk'][i] = ...), which triggers SettingWithCopyWarning and does not
# write through when pandas copy-on-write is active.
# Missing genotypes are coded 0 (na=False).
apoe_df['risk'] = (
    apoe_df['APOE allele combination']
    .str.contains('E4', regex=False, na=False)
    .astype(int)
)


In [None]:
apoe_df = apoe_df[['risk']]

### Cantab:

In [None]:
var_cantab = ['Participant ID',
'DMSPC', 
'PALFAMS28', 
'PRMPCD', 
'RVPA', 
'SWMTE468']

In [None]:
cantab_df = cantab_df[var_cantab[1:]]

In [None]:
cantab_df

### Clinical:

In [None]:
mmse_to_moca = [0,0,0,0,0,0,0,1,1,2,3,4,4,5,6,7,8,9,10,11,12,13,14,16,17,19,20,22,23,26,28] # conversion scores from MoCA review paper: Fasnacht et. al, 2023


In [None]:
def convert_mmse_to_moca(df_, conversion=None):
    """Return a copy of ``df_`` with MMSE scores replaced by MoCA equivalents.

    Rows whose ``cog_scale`` is ``'mmse'`` get their ``cog_scale_score``
    replaced by ``conversion[score]``. Rows with a missing score, or a score
    outside the table, are left unchanged. The input frame is not modified.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain the columns ``cog_scale`` and ``cog_scale_score``.
    conversion : sequence, optional
        Lookup table indexed by the integer MMSE score. Defaults to the
        notebook-level ``mmse_to_moca`` list.

    Returns
    -------
    pandas.DataFrame
        The converted copy; the caller has to keep the return value.
    """
    if conversion is None:
        conversion = mmse_to_moca

    df = df_.copy()

    # Iterate over index labels so the function also works when the frame
    # does not carry a default 0..n-1 index.
    for idx in df.index[df['cog_scale'] == 'mmse']:
        raw_score = df.at[idx, 'cog_scale_score']
        if pd.isna(raw_score):
            continue  # nothing to convert
        mmse = int(raw_score)
        # Lower bound guards against negative values silently indexing from
        # the end of the table.
        if 0 <= mmse < len(conversion):
            moca = conversion[mmse]
            # .at writes into the copy directly (no chained assignment).
            df.at[idx, 'cog_scale_score'] = moca
            print(f'mmse var: {mmse}, moca nå: {moca}')

    return df

In [None]:
# convert_mmse_to_moca works on a copy and returns it, so the result has to be
# assigned back; calling it without keeping the return value leaves clinical_df
# with the unconverted MMSE scores.
clinical_df = convert_mmse_to_moca(clinical_df)
# The scale label is no longer needed once the scores have been converted.
clinical_df = clinical_df.drop(columns=['cog_scale'])

In [None]:
# Combined comorbidity flag: 1 if any of the listed condition columns is
# positive for the participant, otherwise 0.
# Computed in one vectorised step instead of a row loop with chained assignment
# (clinical_df['comorb'][i] = ...). Missing values in a condition column are
# skipped by the row sum, so they no longer hide a positive flag in another column.
comorbidity_cols = ['heart', 'hypertension', 'vascular', 'liver', 'cpns', 'cancer']
clinical_df['comorb'] = (clinical_df[comorbidity_cols].sum(axis=1) > 0).astype(int)

In [None]:
clinical_var = [
'age', 
'sex', 
'education', 
'handedness',
'children_num',
'cdr_gs', 
'alc_scale_score', 
'bmi',
'dem_num_firstdeg', 
'physact_freq', 
'smoke', 
'dep_tot',
'comorb',
'cog_scale_score'
]

In [None]:
clinical_df = clinical_df[clinical_var]

In [None]:
clinical_df

### ptau:

In [None]:
ptau_df = ptau_df[[#'participant_id', 
'ptau217']]

In [None]:
ptau_df

## Plotting:

In [None]:
# Column-wise fusion of the four modality frames; rows are aligned on the index.
# NOTE(review): assumes all four frames list participants in the same row order — confirm.
test = pd.concat([apoe_df, cantab_df, clinical_df, ptau_df], axis=1)


def _to_numeric_if_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Same effect as pd.to_numeric(..., errors='ignore'), without the deprecated option.
test = test.apply(_to_numeric_if_possible)

### Communities found in all fused datasets :

In [None]:
# Row indices (nodes) assigned to community A.
commA = {
    0, 1, 3, 4, 7, 8, 9, 11, 15, 16,
    18, 19, 23, 24, 25, 26, 30, 31, 34, 35,
    37, 39, 41, 42, 44, 45, 47, 49, 50, 51,
    52, 53, 54, 55, 56, 58, 61, 62, 63, 64,
    65, 66, 70, 74, 75, 79, 80, 89, 91, 95,
    97, 99, 102, 104, 106, 107, 108, 110, 113, 114,
    115, 118, 120, 122, 123, 125, 126, 129, 132, 133,
    135, 136, 138, 139, 140, 141, 142, 143, 144, 148,
    149, 150, 152, 155, 160, 161, 162, 164, 166, 169,
    172, 173, 185, 187, 191, 192, 193, 197, 214, 217,
    218, 221, 224, 226, 227, 228, 239, 244, 248, 263,
    270, 271, 287, 288, 295, 303, 304, 315, 318, 322,
    324, 325, 326, 328, 335, 341,
}

In [None]:
# Row indices (nodes) assigned to community B.
commB = {
    2, 5, 10, 12, 13, 14, 17, 20, 21, 22,
    27, 28, 29, 33, 36, 38, 40, 43, 48, 57,
    59, 60, 67, 68, 69, 71, 73, 77, 78, 81,
    82, 83, 84, 85, 87, 88, 90, 92, 93, 96,
    98, 100, 101, 103, 105, 109, 111, 112, 116, 117,
    119, 121, 124, 127, 128, 130, 131, 134, 137, 145,
    147, 151, 153, 154, 156, 158, 163, 165, 167, 168,
    171, 182, 183, 190, 201, 203, 208, 209, 210, 211,
    215, 236, 237, 240, 243, 246, 251, 252, 256, 261,
    269, 273, 278, 280, 282, 294, 299, 300, 301, 302,
    305, 307, 308, 310, 313, 316, 320, 321, 327, 338,
}

In [None]:
# Row indices (nodes) assigned to community C.
commC = {
    6, 32, 46, 72, 76, 86, 94, 146, 157, 159,
    170, 174, 175, 176, 177, 178, 179, 180, 181, 184,
    186, 188, 189, 194, 195, 196, 198, 199, 200, 202,
    204, 205, 206, 207, 212, 213, 216, 219, 220, 222,
    223, 225, 229, 230, 231, 232, 233, 234, 235, 238,
    241, 242, 245, 247, 249, 250, 253, 254, 255, 257,
    258, 259, 260, 262, 264, 265, 266, 267, 268, 272,
    274, 275, 276, 277, 279, 281, 283, 284, 285, 286,
    289, 290, 291, 292, 293, 296, 297, 298, 306, 309,
    311, 312, 314, 317, 319, 323, 329, 330, 331, 332,
    333, 334, 336, 337, 339, 340,
}

In [None]:
# Map every row index (node) to its community label. Built as a lookup and
# assigned in one step instead of chained assignment (test['community'][node] = ...),
# which triggers SettingWithCopyWarning and does not write through under
# copy-on-write. Rows that belong to no community keep the placeholder 0.
node_to_community = {}
for label, nodes in (('A', commA), ('B', commB), ('C', commC)):
    node_to_community.update(dict.fromkeys(nodes, label))

test['community'] = [node_to_community.get(node, 0) for node in test.index]

test

## ANOVA, numeric variables:

In [None]:
variables = list(test.columns)[:-1]


In [None]:
var_numerical = [
 'DMSPC',
 'PALFAMS28',
 'PRMPCD',
 'RVPA',
 'SWMTE468',
 'age',
 'children_num',
 'alc_scale_score',
 'bmi',
 'dem_num_firstdeg',
 'ptau217'
]

In [None]:
var_categorical = [
 'education',
 'physact_freq',
 'smoke',
 'dep_tot',
]

In [None]:
var_binary = [
'risk',
'sex',
'cdr_gs',
'handedness',
'comorb',]

In [None]:
for_anova = var_numerical #+ var_binary

In [None]:


# One-way ANOVA of every numeric variable across the communities, followed by
# a bar plot of the resulting p-values.
df_num = test[for_anova + ['community']].copy()

F_stats = []
p_vals = []

for var in for_anova:
    # One sample per community. Missing values are dropped per variable,
    # because a NaN inside any group makes f_oneway return NaN.
    samples = [values.dropna() for _, values in df_num.groupby('community')[var]]
    F_stat, p_val = f_oneway(*samples)

    F_stats.append(F_stat)
    p_vals.append(p_val)

    print(f'{var}:     F-statistic: {F_stat},     p-val: {p_val}')

plt.figure(figsize=(10,10))

# Bar plot of the p-values (one bar per variable)
sns.barplot(x=for_anova, y=p_vals, color='cadetblue')


plt.ylabel('P-value', fontsize=18)
plt.xlabel('Variables', fontsize=18)
plt.title('ANOVA Results for numeric variables', fontsize=20)



plt.tick_params(axis='x', rotation=45, labelsize=15) # Rotate x-axis labels for better visibility
plt.tick_params(axis='y', labelsize=15)


# Zoom in on small p-values; bars above 0.02 are cut off at the top.
plt.ylim(0, 0.02)

# NOTE(review): 0.0025 is hard-coded here, while sign_level is computed further
# down as 0.05/(7*3) — confirm which threshold is intended.
plt.axhline(y=0.0025, color='grey', linestyle='--', linewidth=2)  # Mark significance level

plt.tight_layout()
#plt.savefig('anova_results_numeric.svg')

plt.show()



## Chi-sq categorical variables:

In [None]:
for_chi = var_categorical + var_binary

In [None]:
# Chi-squared test of independence between each categorical/binary variable
# and community membership, followed by a bar plot of the p-values.
chi_sq_results = {}
p_vals_chi = []

for var in for_chi:
    cont_table = pd.crosstab(test[var], test['community'])
    chi_sq, p_val, dof, ex_freq = chi2_contingency(cont_table)
    # Keep the full result per variable; the dict was declared but never filled.
    chi_sq_results[var] = {'chi_sq': chi_sq, 'p_val': p_val, 'dof': dof}
    p_vals_chi.append(p_val)
    print(f'variable: {var}, Chi-sq: {chi_sq}, p-value: {p_val}')


plt.figure(figsize=(10,10))

sns.barplot(x=for_chi, y=p_vals_chi, color='cadetblue')

plt.ylabel('P-value', fontsize=18)
plt.xlabel('Variables', fontsize=18)
plt.title('Chi-squared results for categorical/binary variables', fontsize=20)

plt.tick_params(axis='x', rotation=45, labelsize=15) # Rotate x-axis labels for better visibility
plt.tick_params(axis='y', labelsize=15)

# Zoom in on small p-values; bars above 0.02 are cut off at the top.
plt.ylim(0, 0.02)
plt.axhline(y=0.0025, color='grey', linestyle='--', linewidth=2)  # Mark significance level

plt.tight_layout()
plt.savefig('chi2_results_cat_bin.svg')

plt.show()


In [None]:
sign_level = round(0.05/(7*3), 5)
sign_level

## plotting significant variables:

In [None]:
sign_var = [
'risk',
'cdr_gs',
'age',
'PALFAMS28',
'education',
'physact_freq',
'ptau217',
'community'
]

In [None]:
cantab_df

In [None]:
# Community labels in order of first appearance, one "muted" colour for each.
communities = test['community'].unique()
palette = sns.color_palette("muted", len(communities))

# Lookup: community label -> colour.
color_map = dict(zip(communities, palette))


### Each variable plot:

#### numeric variables:
- box plot
- t-tests 

In [None]:
from matplotlib.collections import PathCollection


In [None]:
def plot_var_comm_box(communities, test_var):
    """Box plot with an overlaid strip plot of one variable per community.

    Reads the notebook-level DataFrame ``test`` and the colour list ``palette``,
    writes the figure to ``sign_boxplot_<test_var>.svg`` and shows it.

    Parameters
    ----------
    communities : iterable
        Community labels; their sorted order is used as the x-axis order.
    test_var : str
        Column of ``test`` drawn on the y-axis.
    """
    community_order = sorted(communities)

    plt.figure(figsize=(8, 6))
    edgecol='black'
    edgewidth= 2

    # Outline-only boxes (facecolor 'none') so the individual points stay visible.
    ax=sns.boxplot(data=test, x='community', y=test_var, palette='muted', order=community_order, showfliers=False, 
                boxprops={'linewidth': edgewidth, 'edgecolor': edgecol, 'alpha': 0.5, 'facecolor': 'none'},
                whiskerprops={'linewidth': edgewidth, 'color': edgecol,  'alpha': 0.5},
                capprops={'linewidth': edgewidth, 'color': edgecol, 'alpha': 0.5},
                medianprops={'linewidth': edgewidth, 'color': edgecol, 'alpha': 0.5},
                meanprops={'linewidth': edgewidth, 'color': edgecol})

    # Individual observations on top of the boxes.
    sns.stripplot(data=test, x='community', y=test_var, palette='muted', order=community_order, jitter=True, ax=ax, size= 10, edgecolor=palette, alpha= 0.3) 

    plt.xlabel('Community', fontsize=15)
    plt.xticks(fontsize=20)
    plt.ylabel(test_var, fontsize=15)
    plt.yticks(fontsize=13)
    plt.title(f'Community distribution over variable: {test_var}', fontsize=20)

    # Apply the layout before saving so the written SVG matches the figure
    # that is shown (it was previously saved before tight_layout ran).
    plt.tight_layout()
    plt.savefig(f'sign_boxplot_{test_var}.svg')
    plt.show()


In [None]:
sign_var_num =[
 #'risk',
 #'cdr_gs',
 'age',
 'PALFAMS28',
 #'education',
 #'physact_freq',
 'ptau217',
 #'community'
 ]
 

In [None]:
from scipy.stats import ttest_ind

def t_test_pair(df, var, comm1, comm2):
    """Print an independent two-sample t-test of ``var`` between two communities.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``community`` column and the column ``var``.
    var : str
        Numeric column to compare.
    comm1, comm2 : hashable
        The two community labels to compare.

    Missing values of ``var`` are dropped from both samples before testing;
    otherwise a single NaN turns the statistic and the p-value into NaN.
    The result is printed, nothing is returned.
    """
    values_1 = df.loc[df['community'] == comm1, var].dropna()
    values_2 = df.loc[df['community'] == comm2, var].dropna()

    t_stat, p_val = ttest_ind(values_1, values_2)

    print(f'{comm1} and {comm2}, variable: {var}, t-stat: {t_stat}, p-value: {p_val}')


In [None]:
# For every significant numeric variable: draw the box plot, then run the
# pairwise t-tests in the order A-B, B-C, A-C.
community_pairs = [('A', 'B'), ('B', 'C'), ('A', 'C')]

for var in sign_var_num:
    plot_var_comm_box(communities, var)
    for first, second in community_pairs:
        t_test_pair(test, var, first, second)
   

In [None]:
# Kernel density estimate of p-tau217 per community (one filled curve per community).
sns.displot(data=test, x='ptau217', hue='community', kind='kde', fill=True)

plt.xlabel('pg/mL', fontsize=13)
plt.xticks(fontsize=13)
plt.ylabel('Probability density', fontsize=13)
plt.yticks(fontsize=13)
# Title spelling fixed ("accross" -> "across"); it ends up in the saved figure.
plt.title('P-tau concentration distribution across communities', fontsize=15)
plt.savefig('kde_ptau_communities.svg')

In [None]:
# Kernel density estimate of age per community (one filled curve per community).
sns.displot(data=test, x='age', hue='community', kind='kde', fill=True)

plt.xlabel('age', fontsize=13)
plt.xticks(fontsize=13)
plt.ylabel('Probability density', fontsize=13)
plt.yticks(fontsize=13)
# Title spelling fixed ("accross" -> "across"); it ends up in the saved figure.
plt.title('Age distribution across communities', fontsize=15)
plt.savefig('kde_age_communities.svg')

In [None]:
# Kernel density estimate of PALFAMS28 per community (one filled curve per community).
sns.displot(data=test, x='PALFAMS28', hue='community', kind='kde', fill=True)

plt.xlabel('PALFAMS28', fontsize=13)
plt.xticks(fontsize=13)
plt.ylabel('Probability density', fontsize=13)
plt.yticks(fontsize=13)
# Title spelling fixed ("accross" -> "across"); it ends up in the saved figure.
plt.title('PALFAMS28 distribution across communities', fontsize=15)
plt.savefig('kde_PALFAMS28_communities.svg')

#### Categorical and binary variables:
 - Bar plots

 - Binary: proportion test

In [None]:
# palette_cdr and palette_apoe are not assigned anywhere in the notebook, so this
# cell raised NameError on a fresh kernel. They are defined here by palette name,
# which works for any number of hue levels.
# NOTE(review): stand-in colours — replace with the palettes originally intended.
palette_cdr = 'muted'
palette_apoe = 'pastel'

# Side-by-side count plots: CDR global score (left) and APOE risk (right) per community.
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(13,7))


sns.countplot(data=test, x='community', hue='cdr_gs', dodge=True, palette= palette_cdr, ax=ax1, order =['A','B','C'])
sns.countplot(data=test, x='community', hue='risk', dodge=True, palette= palette_apoe, ax=ax2, order =['A','B','C'])

ax1.set_xlabel('Community', fontsize=14)
ax1.set_ylabel('Count', fontsize=14)
ax2.set_xlabel('Community', fontsize=14)
ax2.set_ylabel('Count', fontsize=14)

ax1.tick_params(axis='both', which='major', labelsize=12)
ax2.tick_params(axis='both', which='major', labelsize=12)

plt.suptitle('Distribution of CDR and APOE-risk across communities', fontsize=20)

plt.savefig('community_dist_cdr_apoe.svg')

In [None]:
sign_var_bin = ['risk', 'cdr_gs']

In [None]:
from statsmodels.stats.proportion import proportions_ztest

def Z_prop_pair(df, var, comm1, comm2):
    """Print a two-sample proportion z-test of ``var`` between two communities.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``community`` column and the column ``var``.
    var : str
        Column whose per-community sum is used as the number of successes.
    comm1, comm2 : hashable
        The two community labels to compare.

    Missing values are dropped before counting, so the number of observations
    matches the values that enter the sum (previously NaN rows were counted in
    the denominator but not in the numerator). The result is printed, nothing
    is returned.

    NOTE(review): summing the column as a success count assumes 0/1 coding —
    confirm this holds for every variable passed in (e.g. ``cdr_gs``).
    """
    sample1 = df.loc[df['community'] == comm1, var].dropna()
    sample2 = df.loc[df['community'] == comm2, var].dropna()

    count1 = sample1.sum()
    count2 = sample2.sum()

    nobs1 = len(sample1)
    nobs2 = len(sample2)

    z_stat, p_val = proportions_ztest([count1, count2], [nobs1, nobs2])

    print(f"variable: {var}, {comm1}, {comm2}, Z-score: {z_stat}, P-value: {p_val}")


In [None]:
Z_prop_pair(test, 'risk', 'A', 'B')
Z_prop_pair(test, 'risk', 'B', 'C')
Z_prop_pair(test, 'risk', 'A', 'C')

In [None]:
Z_prop_pair(test, 'cdr_gs', 'A', 'B')
Z_prop_pair(test, 'cdr_gs', 'B', 'C')
Z_prop_pair(test, 'cdr_gs', 'A', 'C')

 - Categorical: Chi-squared test  

In [None]:
[#'risk',
 #'cdr_gs',
 #'age',
 #'PALFAMS28',
 'education',
 'physact_freq',
 #'ptau217',
 #'community'
 ]

In [None]:
cat_palette1= sns.light_palette((0.2823529411764706, 0.47058823529411764, 0.8156862745098039), n_colors=7)
cat_palette1

In [None]:
cat_palette2= sns.light_palette((0.9333333333333333, 0.5215686274509804, 0.2901960784313726), n_colors=7)
cat_palette2

In [None]:
# Legend text for the numeric education codes (1-6).
# Typo fixed in level 5 ("og" -> "or"); the text is shown in the figure legend.
education_levels = {
    1: 'Left formal education before the age of 16',
    2: 'Left at age 16',
    3: 'Left at age 17-18',
    4: 'Undergraduate degree or equivalent',
    5: "Master's degree or equivalent",
    6: 'PhD or equivalent'
}

In [None]:
# Education level counts per community, with readable legend text.
plt.figure(figsize=(10,10))
ax = sns.countplot(data=test, x='community', hue='education', dodge=True, palette=cat_palette1[1:], order=['A','B','C'])

handles, labels = ax.get_legend_handles_labels()
# Translate every legend label, keeping handles and labels one-to-one.
# Labels are parsed through float() so that e.g. '3.0' (column read as float)
# is recognised too; a label that cannot be translated is shown as it is
# instead of being dropped, which would shift the remaining labels onto the
# wrong handles.
new_labels = []
for label in labels:
    try:
        new_labels.append(education_levels[int(float(label))])
    except (ValueError, KeyError):
        new_labels.append(label)
ax.legend(handles, new_labels, title='Education Level', fontsize=14)
plt.xlabel('Community', fontsize=14)
plt.ylabel('Count', fontsize=14)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.title('Education distribution across communities', fontsize=18)
plt.tight_layout()
plt.savefig('community_dist_education.svg')

# Pairwise chi-squared tests. Done inline because the chi2_pair helper is only
# defined further down the notebook, so calling it here fails when the cells
# are run top to bottom on a fresh kernel.
for comm1, comm2 in [('A', 'B'), ('B', 'C'), ('A', 'C')]:
    pair_df = test[test['community'].isin([comm1, comm2])]
    cont_table = pd.crosstab(pair_df['education'], pair_df['community'])
    chi_sq, p_val, _, _ = chi2_contingency(cont_table)
    print(f'{comm1} and {comm2}, variable: education, Chi-sq: {chi_sq}, p-value: {p_val}')


In [None]:
A_df = test[test['community']=='A']
B_df = test[test['community']=='B']
C_df = test[test['community']=='C']

In [None]:
# `variable` is not assigned anywhere in the notebook, so this cell raised
# NameError on a fresh kernel. The medians are now printed for every
# significant numeric variable; nanmedian ignores missing values.
# NOTE(review): which single variable was originally meant is unknown — confirm.
for variable in sign_var_num:
    print(variable)
    print(np.nanmedian(A_df[variable]))
    print(np.nanmedian(B_df[variable]))
    print(np.nanmedian(C_df[variable]))

In [None]:
physact_levels = {
    1: 'Daily',
    2: '2-3 times a week',
    3: 'Weekly',
    4: 'Occasionally',
    5: "Never",
}

In [None]:
# Physical activity frequency counts per community, with readable legend text.
plt.figure(figsize=(10,10))
axx = sns.countplot(data=test, x='community', hue='physact_freq', dodge=True, palette=cat_palette2[1:], order=['A','B','C'])

handles, labels = axx.get_legend_handles_labels()
# Translate every legend label, keeping handles and labels one-to-one.
# Labels are parsed through float() so that e.g. '3.0' (column read as float)
# is recognised too; a label that cannot be translated is shown as it is
# instead of being dropped, which would shift the remaining labels onto the
# wrong handles. (The debugging print of the label list was removed.)
new_labels = []
for label in labels:
    try:
        new_labels.append(physact_levels[int(float(label))])
    except (ValueError, KeyError):
        new_labels.append(label)
axx.legend(handles, new_labels, title='Physical activity level', fontsize=14)

plt.xlabel('Community', fontsize=14)
plt.ylabel('Count', fontsize=14)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.title('Physical activity distribution across communities', fontsize=18)
plt.tight_layout()
plt.savefig('community_dist_physact.svg')
plt.show()

# Pairwise chi-squared tests. Done inline because the chi2_pair helper is only
# defined in a later cell, so calling it here fails when the cells are run top
# to bottom on a fresh kernel.
for comm1, comm2 in [('A', 'B'), ('B', 'C'), ('A', 'C')]:
    pair_df = test[test['community'].isin([comm1, comm2])]
    cont_table = pd.crosstab(pair_df['physact_freq'], pair_df['community'])
    chi_sq, p_val, _, _ = chi2_contingency(cont_table)
    print(f'{comm1} and {comm2}, variable: physact_freq, Chi-sq: {chi_sq}, p-value: {p_val}')

In [None]:
def chi2_pair(df, var, comm1, comm2):
    """Print a chi-squared independence test of ``var`` between two communities.

    Rows of ``df`` belonging to ``comm1`` or ``comm2`` are cross-tabulated
    (``var`` against ``community``) and passed to ``chi2_contingency``.
    The statistic and p-value are printed; nothing is returned.
    """
    in_pair = df['community'].isin([comm1, comm2])
    pair_rows = df[in_pair]

    contingency = pd.crosstab(pair_rows[var], pair_rows['community'])
    result = chi2_contingency(contingency)
    chi_sq, p_val = result[0], result[1]

    print(f'{comm1} and {comm2}, variable: {var}, Chi-sq: {chi_sq}, p-value: {p_val}')



# CHECK groups against MoCA:
ANOVA

In [None]:
# Keep only rows with a cognitive score between 5 and 30 (both inclusive); rows
# with a missing score are dropped as well. The reset index is assigned back —
# previously reset_index was called without keeping its result, so it had no effect.
test = test[test['cog_scale_score'].between(5, 30)].reset_index(drop=True)
test

In [None]:
plot_var_comm_box(communities, 'cog_scale_score')

In [None]:
for_anova_moca = sign_var+['cog_scale_score']

In [None]:
ax = sns.pairplot(test[for_anova_moca], hue='community', palette='muted')
plt.savefig('pairplot_allsign_moca.png')
plt.show()


In [None]:
# One-way ANOVA of the cognitive score across communities (one sample per community).
moca_by_community = [scores for _, scores in test.groupby('community')['cog_scale_score']]
F_stat_moca, p_val_moca = f_oneway(*moca_by_community)

print(f'  F-statistic: {F_stat_moca},     p-val: {p_val_moca}')

In [None]:
t_test_pair(test, 'cog_scale_score', 'A', 'B')
t_test_pair(test, 'cog_scale_score', 'B', 'C')
t_test_pair(test, 'cog_scale_score', 'A', 'C')


In [None]:
0.05/3

In [None]:
# NOTE(review): `data`, 'species' and 'sepal_length' are not defined or used
# anywhere else in the visible notebook, so this cell raises NameError on a
# fresh kernel. It looks like a template for the median lines drawn in the
# following cell — confirm and remove.
medians = data.groupby('species')['sepal_length'].median()
# Mark the median of each group
for species, median in medians.items():
    plt.axvline(median, color='r', linestyle='--', label=f'Median ({species}): {median:.2f}')

plt.legend()

In [None]:
# KDE of the MoCA score per community, with each community's median marked.
# The densities use the same community -> colour mapping as the median lines
# (palette=color_map), so a line can be matched to its curve by colour.
# displot creates its own figure, so the 20 x 6 size is requested through
# height/aspect; "figure.figsize" passed via sns.set_context is not applied.
sns.displot(data = test, x='cog_scale_score', hue='community', kind='kde', fill=True,
            palette=color_map, height=6, aspect=20 / 6)
comm_medians= test.groupby('community')['cog_scale_score'].median()

for comm, median in comm_medians.items():
    plt.axvline(median, color=color_map[comm], linestyle='-', linewidth=3, label=f'{comm}: {median:.2f}')

plt.title('MoCA score distribution across communities')
plt.xlabel('MoCA score')
plt.legend()
plt.savefig('MoCA_distribution_communities.png')
plt.show()