In [None]:
import pandas as pd
import glob
from collections import defaultdict
import nease
import gseapy as gp

In [None]:
# Base path prepended to every input/output file name (path to summary files)
path = ''
# Phenotype to analyse; one of:
# adipose_2, hypothalamus, J26675, J26676, J26677, J26678
phenotype = "J26675"

In [None]:
def newAS(filename, psi_thr_min, psi_thr_max):
    """Collect, per depth, the genes that pass the PSI filter in 100 subsamples.

    Reads the whitespace-delimited summary file at ``path + filename``
    (``path`` is the notebook-level base path). Every data row must hold
    exactly seven fields: phenotype, gene, junction, depth, sample_n, psi, tpm.
    Rows whose first field is ``'Phenotype'`` (the header) and blank lines
    are skipped.

    A gene is recorded for a depth when ``psi_thr_min <= psi < psi_thr_max``
    and ``sample_n == 100``. The depth key is the depth field with its last
    character removed, parsed as an integer (e.g. ``'50M'`` -> ``50``).

    Parameters
    ----------
    filename : str
        File name appended to the global ``path``.
    psi_thr_min : float
        Inclusive lower PSI bound.
    psi_thr_max : float
        Exclusive upper PSI bound.

    Returns
    -------
    tuple of set
        Four gene sets, for depth keys 50, 100, 150 and 200 (in that order).
        A depth with no qualifying gene yields an empty set.

    Raises
    ------
    ValueError
        If a data row does not have exactly seven fields, or a numeric field
        cannot be parsed.
    """
    genes_by_depth = defaultdict(set)

    # The context manager closes the file even when a malformed row raises.
    with open(path + filename) as condensed:
        for line in condensed:
            fields = line.split()
            # Skip blank lines (these used to raise IndexError) and the header.
            if not fields or fields[0] == 'Phenotype':
                continue
            _phenotype, gene, _junction, depth, sample_n, psi, _tpm = fields
            # sample_n is only parsed when the PSI filter passes, as before.
            if psi_thr_min <= float(psi) < psi_thr_max and int(sample_n) == 100:
                genes_by_depth[int(depth[:-1])].add(gene)

    return (genes_by_depth[50], genes_by_depth[100],
            genes_by_depth[150], genes_by_depth[200])


In [None]:
# Gene sets for depth keys 50, 100, 150 and 200: genes with 0.05 <= PSI < 0.95
# in 100 subsamples, read from this phenotype's summary file
in50M, in100M, in150M, in200M = newAS(
    filename='/Summary/' + phenotype + '_summary.tsv',
    psi_thr_min=0.05,
    psi_thr_max=0.95,
)

50M

In [None]:
# Load the tab-separated '_50M_unified' table for this phenotype, indexed by 'id'
df = pd.read_csv(
    path + '/Enrichment/' + phenotype + '_50M_unified',
    sep='\t',
    index_col='id',
)

In [None]:
# Reformat the list of exon-skipping events for NEASE.
# Keep only 'ES' events. The explicit .copy() makes `df` an independent frame,
# so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['event_type'] == 'ES'].copy()
df['start_coordinates'] = df['start_coordinates'].astype('int')

df_plus = df[df['strand'] == '+']
# Minus-strand start coordinates are shifted down by one; copy first so the
# assignment targets this frame and not a slice of `df`.
df_minus = df[df['strand'] == '-'].copy()
df_minus['start_coordinates'] = df_minus['start_coordinates'] - 1

# NEASE input columns; minus-strand rows are placed before plus-strand rows.
columns = ['gene', 'start_coordinates', 'end_coordinates']
input_df_minus = pd.DataFrame(df_minus, columns=columns)
input_df_plus = pd.DataFrame(df_plus, columns=columns)
input_df = pd.concat([input_df_minus, input_df_plus])

# Keep only genes in `in50M` (genes with AS detected in 100 subsamples)
input_df = input_df[input_df['gene'].isin(in50M)]
input_df

In [None]:
# Run NEASE on the filtered 50M event table
events = nease.run(
    input_df,
    organism='Human',
)

In [None]:
# NEASE enrichment against KEGG for the 50M events, sorted by 'p_value'
nease_enr_50 = (
    events
    .enrich(database=['KEGG'])
    .sort_values('p_value')
)

In [None]:
# Write the 50M enrichment table as a tab-separated file, then display it
nease_enr_50.to_csv(
    path + '/Enrichment/' + phenotype + '_50M_nease',
    sep='\t',
)
nease_enr_50

In [None]:
# Reload the saved 50M enrichment table and show the names of the pathways
# with 'adj p_value' below 0.05
nease_enr_50 = pd.read_csv(
    path + '/Enrichment/' + phenotype + '_50M_nease',
    sep='\t',
)
nease_enr_50.loc[nease_enr_50['adj p_value'] < 0.05, 'Pathway name']

In [None]:
# Set of pathway names with 'adj p_value' < 0.05 at 50M, used in the comparisons
nease_50M = set(
    nease_enr_50
    .loc[nease_enr_50['adj p_value'] < 0.05, 'Pathway name']
    .to_list()
)

100M

In [None]:
# Load the tab-separated '_100M_unified' table for this phenotype, indexed by 'id'
df = pd.read_csv(
    path + '/Enrichment/' + phenotype + '_100M_unified',
    sep='\t',
    index_col='id',
)

In [None]:
# Reformat the list of exon-skipping events for NEASE.
# Keep only 'ES' events. The explicit .copy() makes `df` an independent frame,
# so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['event_type'] == 'ES'].copy()
df['start_coordinates'] = df['start_coordinates'].astype('int')

df_plus = df[df['strand'] == '+']
# Minus-strand start coordinates are shifted down by one; copy first so the
# assignment targets this frame and not a slice of `df`.
df_minus = df[df['strand'] == '-'].copy()
df_minus['start_coordinates'] = df_minus['start_coordinates'] - 1

# NEASE input columns; minus-strand rows are placed before plus-strand rows.
columns = ['gene', 'start_coordinates', 'end_coordinates']
input_df_minus = pd.DataFrame(df_minus, columns=columns)
input_df_plus = pd.DataFrame(df_plus, columns=columns)
input_df = pd.concat([input_df_minus, input_df_plus])

# Keep only genes in `in100M` (genes with AS detected in 100 subsamples)
input_df = input_df[input_df['gene'].isin(in100M)]
input_df

In [None]:
# Run NEASE on the filtered 100M event table
events_100 = nease.run(
    input_df,
    organism='Human',
)

In [None]:
# NEASE enrichment against KEGG for the 100M events, sorted by 'p_value'
nease_enr_100 = (
    events_100
    .enrich(database=['KEGG'])
    .sort_values('p_value')
)

In [None]:
# Write the 100M enrichment table as a tab-separated file, then display it
nease_enr_100.to_csv(
    path + '/Enrichment/' + phenotype + '_100M_nease',
    sep='\t',
)
nease_enr_100

In [None]:
# Reload the saved 100M enrichment table and show the names of the pathways
# with 'adj p_value' below 0.05
nease_enr_100 = pd.read_csv(
    path + '/Enrichment/' + phenotype + '_100M_nease',
    sep='\t',
)
nease_enr_100.loc[nease_enr_100['adj p_value'] < 0.05, 'Pathway name']

In [None]:
# Set of pathway names with 'adj p_value' < 0.05 at 100M, used in the comparisons
nease_100M = set(
    nease_enr_100
    .loc[nease_enr_100['adj p_value'] < 0.05, 'Pathway name']
    .to_list()
)

150M

In [None]:
# Load the tab-separated '_150M_unified' table for this phenotype, indexed by 'id'
df = pd.read_csv(
    path + '/Enrichment/' + phenotype + '_150M_unified',
    sep='\t',
    index_col='id',
)


In [None]:
# Reformat the list of exon-skipping events for NEASE.
# Keep only 'ES' events. The explicit .copy() makes `df` an independent frame,
# so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['event_type'] == 'ES'].copy()
df['start_coordinates'] = df['start_coordinates'].astype('int')

df_plus = df[df['strand'] == '+']
# Minus-strand start coordinates are shifted down by one; copy first so the
# assignment targets this frame and not a slice of `df`.
df_minus = df[df['strand'] == '-'].copy()
df_minus['start_coordinates'] = df_minus['start_coordinates'] - 1

# NEASE input columns; minus-strand rows are placed before plus-strand rows.
columns = ['gene', 'start_coordinates', 'end_coordinates']
input_df_minus = pd.DataFrame(df_minus, columns=columns)
input_df_plus = pd.DataFrame(df_plus, columns=columns)
input_df = pd.concat([input_df_minus, input_df_plus])

# Keep only genes in `in150M` (genes with AS detected in 100 subsamples)
input_df = input_df[input_df['gene'].isin(in150M)]
input_df

In [None]:
# Run NEASE on the filtered 150M event table
events_150 = nease.run(
    input_df,
    organism='Human',
)

In [None]:
# NEASE enrichment against KEGG for the 150M events, sorted by 'p_value'
nease_enr_150 = (
    events_150
    .enrich(database=['KEGG'])
    .sort_values('p_value')
)

In [None]:
# Write the 150M enrichment table as a tab-separated file, then display it
nease_enr_150.to_csv(
    path + '/Enrichment/' + phenotype + '_150M_nease',
    sep='\t',
)
nease_enr_150

In [None]:
# Reload the saved 150M enrichment table and show the names of the pathways
# with 'adj p_value' below 0.05
nease_enr_150 = pd.read_csv(
    path + '/Enrichment/' + phenotype + '_150M_nease',
    sep='\t',
)
nease_enr_150.loc[nease_enr_150['adj p_value'] < 0.05, 'Pathway name']

In [None]:
# Set of pathway names with 'adj p_value' < 0.05 at 150M, used in the comparisons
nease_150M = set(
    nease_enr_150
    .loc[nease_enr_150['adj p_value'] < 0.05, 'Pathway name']
    .to_list()
)

200M

In [None]:
# Load the tab-separated '_200M_unified' table for this phenotype, indexed by 'id'
df = pd.read_csv(
    path + '/Enrichment/' + phenotype + '_200M_unified',
    sep='\t',
    index_col='id',
)


In [None]:
# Reformat the list of exon-skipping events for NEASE.
# Keep only 'ES' events. The explicit .copy() makes `df` an independent frame,
# so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['event_type'] == 'ES'].copy()
df['start_coordinates'] = df['start_coordinates'].astype('int')

df_plus = df[df['strand'] == '+']
# Minus-strand start coordinates are shifted down by one; copy first so the
# assignment targets this frame and not a slice of `df`.
df_minus = df[df['strand'] == '-'].copy()
df_minus['start_coordinates'] = df_minus['start_coordinates'] - 1

# NEASE input columns; minus-strand rows are placed before plus-strand rows.
columns = ['gene', 'start_coordinates', 'end_coordinates']
input_df_minus = pd.DataFrame(df_minus, columns=columns)
input_df_plus = pd.DataFrame(df_plus, columns=columns)
input_df = pd.concat([input_df_minus, input_df_plus])

# Keep only genes in `in200M` (genes with AS detected in 100 subsamples)
input_df = input_df[input_df['gene'].isin(in200M)]
input_df

In [None]:
# Run NEASE on the filtered 200M event table
events_200 = nease.run(
    input_df,
    organism='Human',
)

In [None]:
# NEASE enrichment against KEGG for the 200M events, sorted by 'p_value'
nease_enr_200 = (
    events_200
    .enrich(database=['KEGG'])
    .sort_values('p_value')
)

In [None]:
# Write the 200M enrichment table as a tab-separated file, then display it
nease_enr_200.to_csv(
    path + '/Enrichment/' + phenotype + '_200M_nease',
    sep='\t',
)
nease_enr_200

In [None]:
# Reload the saved 200M enrichment table and show the names of the pathways
# with 'adj p_value' below 0.05
nease_enr_200 = pd.read_csv(
    path + '/Enrichment/' + phenotype + '_200M_nease',
    sep='\t',
)
nease_enr_200.loc[nease_enr_200['adj p_value'] < 0.05, 'Pathway name']

In [None]:
# Set of pathway names with 'adj p_value' < 0.05 at 200M, used in the comparisons
nease_200M = set(
    nease_enr_200
    .loc[nease_enr_200['adj p_value'] < 0.05, 'Pathway name']
    .to_list()
)

Comparisons

In [None]:
# Save the 200M enrichment rows for pathways in nease_200M but not in nease_50M.
# No `sep` is passed, so this file is comma-separated (the *_nease files are
# tab-separated).
nease_enr_200[
    nease_enr_200['Pathway name'].isin(nease_200M.difference(nease_50M))
].to_csv(
    path + '/Enrichment/' + phenotype + '_200M_50M'
)

In [None]:
# Save the 200M enrichment rows for pathways in nease_200M but not in nease_100M.
# No `sep` is passed, so this file is comma-separated (the *_nease files are
# tab-separated).
nease_enr_200[
    nease_enr_200['Pathway name'].isin(nease_200M.difference(nease_100M))
].to_csv(
    path + '/Enrichment/' + phenotype + '_200M_100M'
)

In [None]:
# Save the 200M enrichment rows for pathways in nease_200M but not in nease_150M.
# No `sep` is passed, so this file is comma-separated (the *_nease files are
# tab-separated).
nease_enr_200[
    nease_enr_200['Pathway name'].isin(nease_200M.difference(nease_150M))
].to_csv(
    path + '/Enrichment/' + phenotype + '_200M_150M'
)

In [None]:
# Pathways in nease_200M that are not in nease_50M
nease_200M.difference(nease_50M)

In [None]:
# Pathways in nease_200M that are not in nease_100M
nease_200M.difference(nease_100M)

In [None]:
# Pathways in nease_200M that are not in nease_150M
nease_200M.difference(nease_150M)

In [None]:
# Pathways in nease_200M that appear in none of the 50M, 100M or 150M sets
nease_200M.difference(nease_50M, nease_100M, nease_150M)

In [None]:
# Show the 200M enrichment rows for pathways in nease_200M but not in nease_150M
nease_enr_200.loc[
    nease_enr_200['Pathway name'].isin(nease_200M.difference(nease_150M))
]