In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import rc_context
import bbknn
import re
import json
import os
import rpy2
import anndata
from datetime import date
from scipy.stats import binom_test
from datetime import datetime

# YYYY-MM-DD
today = date.today()
today = today.strftime("%Y-%m-%d")

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Read in peaks matrix and tidy it

In [2]:
%%time

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
    
print(f'{current_time}:...reading peak file...takes 5 mins...')
peaks=pd.read_csv('/nfs/team205/heart/anndata_objects/Foetal/multiome_ATAC/ArchR/project_output/PeakMatrix/Foetal_celltype-by-Peak.csv')
    
peaks.columns = peaks.columns.str.replace('Unnamed: 0', 'fine_grain')
peaks=peaks.set_index(peaks['fine_grain'])
peaks=peaks.drop(columns=['fine_grain'])

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'{current_time}:...done.')

21:47:13:...reading peak file...takes 5 mins...
21:52:04:...done.
CPU times: user 4min 40s, sys: 7.99 s, total: 4min 48s
Wall time: 4min 51s


In [3]:
%%time
all_peaks=pd.DataFrame(peaks.columns, columns=['peak'])
all_peaks['chrom']=all_peaks['peak'].str.split(':',expand=True)[0]
all_peaks['peak_window']=all_peaks['peak'].str.split(':',expand=True)[1]
all_peaks['start']=all_peaks['peak_window'].str.split('_',expand=True)[0].astype(int)
all_peaks['end']=all_peaks['peak_window'].str.split('_',expand=True)[1].astype(int)
all_peaks=all_peaks.set_index(['peak'])
all_peaks.head(4)

CPU times: user 5.19 s, sys: 116 ms, total: 5.31 s
Wall time: 5.31 s


Unnamed: 0_level_0,chrom,peak_window,start,end
peak,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr1:794775_795275,chr1,794775_795275,794775,795275
chr1:817100_817600,chr1,817100_817600,817100,817600
chr1:817796_818296,chr1,817796_818296,817796,818296
chr1:818511_819011,chr1,818511_819011,818511,819011


# Set parameters for subsequent analysis

### Set Cell types

In [4]:
# get list of celltypes for subsequent analyses
cell_types=peaks.index.unique().tolist()

# remove celltypes not for analysis
cell_types.remove('Platelets')
cell_types.remove('VentricularCardiomyocytesCycling')
cell_types.remove('AtrialCardiomyocytesCycling')

#cell_types=cell_types[25:27] # reduced while developing

print(len(cell_types))

44


### Set n_permutations

In [5]:
# Set number of permutations
n_permutations=1000 # reduced while developing
print(n_permutations)

1000


### Set threshold for binarisation

In [6]:
# Set threshold for binarisation
threshold=0.1
threshold_for_filename=str(threshold).replace('.','p')
print(threshold_for_filename)

0p1


# Make some SNP table metadata

In [10]:
%%time
# make a table of metadata about the SNP files
md_efo_ids=[]
md_efo_terms=[]
md_n_SNPs=[]
pointlessly_small_n_SNPs=[]

snps_path='/nfs/team205/heart/EBI_GWAS/index_snps/'

pointlessly_small_threshold=20

files=os.listdir(snps_path)

efo_ids=[]

for file in range(len(files)):
    efo_ids.append(str(files[file].split("_")[0])+'_'+str(files[file].split("_")[1]))

for efo_id in efo_ids:
    file = [f for f in files if f"{efo_id}" in f]
    file = str(file[0])
    snps_df=pd.read_csv(f'{snps_path}{file}')
    snps_df=snps_df.set_index('variant_id')
    snps_df=snps_df.dropna(axis=0) # remove any rows containign NaNs
    if len(snps_df)<pointlessly_small_threshold:
        pointlessly_small_n_SNPs.append(efo_id)
    else:
        snps_df["chrom"] = snps_df["chromosome_name"].apply(lambda x: 'chr'+str(x)).str.split('.',expand=True)[0] # creates a new 'chrom' column and should work even for X or M chromosomes
        md_efo_ids.append(snps_df['efo_id'][0])
        md_efo_terms.append(snps_df['efo_term'][0])
        md_n_SNPs.append(len(snps_df))

md_dict={
    'efo_id':md_efo_ids,
    'efo_term':md_efo_terms,
    'n_SNPs':md_n_SNPs
}

# re-define efo_id list removing the really short ones.
efo_ids_not_too_small= [efo_id for efo_id in efo_ids if efo_id not in pointlessly_small_n_SNPs]

efo_ids=efo_ids_not_too_small

# Reduce while developing
#efo_ids=efo_ids[25:27] # reduced while developing



print(n_traits+' traits found. Removed '+str(len(pointlessly_small_n_SNPs))+' traits with fewer than 20 SNPs')

SNP_md=pd.DataFrame(md_dict)
SNP_md=SNP_md.set_index('efo_id')
SNP_md.sort_values('n_SNPs',ascending=False).to_csv(f'/home/jovyan/data/SNPs_md_{today}.csv')
SNP_md.sort_values('n_SNPs',ascending=False)

89 traits found. Removed 111 traits with fewer than 20 SNPs
CPU times: user 770 ms, sys: 19.2 ms, total: 790 ms
Wall time: 1.06 s


Unnamed: 0_level_0,efo_term,n_SNPs
efo_id,Unnamed: 1_level_1,Unnamed: 2_level_1
MONDO_0005090,schizophrenia,3348
EFO_0001645,coronary artery disease,1907
EFO_0005763,pulse pressure measurement,1706
EFO_0004761,uric acid measurement,633
EFO_0004682,QT interval,592
...,...,...
EFO_1001161,rheumatic heart disease,21
EFO_0005243,myeloperoxidase measurement,21
EFO_0005037,aortic root size,21
EFO_0004507,D dimer measurement,20


In [None]:
# manually inspect the output to find traits we do *not* want
efo_ids_to_be_removed=['MONDO_0005090','EFO_0004761','EFO_0004286','MONDO_0005277','EFO_0005939','EFO_0004265','EFO_0000712','EFO_0005043','EFO_1001857','EFO_0004791','EFO_0000717','EFO_1001976','EFO_0010071','EFO_0003870','EFO_0009552','EFO_0005524','EFO_1001504','EFO_0004762','EFO_0006501','EFO_0008205','EFO_0004534','EFO_0005128','EFO_0004278','EFO_0004214','EFO_0004517','EFO_0004860','EFO_0006919','EFO_0004644','EFO_0007928','EFO_0010178','EFO_0004578','EFO_1002006','EFO_0006790','EFO_0004520','EFO_0006903','EFO_0000668','MONDO_0016820','MONDO_0010679','MONDO_0001134','EFO_0008204','EFO_0004985','EFO_0004311','EFO_0005529','EFO_0004269','EFO_1000881','EFO_0004277','EFO_0005532','EFO_0005239','EFO_0008373','EFO_0008206','EFO_0006522','EFO_0007787','EFO_1001161','EFO_0005243','EFO_0004507']

all_efo_ids=SNP_md.index.tolist()

desired_efo_ids=[x for x in all_efo_ids if x not in efo_ids_to_be_removed]

SNP_md=SNP_md.loc[desired_efo_ids]

n_traits=str(len(SNP_md))
print(n_traits+' traits')

efo_ids=SNP_md.index.tolist() # defines efo_ids for further analysis

SNP_md.sort_values('n_SNPs',ascending=False)

# Evaluate whether SNPs are found in peaks using the all_peaks file

In [None]:
%%time
# Add on a column for each trait, indicating whether a SNP from that trait falls within a peak. This file varies with the which traits are assessed
files=os.listdir(snps_path)

if os.path.isfile(f'/nfs/team205/heart/EBI_GWAS/SNP_mapped_to_peaks/all_peaks_with_SNPs_for_{n_traits}_traits_incremental.csv')==False:
    print(f'...a SNPs in peaks file for this set of traits is NOT found, making one now...')
    for efo_id in efo_ids:

            file = [f for f in files if f"{efo_id}" in f]
            file = str(file[0])

            snps_df=pd.read_csv(f'{snps_path}{file}')
            snps_df=snps_df.set_index('variant_id')

            snps_df["chrom"] = snps_df["chromosome_name"].apply(lambda x: 'chr'+str(x)).str.split('.',expand=True)[0] # creates a new 'chrom' column and should work even for X or M chrosomes

            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print(f'{current_time}_{efo_id}:...evaluating for SNPs in peaks...')

            all_peaks[efo_id]=0
            for snp in range(len(snps_df["chrom"])): # This loop incrementally adds 1 if there is a SNP within a peak (open or closed)
                all_peaks[efo_id][
                    (all_peaks['chrom']==snps_df["chrom"][snp])
                    &
                    (all_peaks['start'] <= snps_df['chromosome_position'][snp])
                    &
                    (all_peaks['end'] >= snps_df['chromosome_position'][snp])
                ]+=1 # Adds one for each SNP which falls inside a peak
    all_peaks.to_csv(f'/nfs/team205/heart/EBI_GWAS/SNP_mapped_to_peaks/all_peaks_with_SNPs_for_{n_traits}_traits_incremental.csv')
else:
    all_peaks=pd.read_csv(f'/nfs/team205/heart/EBI_GWAS/SNP_mapped_to_peaks/all_peaks_with_SNPs_for_{n_traits}_traits_incremental.csv',index_col='peak')
    print(f'...SNPs in peaks file already exists, reading it in...')

all_peaks

...a SNPs in peaks file for this set of traits is not found, making one now...
22:07:57_EFO_0000537:...evaluating for SNPs in peaks...
22:08:17_EFO_0000275:...evaluating for SNPs in peaks...
22:08:39_MONDO_0005178:...evaluating for SNPs in peaks...
22:08:44_EFO_0005763:...evaluating for SNPs in peaks...
22:10:04_EFO_0009184:...evaluating for SNPs in peaks...
22:10:06_EFO_0004831:...evaluating for SNPs in peaks...
22:10:08_EFO_0006919:...evaluating for SNPs in peaks...
22:10:10_EFO_0005527:...evaluating for SNPs in peaks...
22:10:12_EFO_0004282:...evaluating for SNPs in peaks...
22:10:13_EFO_0005524:...evaluating for SNPs in peaks...
22:10:18_EFO_0008205:...evaluating for SNPs in peaks...
22:10:21_EFO_0003870:...evaluating for SNPs in peaks...
22:10:27_EFO_0005055:...evaluating for SNPs in peaks...
22:10:42_EFO_0009094:...evaluating for SNPs in peaks...
22:10:44_EFO_0008373:...evaluating for SNPs in peaks...
22:10:45_MONDO_0005090:...evaluating for SNPs in peaks...
22:13:24_EFO_0010071:

# Make a table of peaks x cell for each cell type, binarise using a threshold, then make 1000 permutations, then overwrite the file

In [None]:
%%time

# makes cell x peak matrix, including 1000 permutations
# NB the SNPs in peaks file needs is specific to the defined set of peaks, which will vary accoring to threshold for binarisation, so this all peaks file needs to be saved according to threshold used

permutations=range(n_permutations)

bin_mat_path='/nfs/team205/heart/EBI_GWAS/binarised_permutation_matrices/'

for cell_type in cell_types:
    if os.path.isfile(f'{bin_mat_path}{cell_type}_{n_permutations}_permutations_bin_threshold_{threshold_for_filename}_matrix.csv') == False:

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        
        print(f'{current_time}...generating binarised matrix for {cell_type}...')

        cell_df=pd.DataFrame(peaks.loc[cell_type])
        cell_df['peak'] = cell_df.index
        cell_df['chrom']=cell_df['peak'].str.split(':',expand=True)[0]
        cell_df['peak_window']=cell_df['peak'].str.split(':',expand=True)[1]
        cell_df['start']=cell_df['peak_window'].str.split('_',expand=True)[0].astype(int)
        cell_df['end']=cell_df['peak_window'].str.split('_',expand=True)[1].astype(int)
        cell_df[f'{cell_type}_binarised_real']=cell_df[cell_type].ge(threshold).astype(int)

        for permutation in permutations:
            cell_df[f'{cell_type}_binarised_permutation_{permutation}'] = np.random.permutation(cell_df[f'{cell_type}_binarised_real'])
        cell_df=cell_df.filter(regex=cell_type+'_')
        cell_df.to_csv(f'{bin_mat_path}{cell_type}_{n_permutations}_permutations_bin_threshold_{threshold_for_filename}_matrix.csv',index=True)
    else:
        print(f'NOT generating binarised matrix for {cell_type} since it already exists')
print('finished')

# Read in Binarised Matrix for each cell type, join it to the all_peaks file which has trait info (binding on the index [peaks]), then save these joined binarised matrix files (now with added trait info) in a different directory

In [None]:
%%time

joined_bin_mat_path='/nfs/team205/heart/EBI_GWAS/binarised_permutation_matrices_joined_incremental/'

for cell_type in cell_types:
    if os.path.isfile(f'{joined_bin_mat_path}{cell_type}_{n_permutations}_permutations_bin_threshold_{threshold_for_filename}_matrix_joined.csv') == False:    
        
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print(f'{current_time}: {cell_type}: making bin matrix with trait for this cell')

        # read in binary matrices which have already been made
        binarised_matrix=pd.read_csv(f'{bin_mat_path}{cell_type}_{n_permutations}_permutations_bin_threshold_{threshold_for_filename}_matrix.csv')

        # tidy and add the columns we need
        binarised_matrix.rename(columns = {'Unnamed: 0':'peak'}, inplace = True)
        binarised_matrix=binarised_matrix.set_index(binarised_matrix.iloc[:,0])
        binarised_matrix=binarised_matrix.drop(columns=['peak'])
        
        # add on the columns indicating whether SNPs are in peaks
        binarised_matrix=binarised_matrix.join(all_peaks)

        # save this modified binary matrix to a separate directory
        binarised_matrix.to_csv(f'{joined_bin_mat_path}{cell_type}_{n_permutations}_permutations_bin_threshold_{threshold_for_filename}_matrix_joined.csv')
    else:
        print(f'file already exists for {cell_type}')


print('finished')

# Calculate enrichment of cell types for different traits

In [None]:
# While developing
efo_ids=efo_ids[2:4]
cell_types=cell_types[2:4]

In [None]:
%%time
# Find the proportion of all peaks which are open in this celltype

output_path='/nfs/team205/heart/EBI_GWAS/enrichment_output/'

list_of_output_dfs=[]

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'...================================================================...') 
print(f'...{current_time}:STARTING')
print(f'...================================================================...') 

# make some empty lists
proportion_of_SNPs_found_in_celltype_specific_open_peaks=[]
proportion_of_all_open_peaks_found_in_this_celltype=[]
proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks=[]
proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion=[]
n_times_proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion=[]
mean_proportions_of_SNPs_in_open_peaks=[]
p_values=[]
efo_id_list=[]
efo_term_list=[]
cell_type_list=[]
n_SNPs_list=[]

permutations=range(n_permutations)

print('...evaluating '+str(len(efo_ids))+' traits, across '+str(len(cell_types))+' cell types...')

for cell_type in cell_types:
    
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print(f'...================================================================...')    
    print(f'...{current_time}: reading binarised matrix for {cell_type}...')
    print(f'...================================================================...') 

    cell_bin_mat=pd.read_csv(f'{joined_bin_mat_path}{cell_type}_{n_permutations}_permutations_bin_threshold_{threshold_for_filename}_matrix_joined.csv',index_col='peak')
    
    prop_bins_in_this_cell_type=(len(cell_bin_mat[cell_bin_mat[f'{cell_type}_binarised_real']==1]))/len(cell_bin_mat)
            
    
    for efo_id in efo_ids:
                
        # grab some metadata
        efo_term=SNP_md.loc[efo_id]['efo_term']
        n_SNPs=SNP_md.loc[efo_id]['n_SNPs']
        
        # add columns which won't change until we run a new cell_type
        proportion_of_all_open_peaks_found_in_this_celltype.append(prop_bins_in_this_cell_type)
        cell_type_list.append(cell_type)

        # add columns which won't change until we run a new efo_id
        n_SNPs_list.append(n_SNPs)
        efo_id_list.append(efo_id)
        efo_term_list.append(efo_term)
        
        # subset to just open regions for this cell type
        # find the proportion of SNPs for this trait that lie within this cell types open peaks
        observed_proportion=(cell_bin_mat[efo_id][cell_bin_mat[f'{cell_type}_binarised_real']==1].sum())/(cell_bin_mat[efo_id].sum())

        proportion_of_SNPs_found_in_celltype_specific_open_peaks.append(observed_proportion)

        proportions_of_SNPs_in_open_peaks=[]
        

        for permutation in permutations:
            proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks.append(cell_bin_mat[efo_id][cell_bin_mat[f'{cell_type}_binarised_permutation_{permutation}']==1].sum()/cell_bin_mat[efo_id].sum())

        proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion = [i for i in proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks if i >= observed_proportion]
        
        n_times_proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion.append(len(proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion))

        p_values.append(len(proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion)/len(permutations)) # p val is simply the proportion of null hypotheses 'observations' greater than the actual observed proportion

        mean_proportions_of_SNPs_in_open_peaks.append(sum(proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks)/len(proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks))
        
        # Plot histograms for each cell type
#        plt.rcParams["figure.figsize"] = (20,10)
#        plt.rcParams["figure.dpi"] = 300

#        plt.hist(proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks,
#                 bins=100,color='red',
#                 range=(0,1),
#                 histtype='stepfilled',edgecolor='none')
#        plt.axvline(x=observed_proportion, color='blue', linestyle='--')
#        plt.legend(['null: proportion of SNPs falling in randomly shuffled OC regions','observed: proportion of SNPs falling cell-type specific OC regions'])
#        plt.title('cell type: '+cell_type+', trait: '+efo_id+', term: '+efo_term+', threshold for binarisation: '+threshold_for_filename)
#        plt.savefig(f'{output_path}{efo_id}_{efo_term}_{cell_type}_{threshold_for_filename}_SNP_enrichment.png')
#        plt.clf() #clears the current plot

        
output_dict={
    'cell_type':cell_type_list,
    'proportion_of_all_open_peaks_found_in_this_celltype':proportion_of_all_open_peaks_found_in_this_celltype,
    'proportion_of_SNPs_found_in_celltype_specific_open_peaks':proportion_of_SNPs_found_in_celltype_specific_open_peaks,
    'n_times_proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion':n_times_proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion,
    'mean_proportions_of_SNPs_in_open_peaks':mean_proportions_of_SNPs_in_open_peaks,
    'p_value':p_values,
    'n_SNPs':n_SNPs_list,
    'efo_id':efo_id_list,
    'efo_term':efo_term_list}

output_df=pd.DataFrame(output_dict)

list_of_output_dfs.append(output_df)
combined_output_df=pd.concat(list_of_output_dfs)
combined_output_df=combined_output_df.sort_values(by=['efo_id'])
combined_output_df=combined_output_df.set_index('cell_type')
combined_output_df.to_csv(f'{output_path}{threshold_for_filename}_all_traits_summary.csv')

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'...================================================================...')    
print(f'...{current_time}:FINSIHED')
print(f'...================================================================...')