In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import rc_context
import bbknn
import re
import json
import os
import rpy2
import anndata
from datetime import date
from scipy.stats import binom_test
from datetime import datetime
import requests
import logging

# Run date formatted as YYYY-MM-DD
today = date.today().strftime("%Y-%m-%d")

import warnings
# NOTE(review): called without a category, this silences every warning for the rest of the
# session, so deprecation and indexing warnings raised by later cells are not shown.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Read in peaks matrix and tidy it

In [2]:
%%time

# Load the cell type x peak matrix. The first (unnamed) CSV column holds the fine-grained
# cell type labels and becomes the index of `peaks`.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")

print(f'{current_time}:...reading peak file...takes 5 mins...')
peaks=pd.read_csv('/nfs/team205/heart/anndata_objects/8regions/ArchR/project_output/PeakMatrix/Adult_celltype-by-Peak.csv')

# Rename only the label column (exact match, no substring replacement across the other
# column names) and use it as the index.
peaks=peaks.rename(columns={'Unnamed: 0': 'fine_grain'})
peaks=peaks.set_index('fine_grain')

# replace characters which can break the code
# regex=False is explicit because '+' is a regex quantifier and the default value of
# `regex` differs between pandas versions.
peaks.index=peaks.index.str.replace('+', 'pos', regex=False)
peaks.index=peaks.index.str.replace('/', '_or_', regex=False)

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'{current_time}:...done.')
peaks.head()

15:05:05:...reading peak file...takes 5 mins...
15:09:40:...done.
CPU times: user 4min 13s, sys: 19.2 s, total: 4min 32s
Wall time: 4min 35s


Unnamed: 0_level_0,chr1:794932_795432,chr1:817104_817604,chr1:818775_819275,chr1:819697_820197,chr1:821364_821864,chr1:825517_826017,chr1:826623_827123,chr1:827307_827807,chr1:829896_830396,chr1:830664_831164,...,chrX:155755916_155756416,chrX:155767382_155767882,chrX:155768350_155768850,chrX:155794136_155794636,chrX:155820049_155820549,chrX:155874572_155875072,chrX:155880996_155881496,chrX:155881574_155882074,chrX:155888126_155888626,chrX:155894812_155895312
fine_grain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAIT-like,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06383,0.0,0.0,...,0.0,0.0,0.0,0.021277,0.0,0.0,0.010638,0.0,0.0,0.0
B,0.0,0.004202,0.0,0.0,0.004202,0.0,0.0,0.046218,0.0,0.0,...,0.0,0.004202,0.004202,0.0,0.0,0.004202,0.016807,0.004202,0.004202,0.004202
PC1_vent,0.000356,0.016364,0.006403,0.003202,0.00249,0.004269,0.007471,0.06937,0.000356,0.001423,...,0.001423,0.027392,0.003557,0.000356,0.012807,0.007826,0.040199,0.004269,0.000711,0.001067
CD8posT_cytox,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031915,0.0,0.0,...,0.0,0.010638,0.0,0.0,0.010638,0.0,0.042553,0.0,0.0,0.0
B_plasma,0.0,0.0,0.009346,0.0,0.0,0.0,0.0,0.056075,0.0,0.0,...,0.009346,0.009346,0.0,0.0,0.0,0.009346,0.065421,0.009346,0.0,0.009346


In [3]:
%%time
# Build a per-peak coordinate table from the peak names, which look like
# "<chrom>:<start>_<end>". Each name is split once and the pieces are reused.
peak_names=pd.Series(peaks.columns, name='peak')
chrom_and_window=peak_names.str.split(':',expand=True)
start_and_end=chrom_and_window[1].str.split('_',expand=True)

all_peaks=pd.DataFrame({
    'peak':peak_names,
    'chrom':chrom_and_window[0],
    'peak_window':chrom_and_window[1],
    'start':start_and_end[0].astype(int),
    'end':start_and_end[1].astype(int),
}).set_index(['peak'])
all_peaks.head(4)

CPU times: user 4.11 s, sys: 120 ms, total: 4.23 s
Wall time: 4.23 s


Unnamed: 0_level_0,chrom,peak_window,start,end
peak,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr1:794932_795432,chr1,794932_795432,794932,795432
chr1:817104_817604,chr1,817104_817604,817104,817604
chr1:818775_819275,chr1,818775_819275,818775,819275
chr1:819697_820197,chr1,819697_820197,819697,820197


# Set parameters for subsequent analysis

### Peak window width

In [4]:
# select a window size (bp) of interest (by default it is 500bp)

window_size=500
# Number of bp added to (or, if negative, removed from) each side of the 500 bp peaks.
# Floor division keeps the coordinates as integers; window_size is expected to differ from
# 500 by an even number of bp.
window_adjustment=(window_size-500)//2
print('adjustment '+str(window_adjustment))
window_size_for_filename='peak_width_'+str(window_size)
print(window_size_for_filename)

# Re-derive the bounds from the unmodified 'peak_window' string so that re-running this cell
# does not apply the adjustment a second time.
original_bounds=all_peaks['peak_window'].str.split('_',expand=True).astype(int)
all_peaks['start']=original_bounds[0]-window_adjustment
all_peaks['end']=original_bounds[1]+window_adjustment

all_peaks.head()

adjustment 0.0
peak_width_500


NameError: name 'all_peaks' is not defined

### Set Cell types

In [None]:
# Distinct cell type labels from the peaks index, used by all subsequent analyses
cell_types=peaks.index.drop_duplicates().tolist()
cell_types

In [None]:
# Uncomment the next line to work on a small slice of cell types while developing:
#cell_types=cell_types[25:27]
print(f'{len(cell_types)}')

### Set n_permutations

In [None]:
# Number of random permutations generated for each cell type
n_permutations=1000
print(f'{n_permutations}')

### Set threshold for binarisation

In [None]:
# Peaks with a value >= this threshold are binarised to 1, everything else to 0
threshold=0.05
# Filename-safe spelling of the threshold ('.' becomes 'p')
threshold_for_filename=f'{threshold}'.replace('.','p')
print(threshold_for_filename)

### Set threshold for min number of index SNPs for a trait

In [9]:
# A trait is only processed when its index-SNP file has strictly more rows than this value;
# files at or below it are skipped and recorded in `too_few_snps`.
too_few_snps_threshold=1

# Get SNP data

### Set threshold for LD

In [10]:
# r squared threshold passed to the LD query
LD_threshold=0.9
# Filename-safe spelling of the threshold ('.' becomes 'p')
LD_threshold_for_filename=f'{LD_threshold}'.replace('.','p')
print(LD_threshold_for_filename)

0p9


In [11]:
# Directories for the index SNPs at each processing stage, all under one GWAS directory
gwas_dir='/nfs/team205/heart/JC_SNP_enrichment/8region/EBI_GWAS/'
snps_path=gwas_dir+'index_snps/'
snps_with_LD_path=gwas_dir+'index_snps_with_LD/'
snps_with_LD_with_pos_path=gwas_dir+'index_snps_with_LD_with_pos/'

## Get SNPs in LD

In [12]:
# SNPs known to cause errors when their genomic positions are queried - usually because they
# no longer have assigned coordinates. They are skipped by the lookups below.
# When another such SNP is found, add it to this list.
previous_problematic_snps=[
    'rs9404922',
    'rs5863461',
    'rs140418671',
    'rs10734649',
    'rs7660850',
    'rs7683747',
    'rs59042994',
    'rs35331282',
    'rs5820605',
]

In [13]:
# For every index-SNP file (one per trait), query Ensembl for the SNPs in LD with each index
# SNP and write one CSV per trait. Traits whose output file already exists are skipped.
os.makedirs(snps_with_LD_path,
           exist_ok=True) # makes the directory

# traits skipped because they have too few index SNPs
too_few_snps=[]

# per-file failures (exception and file name); shown as a table at the end of this cell
errorcodes_e_index=[]
errorcodes_file_index=[]

# per-SNP failures (rsID, file it came from, exception)
problematic_snps=[]
problematic_snps_file=[]
problematic_snps_error=[]

# NOTE(review): rsid_col, chrom and pos are used below as column names of the index-SNP files
# but are not assigned in this cell; they must already exist in the kernel. If they do not,
# every processed file fails with a NameError that is recorded in the error table below.


def _fetch_snps_in_ld(rsid):
    """Return the SNPs in LD with ``rsid`` as a list of (population_name, variation2, r2) tuples.

    Queries the Ensembl REST ``/ld/human`` endpoint with the ``1000GENOMES:phase_3:EUR``
    population, ``r2=LD_threshold`` and ``window_size=500``. The list is empty when no SNP is
    in LD. Raises on HTTP errors, timeouts and malformed responses so that the caller can
    record the SNP as problematic.
    """
    server = "https://rest.ensembl.org"
    ext = f"/ld/human/{rsid}/1000GENOMES:phase_3:EUR?r2={LD_threshold};window_size=500"

    # the timeout stops one stalled request from hanging the whole run
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}, timeout=60)
    r.raise_for_status()

    # parse everything before returning, so a malformed entry cannot leave the caller's
    # result lists with unequal lengths
    return [(hit['population_name'], hit['variation2'], hit['r2']) for hit in r.json()]


now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'{current_time}: Fetching SNPs in LD')

index_snp_files=os.listdir(snps_path)


for index_snp_file in index_snp_files:

    efo_id=index_snp_file.split('_')[0]+'_'+index_snp_file.split('_')[1]
    ld_output_file=f'{snps_with_LD_path}{efo_id}_with_SNPs_in_LD_{LD_threshold_for_filename}.csv'

    if os.path.isfile(ld_output_file):
        print('NOT starting (already exists): '+efo_id)
        continue

    snps_df=pd.read_csv(f'{snps_path}{index_snp_file}')

    if len(snps_df)<=too_few_snps_threshold:
        too_few_snps.append(index_snp_file)
        print('NOT starting (too few SNPs): '+efo_id+', '+str(len(snps_df))+' index SNPs')
        continue

    try:

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print(f'{current_time}: Starting '+index_snp_file+', '+str(len(snps_df))+' index SNPs')

        index_snps=[]
        index_chrom=[]
        index_pos=[]
        population_name=[]
        tagged_snps=[]
        tagged_r2=[]

        for i in range(len(snps_df[rsid_col])):

            rsid=snps_df[rsid_col][i]

            # skip missing / non-rs identifiers and SNPs already known to fail
            if not isinstance(rsid, str) or not rsid.startswith('rs') or rsid in previous_problematic_snps:
                continue

            try:
                ld_hits=_fetch_snps_in_ld(rsid)
            except Exception as problematic_snp:
                # record the failure and carry on with the next index SNP
                print('problematic snp: '+rsid+' ('+index_snp_file+')')
                problematic_snps_error.append(problematic_snp)
                problematic_snps.append(rsid)
                problematic_snps_file.append(index_snp_file)
                continue

            if len(ld_hits)==0: # sometimes there will be no SNPs in LD
                ld_hits=[('NA', 'NA', 'NA')]

            # for each index SNP add an extra row with the index SNP itself in the tagged_snps column - makes later reading easier
            ld_hits.append(('NA', rsid, 'index'))

            snp_chrom=snps_df[chrom][i]
            snp_pos=snps_df[pos][i]

            for hit_population, hit_rsid, hit_r2 in ld_hits:
                index_snps.append(rsid)
                index_chrom.append(snp_chrom)
                index_pos.append(snp_pos)
                population_name.append(hit_population)
                tagged_snps.append(hit_rsid)
                tagged_r2.append(hit_r2)

        output_dict={
            'index_snps':index_snps,
            'index_chrom':index_chrom,
            'index_pos':index_pos,
            'population_name':population_name,
            'tagged_snps':tagged_snps,
            'tagged_r2':tagged_r2}

        output=pd.DataFrame(output_dict)

        # remove rows where no tagged SNP could be found
        output = output[output["tagged_snps"] != "NA"]

        if output.empty:
            raise ValueError('no usable index SNPs in '+index_snp_file)

        # Written once, after every index SNP has been processed, so that an interrupted run
        # cannot leave a partial file that is later skipped as "already exists".
        output.to_csv(ld_output_file)

    except Exception as e:
        # Log the error
        errorcodes_e_index.append(e)
        errorcodes_file_index.append(index_snp_file)

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'{current_time}: Finished')

error_dict={
    'errorcodes_e_index':errorcodes_e_index,
    'errorcodes_file_index':errorcodes_file_index}

pd.DataFrame(error_dict)

19:31:16: Fetching SNPs in LD
NOT starting (already exists): EFO_0000537
NOT starting (already exists): EFO_0000275
NOT starting (already exists): EFO_0005053
NOT starting (already exists): MONDO_0005178
NOT starting (already exists): EFO_0005763
NOT starting (already exists): EFO_0009285
NOT starting (already exists): EFO_0009184
NOT starting (already exists): EFO_0004573
NOT starting (already exists): EFO_0021816
NOT starting (already exists): EFO_0009783
NOT starting (already exists): EFO_0004831
NOT starting (already exists): EFO_0009277
NOT starting (too few SNPs): MONDO_0000890, 1 index SNPs
NOT starting (already exists): EFO_0006919
NOT starting (already exists): EFO_1001482
NOT starting (already exists): EFO_0005527
NOT starting (already exists): EFO_OBSTRUCTIVE
NOT starting (already exists): EFO_0004282
NOT starting (already exists): EFO_0006828
NOT starting (already exists): EFO_0004519
NOT starting (already exists): EFO_0005524
NOT starting (already exists): MONDO_0002078
NO

Unnamed: 0,errorcodes_e_index,errorcodes_file_index


## Get coordinates of SNPs in LD

In [14]:
%%time

# Look up the genomic position of every tagged SNP through the Ensembl REST variation endpoint
# and save each trait's table with added 'tagged_pos' / 'tagged_chrom' columns.
# Traits whose output file already exists are skipped.

# per-file failures (exception and file name)
errorcodes_e_LD=[]
errorcodes_file_LD=[]
# per-SNP failures (rsID, file it came from, exception)
problematic_snps=[]
problematic_snps_file=[]
problematic_snps_error=[]

# rsID -> (position, chromosome) for lookups that succeeded; the same rsID can occur in many
# rows and files, so each one is only requested once
variant_position_cache={}


def _lookup_variant_position(rsid):
    """Return ``(start, seq_region_name)`` of the first Ensembl mapping of ``rsid``.

    Returns ``("NA", "NA")`` when the response has no "mappings" entry. Raises on HTTP errors
    and timeouts, and raises LookupError when the mappings list is empty, so that the caller
    records the SNP as problematic instead of silently storing "NA".
    """
    if rsid in variant_position_cache:
        return variant_position_cache[rsid]

    server = "https://rest.ensembl.org"
    ext = f"/variation/human/{rsid}?"
    # the timeout stops one stalled request from hanging the whole run
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}, timeout=60)
    # without this, an error response (e.g. rate limiting) would be stored as "NA"
    r.raise_for_status()
    decoded = r.json()

    if len(decoded)>0 and "mappings" in decoded:
        if len(decoded['mappings'])==0:
            raise LookupError(rsid+' has no genomic mapping')
        first_mapping=decoded['mappings'][0]
        position=(first_mapping['start'], first_mapping['seq_region_name'])
    else:
        position=("NA", "NA")

    variant_position_cache[rsid]=position
    return position


os.makedirs(snps_with_LD_with_pos_path,
           exist_ok=True) # makes the directory

files=os.listdir(snps_with_LD_path)

files=[f for f in files if f"{LD_threshold_for_filename}" in f]

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'{current_time}: Fetching genomic coordinates for SNPs in LD')

for file in files:

    efo_id=file.split('_')[0]+'_'+file.split('_')[1]
    pos_output_file=f'{snps_with_LD_with_pos_path}{efo_id}_with_SNPs_in_LD_{LD_threshold_for_filename}_with_pos.csv'

    if os.path.isfile(pos_output_file):
        print('NOT starting (already exists): '+file)
        continue

    try:

        snps_df=pd.read_csv(f'{snps_with_LD_path}{file}')

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print(f'{current_time}: Starting: '+file+', '+str(len(snps_df))+' total SNPs')

        # one entry per row of snps_df, in row order
        tagged_pos=[]
        tagged_chrom=[]

        for rsid_of_interest in snps_df['tagged_snps']:

            # default for skipped SNPs and failed lookups
            position=("NA", "NA")

            # the isinstance check keeps a missing (NaN) rsID from aborting the whole file
            if isinstance(rsid_of_interest, str) and rsid_of_interest.startswith('rs') and rsid_of_interest not in previous_problematic_snps:

                try:
                    position=_lookup_variant_position(rsid_of_interest)
                except Exception as e:
                    print('problematic snp: '+rsid_of_interest+' ('+file+')')
                    problematic_snps_error.append(e)
                    problematic_snps.append(rsid_of_interest)
                    problematic_snps_file.append(file)

            tagged_pos.append(position[0])
            tagged_chrom.append(position[1])

        snps_df['tagged_pos']=tagged_pos
        snps_df['tagged_chrom']=tagged_chrom
        snps_df.to_csv(pos_output_file)

    except Exception as e:
        # Log the error; no output file is written for this trait
        errorcodes_e_LD.append(e)
        errorcodes_file_LD.append(file)

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'{current_time}: Finished')

19:31:17: Fetching genomic coordinates for SNPs in LD
NOT starting (already exists): EFO_0004644_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0009185_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0005524_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0005207_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0005669_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0000266_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0000407_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0004277_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0007208_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0020101_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0004462_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0005243_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_0005126_with_SNPs_in_LD_0p9.csv
NOT starting (already exists): EFO_RHLESION_with_SNPs_in_LD_0p9.csv
NOT sta

# Make some SNP table metadata

In [15]:
%%time

# make a table of metadata about the SNP files: one row per trait with its EFO term and the
# number of rows (index + tagged SNPs) in its with-positions file
md_efo_ids=[]
md_efo_terms=[]
md_n_SNPs=[]


files=os.listdir(snps_with_LD_with_pos_path)

files=[f for f in files if f"{LD_threshold_for_filename}" in f]

# the index-SNP directory does not change inside the loop, so list it once
original_snp_files=os.listdir(snps_path)

for file in files:

    efo_id=str(file.split('_')[0])+'_'+str(file.split('_')[1])
    md_efo_ids.append(efo_id)

    snps_df=pd.read_csv(f'{snps_with_LD_with_pos_path}{file}')
    snps_df=snps_df.set_index('tagged_snps')

    snps_df["chrom"] = snps_df['tagged_chrom'].apply(lambda x: 'chr'+str(x)).str.split('.',expand=True)[0] # creates a new 'chrom' column and should work even for X or M chromosomes

    md_n_SNPs.append(len(snps_df))

    # Match the whole "<efo_id>_" prefix: a plain substring test could pick the file of a
    # different trait whose name merely contains this id.
    efo_term=[f for f in original_snp_files if f.startswith(efo_id+'_')]
    # the EFO term is the third '_'-separated field of the index-SNP file name
    efo_term=efo_term[0].split('_')[2]
    md_efo_terms.append(efo_term)

md_dict={
    'efo_id':md_efo_ids,
    'efo_term':md_efo_terms,
    'n_SNPs':md_n_SNPs
}


SNP_md=pd.DataFrame(md_dict)
SNP_md=SNP_md.set_index('efo_id')

SNP_md.sort_values('n_SNPs',ascending=False)

CPU times: user 1.6 s, sys: 87.6 ms, total: 1.69 s
Wall time: 2.14 s


Unnamed: 0_level_0,efo_term,n_SNPs
efo_id,Unnamed: 1_level_1,Unnamed: 2_level_1
MONDO_0005090,schizophrenia,50656
EFO_0005763,pulse pressure measurement,39352
EFO_0004761,uric acid measurement,12256
EFO_0004682,QT interval,10386
EFO_0000537,hypertension,9960
...,...,...
HP_0030680,abnormality of cardiovascular system morphology,6
MONDO_0000153,transposition of the great arteries,5
EFO_0007741,R wave amplitude,3
EFO_0006795,serum VEGFR2 concentration measurement,3


In [16]:
# Final list of trait ids to analyse, taken from the metadata table's index
efo_ids=list(SNP_md.index)
n_traits=len(efo_ids)
print(f'{n_traits}')

190


# Evaluate whether SNPs are found in peaks using the all_peaks file

In [17]:
%%time
# Add a column for each trait, counting how many of that trait's SNPs fall within each peak.
# This file varies with which traits are assessed.


all_peaks_path='/nfs/team205/heart/JC_SNP_enrichment/8region/EBI_GWAS/SNP_mapped_to_peaks/'

os.makedirs(all_peaks_path,
           exist_ok=True) # makes the directory

# NOTE(review): the cache file name only encodes the number of traits. A cached file is
# therefore reused even if the set of traits, the LD threshold or the peak window changed;
# delete it to force a rebuild.
snps_in_peaks_file=f'{all_peaks_path}all_peaks_with_SNPs_for_{n_traits}_traits_incremental.csv'

files=os.listdir(snps_with_LD_with_pos_path)
# only use files made with the current LD threshold (same filter as the metadata cell)
files=[f for f in files if f"{LD_threshold_for_filename}" in f]

if not os.path.isfile(snps_in_peaks_file):
    print(f'...a SNPs in peaks file for this set of traits is NOT found, making one now...')
    print(str(len(efo_ids))+' traits in total')

    # peak coordinates as arrays, extracted once instead of once per SNP
    peak_chrom=all_peaks['chrom'].to_numpy()
    peak_start=all_peaks['start'].to_numpy()
    peak_end=all_peaks['end'].to_numpy()

    for efo_id in efo_ids:

        # match the whole "<efo_id>_" prefix rather than any substring
        file = [f for f in files if f.startswith(efo_id+'_')]
        file = str(file[0])

        snps_df=pd.read_csv(f'{snps_with_LD_with_pos_path}{file}')
        snps_df=snps_df.set_index('tagged_snps')

        snps_df["chrom"] = snps_df["tagged_chrom"].apply(lambda x: 'chr'+str(x)).str.split('.',expand=True)[0] # creates a new 'chrom' column *with chr prefix* and should work even for X or M chromosomes

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print(f'{current_time}_{efo_id}:...evaluating for SNPs in peaks...')

        # Counts are accumulated in a plain array and assigned as a column in one step.
        # Chained indexing (all_peaks[efo_id][mask] += 1) may update a temporary copy and
        # leave all_peaks unchanged.
        snps_per_peak=np.zeros(len(all_peaks), dtype=int)

        # iterate over values, not integer keys: snps_df is indexed by rsID labels
        for snp_chrom, snp_pos in zip(snps_df["chrom"].to_numpy(), snps_df['tagged_pos'].to_numpy()):
            if pd.isna(snp_pos):
                continue # a SNP without a position cannot fall inside any peak
            in_peak=(peak_chrom == snp_chrom) & (peak_start <= snp_pos) & (peak_end >= snp_pos)
            snps_per_peak[in_peak]+=1 # Adds one for each SNP which falls inside a peak (open or closed)

        all_peaks[efo_id]=snps_per_peak

    all_peaks.to_csv(snps_in_peaks_file)
else:
    all_peaks=pd.read_csv(snps_in_peaks_file,index_col='peak')
    print(f'...SNPs in peaks file already exists, reading it in...')

all_peaks

...SNPs in peaks file already exists, reading it in...
CPU times: user 4.18 s, sys: 316 ms, total: 4.5 s
Wall time: 4.51 s


Unnamed: 0_level_0,chrom,peak_window,start,end,EFO_0010977,EFO_0004520,MONDO_0001823,EFO_0004286,EFO_0004519,EFO_0005527,...,EFO_0005918,EFO_0004328,EFO_VASCULAR,EFO_0004791,EFO_0005053,EFO_0006900,EFO_0010600,EFO_0005054,EFO_0000717,EFO_0005043
peak,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1:794932_795432,chr1,794932_795432,794932.0,795432.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1:817104_817604,chr1,817104_817604,817104.0,817604.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1:818775_819275,chr1,818775_819275,818775.0,819275.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1:819697_820197,chr1,819697_820197,819697.0,820197.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1:821364_821864,chr1,821364_821864,821364.0,821864.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX:155874572_155875072,chrX,155874572_155875072,155874572.0,155875072.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrX:155880996_155881496,chrX,155880996_155881496,155880996.0,155881496.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrX:155881574_155882074,chrX,155881574_155882074,155881574.0,155882074.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrX:155888126_155888626,chrX,155888126_155888626,155888126.0,155888626.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Print, for each trait, the total number of SNP-in-peak hits, and collect the traits
# whose total is zero

efo_ids_to_drop=[]

for efo_id in efo_ids:
    n_snps_in_peaks=all_peaks[efo_id].sum()
    print(efo_id+': '+str(n_snps_in_peaks))
    if n_snps_in_peaks == 0:
        efo_ids_to_drop.append(efo_id)

EFO_0010977: 2
EFO_0004520: 34
MONDO_0001823: 8
EFO_0004286: 649
EFO_0004519: 1
EFO_0005527: 41
EFO_0009952: 1
EFO_0009609: 4
EFO_0020101: 4
EFO_1001976: 280
HP_0001634: 112
EFO_SEPTAL: 0
EFO_0009094: 22
EFO_0005416: 13
EFO_0005055: 775
EFO_0005532: 23
EFO_0001361: 11
EFO_0005669: 14
EFO_0000266: 32
MONDO_0005090: 4102
EFO_0010820: 4
EFO_0020863: 13
EFO_0004831: 52
EFO_0003875: 2
MONDO_0016820: 74
HP_0030680: 1
EFO_0005251: 0
EFO_0004762: 114
EFO_1001132: 13
EFO_0000537: 961
EFO_0008206: 61
EFO_1001493: 3
EFO_0021817: 6
EFO_0007742: 116
EFO_0008432: 3
EFO_0006919: 15
EFO_0004462: 1360
EFO_0000668: 48
EFO_0007741: 0
EFO_0021815: 21
EFO_0005095: 37
EFO_0005278: 27
EFO_0600025: 0
EFO_0004985: 34
EFO_0021816: 38
EFO_0009276: 3
EFO_0006920: 3
EFO_0006791: 0
EFO_ALLCHD: 38
EFO_0008469: 28
EFO_0010071: 521
EFO_0009552: 242
EFO_0004327: 1061
EFO_1000881: 54
EFO_0005037: 131
EFO_0009184: 39
EFO_0010556: 14
EFO_0005094: 69
EFO_0005243: 38
EFO_0000275: 1124
EFO_0001645: 42
EFO_0006828: 17
EFO_000

### Drop any phenotypes for which there are no SNP-containing peaks

In [19]:
# Keep only the traits that were not marked for dropping, preserving their order
efo_ids=list(filter(lambda efo_id: efo_id not in efo_ids_to_drop, efo_ids))
print('proceed with: '+str(efo_ids))

proceed with: ['EFO_0010977', 'EFO_0004520', 'MONDO_0001823', 'EFO_0004286', 'EFO_0004519', 'EFO_0005527', 'EFO_0009952', 'EFO_0009609', 'EFO_0020101', 'EFO_1001976', 'HP_0001634', 'EFO_0009094', 'EFO_0005416', 'EFO_0005055', 'EFO_0005532', 'EFO_0001361', 'EFO_0005669', 'EFO_0000266', 'MONDO_0005090', 'EFO_0010820', 'EFO_0020863', 'EFO_0004831', 'EFO_0003875', 'MONDO_0016820', 'HP_0030680', 'EFO_0004762', 'EFO_1001132', 'EFO_0000537', 'EFO_0008206', 'EFO_1001493', 'EFO_0021817', 'EFO_0007742', 'EFO_0008432', 'EFO_0006919', 'EFO_0004462', 'EFO_0000668', 'EFO_0021815', 'EFO_0005095', 'EFO_0005278', 'EFO_0004985', 'EFO_0021816', 'EFO_0009276', 'EFO_0006920', 'EFO_ALLCHD', 'EFO_0008469', 'EFO_0010071', 'EFO_0009552', 'EFO_0004327', 'EFO_1000881', 'EFO_0005037', 'EFO_0009184', 'EFO_0010556', 'EFO_0005094', 'EFO_0005243', 'EFO_0000275', 'EFO_0001645', 'EFO_0006828', 'EFO_0009275', 'EFO_0008536', 'EFO_0001666', 'EFO_0007787', 'EFO_0004311', 'EFO_1001017', 'EFO_0008379', 'EFO_0005128', 'EFO_00

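# For each cell type, make a peak x permutation table: binarise the peaks using a threshold, add `n_permutations` random permutations of the binarised column, then save the table (tables that already exist are reused, not overwritten)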

In [20]:
%%time

# Makes, for each cell type, a peak x (real + permutations) binarised matrix.
# NB the binarised peaks depend on the threshold used for binarisation, so each file is saved
# under a name that includes the threshold and the number of permutations.

permutations=range(n_permutations)

bin_mat_path='/nfs/team205/heart/JC_SNP_enrichment/8region/EBI_GWAS/binarised_permutation_matrices/'

os.makedirs(bin_mat_path,
           exist_ok=True) # makes the directory

# Seed for the permutations. Each cell type gets its own generator, seeded with this value
# and the cell type's position in the full peaks index, so a cell type's permutations do not
# depend on which other cell types are processed or skipped in this run.
permutation_seed=0
cell_type_position={name:position for position,name in enumerate(peaks.index.unique())}

for cell_type in cell_types:
    bin_mat_file=f'{bin_mat_path}{cell_type}_{n_permutations}_permutations_bin_threshold_{threshold_for_filename}_matrix.csv'

    if os.path.isfile(bin_mat_file):
        print(f'NOT generating binarised matrix for {cell_type} since it already exists')
        continue

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")

    print(f'{current_time}...generating binarised matrix for {cell_type}...')

    # 1 where the peak value is >= threshold, else 0 (one value per peak)
    binarised_real=peaks.loc[cell_type].ge(threshold).astype(int)
    real_values=binarised_real.to_numpy()

    rng=np.random.default_rng([permutation_seed, cell_type_position[cell_type]])

    # Fill one preallocated array instead of adding columns to a DataFrame one by one.
    # Column 0 is the real data, columns 1.. are the permutations; int8 is enough for 0/1.
    binarised_values=np.empty((len(real_values), n_permutations+1), dtype=np.int8)
    binarised_values[:, 0]=real_values
    for permutation in permutations:
        binarised_values[:, permutation+1]=rng.permutation(real_values)

    # Columns are named explicitly, so no regex built from the cell type label is needed
    column_names=[f'{cell_type}_binarised_real']+[f'{cell_type}_binarised_permutation_{permutation}' for permutation in permutations]

    cell_df=pd.DataFrame(binarised_values, index=binarised_real.index, columns=column_names)
    cell_df.to_csv(bin_mat_file,index=True)

print('finished')

NOT generating binarised matrix for MAIT-like since it already exists
NOT generating binarised matrix for B since it already exists
NOT generating binarised matrix for PC1_vent since it already exists
NOT generating binarised matrix for CD8posT_cytox since it already exists
NOT generating binarised matrix for B_plasma since it already exists
NOT generating binarised matrix for LYVE1posMP_cycling since it already exists
NOT generating binarised matrix for FB3 since it already exists
NOT generating binarised matrix for PC2_atria since it already exists
NOT generating binarised matrix for AVN_bundle_cell since it already exists
NOT generating binarised matrix for Adip3 since it already exists
NOT generating binarised matrix for EC8_ln since it already exists
NOT generating binarised matrix for PC3_str since it already exists
NOT generating binarised matrix for CD8posT_trans since it already exists
NOT generating binarised matrix for aCM2 since it already exists
NOT generating binarised ma

# Read in Binarised Matrix for each cell type, join it to the all_peaks file which has trait info (binding on the index [peaks]), then save these joined binarised matrix files (now with added trait info) in a different directory

In [21]:
%%time

# Directory for the binarised matrices joined to the per-trait SNP columns of all_peaks
joined_bin_mat_path='/nfs/team205/heart/JC_SNP_enrichment/8region/EBI_GWAS/binarised_permutation_matrices_joined_incremental/'

os.makedirs(joined_bin_mat_path, exist_ok=True) # makes the directory

for cell_type in cell_types:
    joined_file=f'{joined_bin_mat_path}{cell_type}_{n_permutations}_permutations_for_{n_traits}_traits_bin_threshold_{threshold_for_filename}_matrix_joined.csv'

    # nothing to do when the joined matrix was written by an earlier run
    if os.path.isfile(joined_file):
        print(f'NOT generating joined binarised matrix for {cell_type} since it already exists')
        continue

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print(f'{current_time}: {cell_type}: making bin matrix with trait for this cell')

    binarised_matrix=(
        # read in the binary matrix which has already been made
        pd.read_csv(f'{bin_mat_path}{cell_type}_{n_permutations}_permutations_bin_threshold_{threshold_for_filename}_matrix.csv')
        # the first, unnamed column holds the peak names; use it as the index
        .rename(columns={'Unnamed: 0':'peak'})
        .set_index('peak')
        # add on the columns indicating whether SNPs are in peaks
        .join(all_peaks)
    )

    # save this modified binary matrix to a separate directory
    binarised_matrix.to_csv(joined_file)


print('finished')

NOT generating joined binarised matrix for MAIT-like since it already exists
NOT generating joined binarised matrix for B since it already exists
NOT generating joined binarised matrix for PC1_vent since it already exists
NOT generating joined binarised matrix for CD8posT_cytox since it already exists
NOT generating joined binarised matrix for B_plasma since it already exists
NOT generating joined binarised matrix for LYVE1posMP_cycling since it already exists
NOT generating joined binarised matrix for FB3 since it already exists
NOT generating joined binarised matrix for PC2_atria since it already exists
NOT generating joined binarised matrix for AVN_bundle_cell since it already exists
NOT generating joined binarised matrix for Adip3 since it already exists
NOT generating joined binarised matrix for EC8_ln since it already exists
NOT generating joined binarised matrix for PC3_str since it already exists
NOT generating joined binarised matrix for CD8posT_trans since it already exists
N

# Calculate enrichment of cell types for different traits

In [22]:
%%time
import logging

# Set up logging
logging.basicConfig(level=logging.ERROR)



# Find the proportion of all peaks which are open in this celltype

output_path='/nfs/team205/heart/JC_SNP_enrichment/8region/EBI_GWAS/enrichment_output/'

os.makedirs(output_path,
           exist_ok=True) # makes the directory

list_of_output_dfs=[]

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'...================================================================...') 
print(f'...{current_time}:STARTING')
print(f'...================================================================...') 

# make some empty lists
proportion_of_SNPs_found_in_celltype_specific_open_peaks=[]
proportion_of_all_open_peaks_found_in_this_celltype=[]

n_times_proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion=[]
mean_proportions_of_SNPs_in_open_peaks=[]
p_values=[]
efo_id_list=[]
efo_term_list=[]
cell_type_list=[]
n_SNPs_list=[]

cell_types_done=[]
n_cell_types_total=len(cell_types)

permutations=range(n_permutations)

print('...evaluating '+str(len(efo_ids))+' traits, across '+str(len(cell_types))+' cell types...')

for cell_type in cell_types:
    
    try:
    
        cell_types_done.append(cell_type)
        n_cell_types_done=len(cell_types_done)
        n_cell_types_remaining=n_cell_types_total-n_cell_types_done

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print(f'...================================================================...')    
        print(f'...{current_time}: {n_cell_types_remaining+1} of {n_cell_types_total} cell types remaining. Reading binarised matrix for {cell_type}...')
        print(f'...================================================================...') 

        cell_bin_mat=pd.read_csv(f'{joined_bin_mat_path}{cell_type}_{n_permutations}_permutations_for_{n_traits}_traits_bin_threshold_{threshold_for_filename}_matrix_joined.csv',index_col='peak')

        prop_bins_in_this_cell_type=(len(cell_bin_mat[cell_bin_mat[f'{cell_type}_binarised_real']==1]))/len(cell_bin_mat)


        for efo_id in efo_ids:

            # grab some metadata
            efo_term=SNP_md.loc[efo_id]['efo_term']
            n_SNPs=SNP_md.loc[efo_id]['n_SNPs']

            # add columns which won't change until we run a new cell_type
            proportion_of_all_open_peaks_found_in_this_celltype.append(prop_bins_in_this_cell_type)
            cell_type_list.append(cell_type)

            # add columns which won't change until we run a new efo_id
            n_SNPs_list.append(n_SNPs)
            efo_id_list.append(efo_id)
            efo_term_list.append(efo_term)

            # subset to just open regions for this cell type
            # find the proportion of SNPs for this trait that lie within this cell types open peaks
            observed_proportion=(cell_bin_mat[efo_id][cell_bin_mat[f'{cell_type}_binarised_real']==1].sum())/(cell_bin_mat[efo_id].sum())

            proportion_of_SNPs_found_in_celltype_specific_open_peaks.append(observed_proportion)

            proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks=[]

            for permutation in permutations:
                proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks.append(cell_bin_mat[efo_id][cell_bin_mat[f'{cell_type}_binarised_permutation_{permutation}']==1].sum()/cell_bin_mat[efo_id].sum())

            proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion = [i for i in proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks if i >= observed_proportion]

            n_times_proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion.append(len(proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion))

            p_values.append(len(proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion)/len(permutations)) # p val is simply the proportion of null hypotheses 'observations' greater than the actual observed proportion

            mean_proportions_of_SNPs_in_open_peaks.append(sum(proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks)/len(proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks))

            # Plot histograms for each cell type
    #        plt.rcParams["figure.figsize"] = (20,10)
    #        plt.rcParams["figure.dpi"] = 300

    #        plt.hist(proportion_of_SNPs_found_in_permutations_of_celltype_specific_open_peaks,
    #                 bins=100,color='red',
    #                 range=(0,1),
    #                 histtype='stepfilled',edgecolor='none')
    #        plt.axvline(x=observed_proportion, color='blue', linestyle='--')
    #        plt.legend(['null: proportion of SNPs falling in randomly shuffled OC regions','observed: proportion of SNPs falling cell-type specific OC regions'])
    #        plt.title('cell type: '+cell_type+', trait: '+efo_id+', term: '+efo_term+', threshold for binarisation: '+threshold_for_filename)
    #        plt.savefig(f'{output_path}{efo_id}_{efo_term}_{cell_type}_{threshold_for_filename}_SNP_enrichment.png')
    #        plt.clf() #clears the current plot

    
    except KeyError as e:
        # Log the error
        logging.error(e)
        print(cell_type)

    # edited so that the file is written incrementally
        
    output_dict={
        'cell_type':cell_type_list,
        'proportion_of_all_open_peaks_found_in_this_celltype':proportion_of_all_open_peaks_found_in_this_celltype,
        'proportion_of_SNPs_found_in_celltype_specific_open_peaks':proportion_of_SNPs_found_in_celltype_specific_open_peaks,
        'mean_proportions_of_SNPs_in_open_peaks':mean_proportions_of_SNPs_in_open_peaks,
        'n_times_proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion':n_times_proportions_of_SNPs_in_permuted_open_peaks_greater_than_observed_proportion,
        'p_value':p_values,
        'n_SNPs':n_SNPs_list,
        'efo_id':efo_id_list,
        'efo_term':efo_term_list}

    output_df=pd.DataFrame(output_dict)

    list_of_output_dfs.append(output_df)
    combined_output_df=pd.concat(list_of_output_dfs)
    combined_output_df=combined_output_df.sort_values(by=['efo_id'])
    combined_output_df=combined_output_df.set_index('cell_type')
    combined_output_df.to_csv(f'{output_path}{threshold_for_filename}_{window_size_for_filename}_SNPs_in_LD{LD_threshold_for_filename}_all_traits_summary.csv')

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f'...================================================================...')    
print(f'...{current_time}:FINSIHED')
print(f'...================================================================...')

...19:31:35:STARTING
...evaluating 176 traits, across 60 cell types...
...19:31:35: 60 of 60 cell types remaining. Reading binarised matrix for MAIT-like...
...19:40:25: 59 of 60 cell types remaining. Reading binarised matrix for B...
...19:49:06: 58 of 60 cell types remaining. Reading binarised matrix for PC1_vent...
...19:59:37: 57 of 60 cell types remaining. Reading binarised matrix for CD8posT_cytox...
...20:08:01: 56 of 60 cell types remaining. Reading binarised matrix for B_plasma...
...20:18:19: 55 of 60 cell types remaining. Reading binarised matrix for LYVE1posMP_cycling...
...20:30:26: 54 of 60 cell types remaining. Reading binarised matrix for FB3...
...20:42:33: 53 of 60 cell types remaining. Reading binarised matrix for PC2_atria...
...20:52:34: 52 of 60 cell types remaining. Reading binarised matrix for AVN_bundle_cell...
...21:11:45: 51 of 60 cell types remaining. Reading binarised matrix for Adip3...
...21:30:07: 50 of 60 cell types remaining. Reading binarised matrix f