### Notebook for the processing of bulk ATAC-Seq data from Epithelial cells with COPD

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre -  Helmholtz Munich**
- v230323

### Import required modules

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
import os, sys, shutil, importlib, glob
import celloracle as co
from celloracle import motif_analysis as ma

INFO:matplotlib.font_manager:Failed to extract font properties from /System/Library/Fonts/LastResort.otf: tuple indices must be integers or slices, not str
INFO:matplotlib.font_manager:Failed to extract font properties from /System/Library/Fonts/Supplemental/NISC18030.ttf: In FT2Font: Could not set the fontsize (invalid pixel size; error code 0x17)
INFO:matplotlib.font_manager:Failed to extract font properties from /System/Library/Fonts/Apple Color Emoji.ttc: In FT2Font: Could not set the fontsize (invalid pixel size; error code 0x17)


### Set working environment

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

### Load bed file from `GSE152779` processed peaks

In [3]:
file_path_of_bed_file = "/Volumes/A7V/raw_reads/GSE152779/GSE152779_peaks.bed" 
bed = ma.read_bed(file_path_of_bed_file)
print(bed.shape)
bed.head()

(889416, 7)


Unnamed: 0,chrom,start,end,name,score,strand,seqname
0,chr1,713779,714381,FAST1_peak_1a,52,.,chr1_713779_714381
1,chr1,713779,714381,FAST1_peak_1b,215,.,chr1_713779_714381
2,chr1,805231,805496,FAST1_peak_2,22,.,chr1_805231_805496
3,chr1,911379,911899,FAST1_peak_3,72,.,chr1_911379_911899
4,chr1,935358,936188,FAST1_peak_4,86,.,chr1_935358_936188


In [4]:
peaks = ma.process_bed_file.df_to_list_peakstr(bed)
peaks

array(['chr1_713779_714381', 'chr1_713779_714381', 'chr1_805231_805496',
       ..., 'chrX_154493583_154493874', 'chrX_154563896_154564117',
       'chrX_154841912_154842671'], dtype=object)

### Make TSS annotation

In [5]:
tss_annotated = ma.get_tss_info(peak_str_list = peaks, ref_genome = 'hg38')
tss_annotated.tail()

que bed peaks: 889416
tss peaks in que: 38991


Unnamed: 0,chr,start,end,gene_short_name,strand
38986,chr21,30487097,30487407,KRTAP19-2,-
38987,chr21,30487081,30487404,KRTAP19-2,-
38988,chr21,30487081,30487459,KRTAP19-2,-
38989,chr21,30487074,30487448,KRTAP19-2,-
38990,chr21,30487091,30487409,KRTAP19-2,-


In [6]:
peak_id_tss = ma.process_bed_file.df_to_list_peakstr(tss_annotated)
tss_annotated = pd.DataFrame({"peak_id": peak_id_tss,
                              "gene_short_name": tss_annotated.gene_short_name.values})
tss_annotated = tss_annotated.reset_index(drop = True)
print(tss_annotated.shape)
tss_annotated.head()

(38991, 2)


Unnamed: 0,peak_id,gene_short_name
0,chr17_4583060_4583325,SMTNL2
1,chr17_4583036_4583307,SMTNL2
2,chr17_4583142_4583293,SMTNL2
3,chr17_4583133_4583351,SMTNL2
4,chr17_4583141_4583288,SMTNL2


### Save object

In [7]:
tss_annotated.to_csv("processed_peak_file.csv")