### Notebook to scan for TF binding motifs to generate a base GRN combining the ATAC-seq peaks and motif information.

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre -  Helmholtz Munich**
- v230323

### Import required modules

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
import os, sys, shutil, importlib, glob
import celloracle as co
from celloracle import motif_analysis as ma

from celloracle.utility import save_as_pickled_object

INFO:matplotlib.font_manager:Failed to extract font properties from /System/Library/Fonts/LastResort.otf: tuple indices must be integers or slices, not str
INFO:matplotlib.font_manager:Failed to extract font properties from /System/Library/Fonts/Apple Color Emoji.ttc: In FT2Font: Could not set the fontsize (invalid pixel size; error code 0x17)
INFO:matplotlib.font_manager:Failed to extract font properties from /System/Library/Fonts/Supplemental/NISC18030.ttf: In FT2Font: Could not set the fontsize (invalid pixel size; error code 0x17)


In [2]:
# Show the celloracle version this notebook was run with (provenance)
co.__version__

'0.12.0'

### Set working environment

In [3]:
# Render matplotlib figures inline, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Notebook-wide defaults: figure size and the DPI used when saving figures
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

### Reference genome data preparation

In [4]:
# Reference genome name; passed to the peak check and to TFinfo below
ref_genome = "GRCh38"

# Ask celloracle whether this genome is installed and print the answer
genome_installation = ma.is_genome_installed(ref_genome = ref_genome)
print(ref_genome, "installation: ", genome_installation)

GRCh38 installation:  True


### Load processed peak data

In [5]:
# Read the processed peak table; the first CSV column becomes the index.
# The preview shows two columns: peak_id and gene_short_name.
peaks = pd.read_csv("processed_peak_file.csv", index_col = 0)
peaks.head()

Unnamed: 0,peak_id,gene_short_name
0,chr17_4583060_4583325,SMTNL2
1,chr17_4583036_4583307,SMTNL2
2,chr17_4583142_4583293,SMTNL2
3,chr17_4583133_4583351,SMTNL2
4,chr17_4583141_4583288,SMTNL2


In [6]:
def decompose_chrstr(peak_str):
    """
    Split a peak identifier into its chromosome, start and end parts.

    The last two "_"-separated fields are taken as start and end; everything
    before them (which may itself contain "_") is the chromosome name.

    Args:
        peak_str (str): peak identifier, e.g. 'chr1_3094484_3095479'

    Returns:
        tuple: chromosome name, start position, end position (all as str)

    Raises:
        ValueError: if peak_str contains no "_" separator.
    """
    fields = peak_str.split("_")

    # Unpacking the two trailing fields raises ValueError when there is
    # only a single field, i.e. when no separator is present.
    start, end = fields[-2:]
    chromosome = "_".join(fields[:-2])

    return chromosome, start, end

from genomepy import Genome

def check_peak_format(peaks_df, ref_genome, min_length=5):
    """
    Check peak format and drop peaks that fail the checks.

     (1) Check chromosome name against the chromosomes of the reference genome.
     (2) Check peak size (length) and remove short DNA sequences (< min_length bp).

    A short summary of the filtering is printed.

    Args:
        peaks_df (pandas.DataFrame): peak table with a "peak_id" column holding
            strings such as 'chr1_3094484_3095479'.
        ref_genome (str): reference genome name, passed to genomepy's Genome.
        min_length (int): minimum peak length to keep. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        pandas.DataFrame: copy of peaks_df restricted to the peaks that pass
        both checks.
    """

    df = peaks_df.copy()

    n_peaks_before = df.shape[0]

    # Decompose peak ids into chromosome / start / end.
    # The column names are given to the constructor directly so that the
    # three columns exist even when the peak table has no rows.
    decomposed = [decompose_chrstr(peak_str) for peak_str in df["peak_id"]]
    df_decomposed = pd.DataFrame(decomposed,
                                 index=peaks_df.index,
                                 columns=["chr", "start", "end"])
    df_decomposed["start"] = df_decomposed["start"].astype(int)
    df_decomposed["end"] = df_decomposed["end"].astype(int)

    # Load genome data
    genome_data = Genome(ref_genome)
    all_chr_list = list(genome_data.keys())

    # Per-peak validity flags (computed once and reused for filtering and counting)
    lengths = np.abs(df_decomposed["end"] - df_decomposed["start"])
    has_valid_length = lengths >= min_length
    has_valid_chr = df_decomposed["chr"].isin(all_chr_list)

    # Keep peaks that are long enough and sit on a known chromosome
    df = df[has_valid_length & has_valid_chr]

    # Data counting
    n_invalid_length = int((lengths < min_length).sum())
    n_peaks_invalid_chr = n_peaks_before - has_valid_chr.sum()
    n_peaks_after = df.shape[0]

    # Report
    print("Peaks before filtering: ", n_peaks_before)
    print("Peaks with invalid chr_name: ", n_peaks_invalid_chr)
    print("Peaks with invalid length: ", n_invalid_length)
    print("Peaks after filtering: ", n_peaks_after)

    return df

In [7]:
# Keep only the peaks that pass the chromosome-name and length checks.
# NOTE(review): this rebinds `peaks`, so the unfiltered table is no longer
# available under that name after this cell runs.
peaks = check_peak_format(peaks, ref_genome)

Peaks before filtering:  38991
Peaks with invalid chr_name:  0
Peaks with invalid length:  0
Peaks after filtering:  38991


### Instantiate TFinfo object

In [8]:
# Build the TFinfo object from the filtered peak table and the reference genome name
tfi = ma.TFinfo(peak_data_frame = peaks, 
                ref_genome = ref_genome) 

### Load `gimmemotifs` database

In [13]:
# Load the gimmemotifs default motif list and preview the first ten entries.
# NOTE(review): `motifs` is reassigned in the "Load vertebrate motifs" cell
# before the scan, so this value is only used for the preview here.
from gimmemotifs.motif import default_motifs
motifs =  default_motifs()
motifs[:10]

[GM.5.0.Sox.0001_AACAAT,
 GM.5.0.Homeodomain.0001_AGCTGTCAnnA,
 GM.5.0.Mixed.0001_snnGGsssGGs,
 GM.5.0.Nuclear_receptor.0001_TAwsTrGGTCAsTrGGTCA,
 GM.5.0.Mixed.0002_GCTAATTA,
 GM.5.0.Nuclear_receptor.0002_wnyrCTTCCGGGkC,
 GM.5.0.bHLH.0001_ACGTG,
 GM.5.0.Myb_SANT.0001_rrCCGTTAAACnGyy,
 GM.5.0.C2H2_ZF.0001_GCGkGGGCGG,
 GM.5.0.GATA.0001_TTATCTsnnnnnnnCA]

In [14]:
import os, glob
from gimmemotifs.motif import MotifConfig
# Locate the gimmemotifs motif directory
config = MotifConfig()
motif_dir = config.get_motif_dir()

# Names of the motif database files (.pfm) found there, in sorted order
motifs_data_name = sorted(
    fname for fname in os.listdir(motif_dir) if fname.endswith(".pfm")
)
motifs_data_name

['CIS-BP.pfm',
 'ENCODE.pfm',
 'HOCOMOCOv10_HUMAN.pfm',
 'HOCOMOCOv10_MOUSE.pfm',
 'HOCOMOCOv11_HUMAN.pfm',
 'HOCOMOCOv11_MOUSE.pfm',
 'HOMER.pfm',
 'IMAGE.pfm',
 'JASPAR2018.pfm',
 'JASPAR2018_fungi.pfm',
 'JASPAR2018_insects.pfm',
 'JASPAR2018_nematodes.pfm',
 'JASPAR2018_plants.pfm',
 'JASPAR2018_urochordates.pfm',
 'JASPAR2018_vertebrates.pfm',
 'JASPAR2020.pfm',
 'JASPAR2020_fungi.pfm',
 'JASPAR2020_insects.pfm',
 'JASPAR2020_nematodes.pfm',
 'JASPAR2020_plants.pfm',
 'JASPAR2020_urochordates.pfm',
 'JASPAR2020_vertebrates.pfm',
 'JASPAR2022.pfm',
 'JASPAR2022_fungi.pfm',
 'JASPAR2022_insects.pfm',
 'JASPAR2022_nematodes.pfm',
 'JASPAR2022_plants.pfm',
 'JASPAR2022_urochordates.pfm',
 'JASPAR2022_vertebrates.pfm',
 'RSAT_insects.pfm',
 'RSAT_plants.pfm',
 'RSAT_vertebrates.pfm',
 'SwissRegulon.pfm',
 'factorbook.pfm',
 'gimme.vertebrate.v5.0.pfm']

### Load vertebrate motifs

In [15]:
from gimmemotifs.motif import read_motifs

# Read the motifs from the gimme.vertebrate.v5.0 database file in the motif
# directory and preview the first ten; this replaces the earlier `motifs` value.
path = os.path.join(motif_dir, "gimme.vertebrate.v5.0.pfm")
motifs = read_motifs(path)
motifs[:10]

[GM.5.0.Sox.0001_AACAAT,
 GM.5.0.Homeodomain.0001_AGCTGTCAnnA,
 GM.5.0.Mixed.0001_snnGGsssGGs,
 GM.5.0.Nuclear_receptor.0001_TAwsTrGGTCAsTrGGTCA,
 GM.5.0.Mixed.0002_GCTAATTA,
 GM.5.0.Nuclear_receptor.0002_wnyrCTTCCGGGkC,
 GM.5.0.bHLH.0001_ACGTG,
 GM.5.0.Myb_SANT.0001_rrCCGTTAAACnGyy,
 GM.5.0.C2H2_ZF.0001_GCGkGGGCGG,
 GM.5.0.GATA.0001_TTATCTsnnnnnnnCA]

### Motif scan

In [16]:
# Scan the peaks with the motifs loaded above (long-running step).
# NOTE(review): fpr = 0.02 is presumably the false-positive rate used to
# derive the per-motif score threshold - confirm against the celloracle docs.
tfi.scan(fpr = 0.02, 
         motifs = motifs,  # If you enter None, default motifs will be loaded.
         verbose = True)

# Write the TFinfo object to file right after the scan
tfi.to_hdf5(file_path = "EpithelialCOPD.celloracle.tfinfo")

Checking your motifs... Motifs format looks good. 

Initiating scanner... 



DEBUG:gimme.scanner:using background: genome GRCh38 with size 200


Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. 



2023-03-23 18:20:59,244 - INFO - determining FPR-based threshold
INFO:gimme.scanner:determining FPR-based threshold


Motif scan started .. It may take long time.



scanning:   0%|          | 0/25011 [00:00<?, ? sequences/s]

DEBUG:gimme.scanner:Scanning


In [17]:
# Preview the motif hits produced by the scan (one row per hit)
tfi.scanned_df.head()

Unnamed: 0,seqname,motif_id,factors_direct,factors_indirect,score,pos,strand
0,chr10_100346875_100347053,GM.5.0.Forkhead.0001,"CEBPZ, NFY, FOXI1, Foxi1","Foxk2, Foxl2, Foxq1, Foxs1, FOXE1, Foxi1, FOXD...",8.815813,126,-1
1,chr10_100346875_100347053,GM.5.0.C2H2_ZF.0006,"KLF5, SP1, KLF12, SP2, KLF4, Sp3, SP3","SP9, Klf7, SP1, THAP1, Sp9, Sp3, KLF7, Sp5, SP...",9.084834,76,1
2,chr10_100346875_100347053,GM.5.0.Mixed.0004,,"BCLAF1, YY1",7.405002,35,1
3,chr10_100346875_100347053,GM.5.0.Myb_SANT.0002,"Prdm11, Mypop",MYPOP,7.099406,118,1
4,chr10_100346875_100347053,GM.5.0.E2F.0004,"E2F4, E2F1","E2f5, E2f4, E2f1, E2F6, E2F4",6.743515,97,-1


### Filtering motifs

In [18]:
# Reset filtering, apply a score threshold of 10 to the motif hits, then
# build the TFinfo dataframe and dictionaries from the remaining hits.
# The reset comes first so that re-running this cell starts from the full scan
# results - NOTE(review): presumed from the method name; confirm.
tfi.reset_filtering()
tfi.filter_motifs_by_score(threshold = 10)
tfi.make_TFinfo_dataframe_and_dictionary(verbose = True)

Filtering finished: 7672944 -> 1533057
1. Converting scanned results into one-hot encoded dataframe.


  0%|          | 0/24958 [00:00<?, ?it/s]

2. Converting results into dictionaries.


  0%|          | 0/4450 [00:00<?, ?it/s]

  0%|          | 0/1716 [00:00<?, ?it/s]

### Get final base GRN

In [19]:
# Export the base GRN as a dataframe and preview it: peak_id and
# gene_short_name, followed by one 0/1 column per transcription factor.
df = tfi.to_dataframe()
df.head()

Unnamed: 0,peak_id,gene_short_name,9430076C15Rik,AC002126.6,AC012531.1,AC226150.2,AFP,AHR,AHRR,AIRE,...,Znf431,Zscan10,Zscan26,Zscan4,arnt,cMyc,cebpa,nMyc,p53,p63
0,chr10_100346875_100347053,SCD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chr10_100346947_100347161,SCD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chr10_100347087_100347438,SCD,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,chr10_100372680_100372946,OLMALINC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chr10_101130776_101130905,TLX1NB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Write the base GRN dataframe to a parquet file.
# NOTE(review): `df` is rebuilt here although the previous cell already
# created it from the same `tfi`; the rebuild looks redundant.
df = tfi.to_dataframe()
df.to_parquet("Epithelial_lung_base_GRN_dataframe.parquet")