Update: Dec 20, 2022: ignore index when outputting promoter_position.tsv; set_value() deprecated, change to .loc
Author: Linhe Xu
This script finds the coordinates (GRCm38.p6) of the upstream promoter region (2000 bp upstream of the 5' end of the gene) for all mouse protein-coding genes. These coordinates will be used for retrieving PhastCons scores from the UCSC Genome Browser.

In [1]:
import numpy as np
import pandas as pd

# Define promoter regions' coordinates

In [2]:
# Load the protein-coding gene table (tab-separated; column names are supplied here,
# so the file is read as having no header row).
# Columns: gene, str (presumably strand, 1 = forward -- confirm against the export),
# chr, start, end.
# 'chr' is read explicitly as text: the column mixes numeric names ('1'..'19') with
# non-numeric ones ('X', 'Y', 'MT', ...). Left to type inference, numeric names can be
# parsed as integers and would then silently fail the string-based chromosome filter
# applied later.
gene_df=pd.read_csv('../results/Ensembl100_mouse/mouse_protein_coding_gene_position.tsv',sep='\t',names=['gene','str','chr','start','end'],dtype={'chr':str})

In [3]:
# Chromosomes to keep: autosomes 1-19, the sex chromosomes and the mitochondrial genome
# (named 'MT' in the input table).
chr_list=[str(autosome) for autosome in range(1,20)]+['X','Y','MT']

In [4]:
# Keep only genes located on the chromosomes listed in chr_list.
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# in-place edits below do not raise pandas' SettingWithCopyWarning.
# NOTE: this cell is not re-runnable on its own -- after one run the names carry the
# 'chr' prefix, so filtering again against chr_list would leave an empty frame.
gene_df=gene_df[gene_df['chr'].isin(chr_list)].copy()
# UCSC Genome Browser uses chrM for mitochondrial DNA, so rename 'MT' to 'M' ...
gene_df.loc[gene_df['chr']=='MT','chr']='M'
# ... and give every chromosome name the 'chr' prefix.
gene_df['chr']='chr'+gene_df['chr']

In [6]:
# Length of the upstream window taken as the promoter, in bp.
PROMOTER_LENGTH=2000

# Compute the promoter window for every gene in one vectorized pass (replaces a
# row-by-row iterrows()/.at loop, which is slow and produced float columns).
#   str == 1       : window ends just before the gene start
#                    -> [start - PROMOTER_LENGTH - 1, start - 1]
#   any other value: window begins at the gene end
#                    -> [end, end + PROMOTER_LENGTH]
# NOTE(review): both branches give promoter_end - promoter_start == PROMOTER_LENGTH,
# which looks like a 0-based, half-open (BED-style) interval derived from 1-based
# inclusive gene coordinates -- confirm against the input convention.
on_forward_strand=gene_df['str']==1
gene_df=gene_df.assign(
    promoter_start=np.where(on_forward_strand,gene_df['start']-PROMOTER_LENGTH-1,gene_df['end']),
    promoter_end=np.where(on_forward_strand,gene_df['start']-1,gene_df['end']+PROMOTER_LENGTH),
)

In [7]:
# Store both promoter boundaries as integers.
gene_df=gene_df.astype({'promoter_start':int,'promoter_end':int})

In [11]:
# Promoter table: every column of gene_df except the gene body coordinates and 'str'.
promoter_df = gene_df.drop(columns=['start','end','str'])

In [12]:
# Write the promoter table as a tab-separated file, without the row index.
promoter_df.to_csv(
    path_or_buf='../results/Ensembl100_mouse/promoter_position.tsv',
    sep='\t',
    index=False,
)

# Match coordinates to celltype-specific genes

In [9]:
# Promoter coordinates of all protein-coding genes, without the gene name column.
# errors='ignore' is needed because 'str' has already been removed when promoter_df
# was built from gene_df; without it drop() raises KeyError on a top-to-bottom run.
all_df=promoter_df.drop(columns=['gene','str'],errors='ignore')

In [10]:
# Make sure promoter_df carries no 'str' column before it is merged with the gene lists.
# errors='ignore' keeps this a no-op when the column is already gone (it is dropped
# when promoter_df is built from gene_df); a plain drop() would raise KeyError.
promoter_df=promoter_df.drop(columns=['str'],errors='ignore')

In [11]:
# Cell types for which a '<celltype>.list.txt' gene list is read below.
celltype_list=[
    'neuron',
    'endothelia',
    'glia',
    'astrocyte',
    'microglia',
    'oligodendrocyte',
]

In [12]:
# Read each cell type's gene list into a one-column ('gene') DataFrame, keyed by cell type.
coord_dict={
    celltype: pd.read_csv(
        '../results/mouse.celltype-specific_genes/protein-coding_w_dNdS/'+celltype+'.list.txt',
        names=['gene'],
    )
    for celltype in celltype_list
}

In [13]:
# Attach promoter coordinates to each gene list; the inner join keeps only genes that
# are present in promoter_df.
for celltype in celltype_list:
    gene_list=coord_dict[celltype]
    coord_dict[celltype]=gene_list.merge(promoter_df,on='gene',how='inner')

In [14]:
# Common prefix of the per-cell-type output files ('<prefix><celltype>.tsv').
path=(
    '../results/mouse.celltype-specific_genes/'
    'protein-coding_w_dNdS/promoter.'
)

In [15]:
# Write one tab-separated file per cell type, without the row index.
# iloc[:, 1:] leaves out the first column of each merged table.
# NOTE(review): after the merge above the first column is 'gene', so gene names are not
# written. The saved preview of coord_dict['neuron'] shows an 'Unnamed: 0' first column
# instead, so this slice may originally have been meant to drop that -- confirm which
# columns the output files should contain.
for celltype in celltype_list:
    out_file=path+celltype+'.tsv'
    coord_dict[celltype].iloc[:,1:].to_csv(out_file,sep='\t',index=False)

In [16]:
# Preview the merged promoter table for the 'neuron' gene list.
coord_dict['neuron']

Unnamed: 0,gene,chr,promoter_start,promoter_end
0,1500009L16Rik,chr10,83720864,83722864
1,1700001L19Rik,chr13,68595420,68597420
2,4930444P10Rik,chr1,16093325,16095325
3,4930447C04Rik,chr12,72940774,72942774
4,4930452B06Rik,chr14,8666240,8668240
...,...,...,...,...
1288,Zim1,chr7,6696450,6698450
1289,Zkscan16,chr4,58941627,58943627
1290,Zmat4,chr8,23634018,23636018
1291,Zpbp,chr11,11462408,11464408
