Copied from mouse.gene_position.ipynb.
This script finds the coordinates (GRCg6a) of the upstream promoter region (2000 bp upstream of the 5' end) of every chicken protein-coding gene. These coordinates will be used for retrieving PhastCons scores from the UCSC Genome Browser.

In [1]:
import numpy as np
import pandas as pd

# Define promoter regions' coordinates

In [16]:
# Load the chicken protein-coding gene positions from a tab-separated file;
# column names are supplied here via `names`.
# Columns: gene name, strand ('str': 1 or -1), chromosome, gene start, gene end.
# 'chr' is read explicitly as str: chromosome names mix digits ('1'..'33') with
# letters ('Z', 'W', 'MT'); without an explicit dtype pandas may infer integers
# for some values, and those would never match the string names in chr_list.
gene_df = pd.read_csv(
    '../results/Ensembl105_chicken/chicken_protein_coding_gene_position.tsv',
    sep='\t',
    names=['gene', 'str', 'chr', 'start', 'end'],
    dtype={'chr': str},
)

In [17]:
# Chromosomes to keep: autosomes 1-33, the sex chromosomes Z and W, and the
# mitochondrial genome (named 'MT' in the input table).
chr_list = [str(number) for number in range(1, 34)]
chr_list += ['Z', 'W', 'MT']

In [18]:
# Keep only genes located on the chromosomes listed in chr_list.
# .copy() makes the result an independent frame, so the column edits below (and
# in later cells) do not write into a slice of the original frame
# (avoids pandas' SettingWithCopyWarning).
gene_df = gene_df[gene_df['chr'].isin(chr_list)].copy()
# UCSC Genome Browser names chromosomes 'chr<N>' and uses 'chrM' for
# mitochondrial DNA, so rename 'MT' -> 'M' (exact-value match) and add the prefix.
gene_df['chr'] = 'chr' + gene_df['chr'].replace({'MT': 'M'})

In [20]:
# Promoter = the 2000 bp flank immediately upstream of each gene's 5' end.
#   strand ==  1 : (start - 2001, start - 1)
#   otherwise    : (end,          end + 2000)
# NOTE(review): read as 0-based, half-open (BED-style) intervals these are
# exactly 2000 bp on both strands -- presumably the intended format, assuming
# 'start'/'end' are 1-based gene coordinates; confirm against the input table.
#
# Computed column-wise with np.where instead of a row-by-row iterrows() loop:
# same values, no per-row Python overhead, and the columns stay integer-typed.
on_plus = gene_df['str'] == 1
upstream_start = np.where(on_plus, gene_df['start'] - 2001, gene_df['end'])
upstream_end = np.where(on_plus, gene_df['start'] - 1, gene_df['end'] + 2000)
# A gene closer than 2001 bp to the chromosome start would otherwise get a
# negative start, which is not a valid coordinate; truncate the flank at 0.
gene_df['promoter_start'] = np.maximum(upstream_start, 0)
gene_df['promoter_end'] = upstream_end

In [21]:
# Store both promoter coordinate columns as integers.
for coord_col in ('promoter_start', 'promoter_end'):
    gene_df[coord_col] = gene_df[coord_col].astype(int)

In [22]:
# Promoter table: everything in gene_df except the gene-body start/end columns.
promoter_df = gene_df.drop(columns=['start', 'end'])

In [23]:
# Display the promoter table (gene, strand, chromosome, promoter start/end).
promoter_df

Unnamed: 0,gene,str,chr,promoter_start,promoter_end
0,ND1,1,chrM,2049,4049
1,MT-ND2,1,chrM,3240,5240
2,MT-CO1,1,chrM,4644,6644
3,COII,1,chrM,6330,8330
4,ATP8,1,chrM,7084,9084
...,...,...,...,...,...
16773,ALKBH3,1,chr5,21882896,21884896
16774,,1,chr4,11331697,11333697
16775,NHLRC4,1,chr14,13894397,13896397
16776,GPC1,-1,chr9,3029975,3031975


In [24]:
# Save the promoter table as a tab-separated file; the row index is written as
# the first (unnamed) column.
promoter_df.to_csv(
    '../results/Ensembl105_chicken/promoter_position.tsv',
    sep='\t',
    index=True,
)

# Match coordinates to celltype-specific genes

In [25]:
# Promoter coordinates of all protein-coding genes, without the gene name and
# strand columns.
all_df = promoter_df.drop(columns=['gene', 'str'])

In [26]:
# Drop the strand column; gene name and promoter coordinates remain.
promoter_df = promoter_df.drop(columns=['str'])

In [27]:
# Cell types for which a cell-type-specific gene list is read below.
celltype_list = 'neuron endothelia glia astrocyte microglia oligodendrocyte'.split()

In [28]:
# Read every cell type's gene list into a one-column ('gene') frame, keyed by
# cell type. Presumably each file holds one gene name per line -- verify.
coord_dict = {}
for celltype in celltype_list:
    list_file = f'../results/chicken.celltype-specific_genes/protein-coding_w_dNdS/{celltype}.list.txt'
    coord_dict[celltype] = pd.read_csv(list_file, names=['gene'])

In [30]:
# Attach promoter coordinates to each cell type's genes. The inner join keeps
# only genes present in both the gene list and the promoter table.
for celltype in celltype_list:
    listed_genes = coord_dict[celltype]
    coord_dict[celltype] = listed_genes.merge(promoter_df, on='gene', how='inner')

In [31]:
# Common prefix of the per-cell-type output files; '<celltype>.tsv' is appended
# when the files are written.
path = (
    '../results/chicken.celltype-specific_genes/'
    'protein-coding_w_dNdS/promoter.'
)

In [37]:
# Write each cell type's promoter coordinates as a TSV without header or index,
# leaving out the first column (the gene name).
for celltype in celltype_list:
    coords_only = coord_dict[celltype].iloc[:, 1:]
    coords_only.to_csv(path + celltype + '.tsv', sep='\t', index=False, header=False)

In [36]:
# Report the (rows, columns) shape of each cell type's merged promoter table.
for celltype in celltype_list:
    print(celltype, coord_dict[celltype].shape, sep='\n')

neuron
(919, 4)
endothelia
(625, 4)
glia
(759, 4)
astrocyte
(597, 4)
microglia
(612, 4)
oligodendrocyte
(386, 4)


All gene lists have fewer than 1,000 lines, so there is no need to split them up; they can be fed directly to the UCSC Genome Browser.