Identifying neuron and non-neuronal candidate cis-regulatory elements (cCREs) and their genome coordinates based on Li et al 2021 An Atlas of Gene Regulatory Elements in Adult Mouse Cerebrum

In [2]:
import numpy as np
import pandas as pd

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
pd.set_option('display.max_rows', 10)

In [5]:
cCRE_df = pd.read_csv('../data/regulatory_elements/Li_et_al_2021/Supplementary Table 8 - Cell type assignment of cCREs.txt',sep='\t')

In [6]:
cCRE_df[['coordinate','name']]=cCRE_df['cCREs'].str.split('|',expand=True)

In [8]:
cCRE_df[['chromosome','start-end']]=cCRE_df['coordinate'].str.split(':',expand=True)

In [10]:
cCRE_df[['start','end']]=cCRE_df['start-end'].str.split('-',expand=True)

In [11]:
cCRE_df

Unnamed: 0,cCREs,cluster,coordinate,name,chromosome,start-end,start,end
0,chr1:3514481-3515234|cCREs108,CNUGA,chr1:3514481-3515234,cCREs108,chr1,3514481-3515234,3514481,3515234
1,chr1:3670396-3672765|cCREs142,CNUGA,chr1:3670396-3672765,cCREs142,chr1,3670396-3672765,3670396,3672765
2,chr1:4571276-4572407|cCREs226,CNUGA,chr1:4571276-4572407,cCREs226,chr1,4571276-4572407,4571276,4572407
3,chr1:4722514-4723338|cCREs242,CNUGA,chr1:4722514-4723338,cCREs242,chr1,4722514-4723338,4722514,4723338
4,chr1:4807341-4808403|cCREs258,CNUGA,chr1:4807341-4808403,cCREs258,chr1,4807341-4808403,4807341,4808403
...,...,...,...,...,...,...,...,...
9644318,chrY:1285546-1286912|cCREs491800,VPIA3,chrY:1285546-1286912,cCREs491800,chrY,1285546-1286912,1285546,1286912
9644319,chrY:90740854-90745088|cCREs491807,VPIA3,chrY:90740854-90745088,cCREs491807,chrY,90740854-90745088,90740854,90745088
9644320,chrY:90760961-90761754|cCREs491808,VPIA3,chrY:90760961-90761754,cCREs491808,chrY,90760961-90761754,90760961,90761754
9644321,chrY:90808231-90809111|cCREs491815,VPIA3,chrY:90808231-90809111,cCREs491815,chrY,90808231-90809111,90808231,90809111


In [12]:
# renaming to match terms found in supplementary table 3
cCRE_df["cluster"].replace({"TL5GL1": "ITL5GL1", "TL5GL2": "ITL5GL2", "TL5GL3": "ITL5GL3"}, inplace=True)

In [13]:
cluster_df = pd.read_excel('../data/regulatory_elements/Li_et_al_2021/Supplementary Table 3 - Cell cluster annotation.xlsx',engine='openpyxl')

In [14]:
subtype_df = cluster_df[['Sub types','Major Type','Class']]

In [15]:
subtype_df.Class.unique()

array(['NonN', 'GABA', 'Glutamate'], dtype=object)

In [16]:
cCRE_class_df=pd.merge(left=cCRE_df,right=subtype_df,left_on='cluster',right_on='Sub types',how='left')

In [17]:
# Making sure that everything matches
cCRE_class_df[cCRE_class_df['Class'].isna()].cluster.value_counts()

Series([], Name: cluster, dtype: int64)

In [18]:
cCRE_class_df.Class.unique()

array(['GABA', 'Glutamate', 'NonN'], dtype=object)

In [19]:
GABA_df = cCRE_class_df[cCRE_class_df['Class']=='GABA']

In [20]:
GABA_cCREs = set(GABA_df.name.unique())

In [21]:
Glutamate_df = cCRE_class_df[cCRE_class_df['Class']=='Glutamate']

In [22]:
Glutamate_cCREs = set(Glutamate_df.name.unique())

In [23]:
NonN_df = cCRE_class_df[cCRE_class_df['Class']=='NonN']

In [24]:
NonN_cCREs = set(NonN_df.name.unique())

In [25]:
inner_N_cCREs = Glutamate_cCREs & GABA_cCREs # intersection of excitatory and inhibitory neuronal cCREs

In [26]:
outer_N_cCREs = Glutamate_cCREs | GABA_cCREs # All cCREs related to neuron

In [27]:
len(Glutamate_cCREs)

326907

In [28]:
len(GABA_cCREs)

365518

In [29]:
len(inner_N_cCREs)

241853

In [30]:
len(outer_N_cCREs)

450572

In [31]:
strict_NonN_cCREs = NonN_cCREs - outer_N_cCREs

In [32]:
len(strict_NonN_cCREs)

41246

In [33]:
len(NonN_cCREs)

185154

In [34]:
strict_neuronal_cCREs = inner_N_cCREs - NonN_cCREs

In [35]:
len(strict_neuronal_cCREs)

141610

In [50]:
strict_neuronal_df = cCRE_class_df[cCRE_class_df['name'].isin(strict_neuronal_cCREs)]
strict_neuronal_df.drop(['cluster','start-end','Sub types','Major Type','Class'],axis=1,inplace=True)
strict_neuronal_df.drop_duplicates(inplace=True)
strict_neuronal_df.drop(['cCREs','coordinate'],axis=1,inplace=True)
strict_neuronal_df['length']=strict_neuronal_df['end'].astype(int)-strict_neuronal_df['start'].astype(int)

In [55]:
strict_NonN_df = cCRE_class_df[cCRE_class_df['name'].isin(strict_NonN_cCREs)]
strict_NonN_df.drop(['cluster','start-end','Sub types','Major Type','Class'],axis=1,inplace=True)
strict_NonN_df.drop_duplicates(inplace=True)
strict_NonN_df.drop(['cCREs','coordinate'],axis=1,inplace=True)
strict_NonN_df['length']=strict_NonN_df['end'].astype(int)-strict_NonN_df['start'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  strict_NonN_df.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  strict_NonN_df['length']=strict_NonN_df['end'].astype(int)-strict_NonN_df['start'].astype(int)


In [57]:
strict_NonN_df['length'].min()

501

In [58]:
strict_NonN_df.to_csv('../results/cCREs/strict_NonN.tsv',sep='\t',index=False)

In [59]:
strict_neuronal_df.to_csv('../results/cCREs/strict_neuronal.tsv',sep='\t',index=False)