# LncATLAS
Publication: "LncATLAS database for subcellular localization of long noncoding RNAs" (2017). David Mas-Ponte, Joana Carlevaro-Fita, Emilio Palumbo, Toni Hermoso Pulido, Roderic Guigo, and Rory Johnson. RNA 23:1080–1087.

The publication is available [online](https://rnajournal.cshlp.org/content/23/7/1080).

In [1]:
import os

# Directory holding the LncATLAS download. Defaults to the original author's
# location; set the LNCATLAS_DIR environment variable to run the notebook on
# another machine without editing this cell.
ATLAS_DIR = os.environ.get('LNCATLAS_DIR', '/Users/jasonmiller/WVU/MDPI/LncAtlas/')
# Other cells build paths as ATLAS_DIR + filename, so the value must end with
# a path separator; joining with '' appends one only when it is missing.
ATLAS_DIR = os.path.join(ATLAS_DIR, '')
# File name of the LncATLAS table read by this notebook.
ATLAS_DATA = 'lncATLAS_all_data_RCI.csv'
# Full path of the input CSV.
infile = ATLAS_DIR + ATLAS_DATA

## Raw data

In [2]:
import pandas as pd

# Read the whole LncATLAS CSV into a DataFrame. The bare name on the last
# line makes the notebook render the (truncated) table as the cell output.
df = pd.read_csv(filepath_or_buffer=infile)
df

Unnamed: 0,ENSEMBL ID,Data Source,Data Type,Value,Gene Name,Coding Type,Biotype
0,ENSG00000000003,A549,CNRCI,1.08068,TSPAN6,coding,coding
1,ENSG00000000003,GM12878,CNRCI,,TSPAN6,coding,coding
2,ENSG00000000003,H1.hESC,CNRCI,1.85734,TSPAN6,coding,coding
3,ENSG00000000003,HeLa.S3,CNRCI,1.86839,TSPAN6,coding,coding
4,ENSG00000000003,HepG2,CNRCI,2.29436,TSPAN6,coding,coding
...,...,...,...,...,...,...,...
714515,ENSG00000283125,NCI.H460,CNRCI,,RP11-299P2.2,nc,nc
714516,ENSG00000283125,NHEK,CNRCI,,RP11-299P2.2,nc,nc
714517,ENSG00000283125,SK.MEL.5,CNRCI,,RP11-299P2.2,nc,nc
714518,ENSG00000283125,SK.N.DZ,CNRCI,,RP11-299P2.2,nc,nc


In [3]:
# Per the author's note, 'Coding Type' is the same as 'Biotype' in every
# case, so only 'Biotype' is tallied here.
biotype_counts = df['Biotype'].value_counts()
print('Biotype')
print(biotype_counts)

Biotype
coding    395940
nc        318580
Name: Biotype, dtype: int64


## Filtered data

In [4]:
# Keep only the rows whose 'Data Type' is 'CNRCI', the binary
# nuclear-vs-cytosolic measure. One cell line (K562) also carries other
# data types such as RCIno; those rows are dropped here.
is_cnrci = df['Data Type'].eq('CNRCI')
bf = df.loc[is_cnrci]

In [5]:
# Discard rows with no measurement (Value is NaN), then display what is left.
has_value = bf['Value'].notna()
qf = bf.loc[has_value]
qf

Unnamed: 0,ENSEMBL ID,Data Source,Data Type,Value,Gene Name,Coding Type,Biotype
0,ENSG00000000003,A549,CNRCI,1.080680,TSPAN6,coding,coding
2,ENSG00000000003,H1.hESC,CNRCI,1.857340,TSPAN6,coding,coding
3,ENSG00000000003,HeLa.S3,CNRCI,1.868390,TSPAN6,coding,coding
4,ENSG00000000003,HepG2,CNRCI,2.294360,TSPAN6,coding,coding
5,ENSG00000000003,HT1080,CNRCI,0.866395,TSPAN6,coding,coding
...,...,...,...,...,...,...,...
714484,ENSG00000283122,HepG2,CNRCI,-2.584960,HYMAI,nc,nc
714485,ENSG00000283122,HT1080,CNRCI,-1.485430,HYMAI,nc,nc
714487,ENSG00000283122,IMR.90,CNRCI,-3.305810,HYMAI,nc,nc
714494,ENSG00000283122,MCF.7,CNRCI,-3.544320,HYMAI,nc,nc


In [6]:
# Biotype tally after filtering, for comparison with the raw counts.
print('Biotype', qf['Biotype'].value_counts(), sep='\n')

Biotype
coding    169966
nc         28217
Name: Biotype, dtype: int64


In [7]:
# Columns kept from the filtered table, and the short names they are given.
fields = ['ENSEMBL ID', 'Data Source', 'Value']
rename_map = {'ENSEMBL ID': 'gene_id', 'Data Source': 'cell_line', 'Value': 'RCI'}

# Split the filtered rows by biotype. Rows and columns are selected in one
# .loc call (no chained indexing) and rename() returns a new frame, so no
# derived slice is mutated in place and re-running the cell is idempotent.
coding = qf.loc[qf['Biotype'] == 'coding', fields].rename(columns=rename_map)
noncoding = qf.loc[qf['Biotype'] == 'nc', fields].rename(columns=rename_map)

In [8]:
# Display the coding-gene table (columns gene_id, cell_line, RCI).
coding

Unnamed: 0,gene_id,cell_line,RCI
0,ENSG00000000003,A549,1.080680
2,ENSG00000000003,H1.hESC,1.857340
3,ENSG00000000003,HeLa.S3,1.868390
4,ENSG00000000003,HepG2,2.294360
5,ENSG00000000003,HT1080,0.866395
...,...,...,...
713848,ENSG00000283013,K562,-0.451526
713854,ENSG00000283013,MCF.7,-1.167910
713982,ENSG00000283039,H1.hESC,-1.064130
714302,ENSG00000283093,H1.hESC,1.250540


In [9]:
# Summary statistics for the numeric column (RCI) of the coding-gene table.
coding.describe()

Unnamed: 0,RCI
count,169966.0
mean,-0.145776
std,1.447677
min,-8.93799
25%,-1.0
50%,-0.028581
75%,0.837833
max,10.5541


In [10]:
# Display the noncoding-gene table (columns gene_id, cell_line, RCI).
noncoding

Unnamed: 0,gene_id,cell_line,RCI
31474,ENSG00000082929,MCF.7,1.23491
39740,ENSG00000093100,A549,-3.15798
39741,ENSG00000093100,GM12878,-2.00351
39742,ENSG00000093100,H1.hESC,-2.21820
39743,ENSG00000093100,HeLa.S3,-2.05661
...,...,...,...
714484,ENSG00000283122,HepG2,-2.58496
714485,ENSG00000283122,HT1080,-1.48543
714487,ENSG00000283122,IMR.90,-3.30581
714494,ENSG00000283122,MCF.7,-3.54432


In [11]:
# Summary statistics for the numeric column (RCI) of the noncoding-gene table.
noncoding.describe()

Unnamed: 0,RCI
count,28217.0
mean,-1.103468
std,1.92694
min,-10.255
25%,-2.32193
50%,-0.929734
75%,0.263034
max,5.58139


In [12]:
# Write both tables next to the input data, without the DataFrame index.
for frame, filename in (
    (coding, 'quantified_coding_genes.csv'),
    (noncoding, 'quantified_noncoding_genes.csv'),
):
    frame.to_csv(ATLAS_DIR + filename, index=False)

In [13]:
# Marker showing that every cell above ran to completion.
print('done')

done
