# LncATLAS
Publication: LncATLAS database for subcellular localization of long noncoding RNAs (2017) David Mas-Ponte, Joana Carlevaro-Fita, Emilio Palumbo, Toni Hermoso Pulido, Roderic Guigo, and Rory Johnson. RNA 23:1080–1087

Publication [online](https://rnajournal.cshlp.org/content/23/7/1080)

This notebook keeps only the genes that have a cytoplasm:nuclear RCI (CNRCI) value in at least one cell line.  
It transposes the data from one row per gene and cell line to one row per gene.  

Some genes were expressed in only one cell line:  

    ENSG00000166917,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,-0.293196,nan  
    ENSG00000167046,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,-1.8365,nan,nan  
    ENSG00000167117,nan,nan,nan,1.58496,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan  

This notebook runs after TrainTestSplit4.

This notebook writes four files of RCI values, one per gene set:
coding train, coding test, noncoding train, and noncoding test.

In [1]:
from datetime import datetime
print(datetime.now())
from cell_lines import Cell_Lines

# Raw lncATLAS download (input).
# Directory names end with '/' because paths are built by plain string
# concatenation of directory + file name.
ATLAS_DIR = '/Users/jasonmiller/WVU/Localization/LncAtlas/'
ATLAS_DATA = 'lncATLAS_all_data_RCI.csv'

# Train/test split directory: the gene lists are read from here and the
# RCI files are written back here.
SPLIT_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'

# Gene lists produced by the train/test split (input).
CODING_TEST_INFILE = 'test.canon_pc_transcripts.csv'
NONCODING_TEST_INFILE = 'test.canon_lncRNA_transcripts.csv'
CODING_TRAIN_INFILE = 'train.canon_pc_transcripts.csv'
NONCODING_TRAIN_INFILE = 'train.canon_lncRNA_transcripts.csv'

# Per-gene RCI tables, one per gene list (output).
CODING_TEST_OUTFILE = 'test.pc_RCI.csv'
NONCODING_TEST_OUTFILE = 'test.lncRNA_RCI.csv'
CODING_TRAIN_OUTFILE = 'train.pc_RCI.csv'
NONCODING_TRAIN_OUTFILE = 'train.lncRNA_RCI.csv'

2023-03-03 17:28:44.130828


## Raw data

In [2]:
def get_atlas_data(filepath):
    '''
    Load cytoplasm:nuclear RCI values from the lncATLAS file, one row per gene.

    The input is comma-separated with one header line followed by one line
    per (gene, cell line, data type, value). Only lines whose data type is
    'CNRCI' and whose value is not 'NA' are kept. Blank lines are skipped.

    Parameters:
        filepath: path to the lncATLAS CSV file.

    Returns:
        dict mapping gene_id to a list of Cell_Lines.get_count() floats.
        Each value is stored at the index that Cell_Lines.get_mapping()
        gives for its cell line; positions with no kept value hold nan.
        Genes with no kept value do not appear in the dict. If a gene and
        cell line pair occurs more than once, the last value wins.

    Raises:
        IndexError: a non-blank data line has fewer than four fields.
        ValueError: a kept value cannot be parsed as a float.
        A lookup error (KeyError if the mapping is a dict) when a kept line
        names a cell line that is absent from the mapping.
    '''
    num_cell_lines = Cell_Lines.get_count()
    cell_line_map = Cell_Lines.get_mapping()
    gene_to_data = {}
    with open(filepath, 'r') as handle:
        # Discard the header line; the default keeps an empty file from raising.
        next(handle, None)
        for row in handle:
            row = row.strip()
            if not row:
                # A blank line (often the last one in a file) carries no data;
                # without this guard fields[1] would raise IndexError.
                continue
            fields = row.split(',')
            gene_id = fields[0]
            cell_line = fields[1]
            data_type = fields[2]
            value = fields[3]
            if data_type != 'CNRCI' or value == 'NA':
                continue
            # Parse and look up before touching the dict so a bad line
            # cannot leave behind an all-nan row for its gene.
            rci = float(value)
            cell_line_index = cell_line_map[cell_line]
            if gene_id not in gene_to_data:
                gene_to_data[gene_id] = [float('nan')] * num_cell_lines
            gene_to_data[gene_id][cell_line_index] = rci
    return gene_to_data

In [3]:
# Parse the raw lncATLAS file once; atlas_data is the gene-to-RCI mapping
# returned by get_atlas_data.
atlas_data = get_atlas_data(ATLAS_DIR+ATLAS_DATA)
# Print a timestamp once the load has finished.
print(datetime.now())

2023-03-03 17:28:44.921890


In [4]:
def load_ids(geneset_filepath):
    '''
    Collect the distinct gene IDs listed in a transcript CSV.

    Expect CSV with header: transcript_id,gene_id,biotype,length,sequence
    The first line is always treated as the header and discarded.
    Blank lines are skipped.

    Parameters:
        geneset_filepath: path to the comma-separated transcript file.

    Returns:
        set of the values found in the second column (gene_id).

    Raises:
        IndexError: a non-blank data line has fewer than two fields.
    '''
    gene_set = set()
    with open(geneset_filepath, 'r') as handle:
        # Discard the header line; the default keeps an empty file from raising.
        next(handle, None)
        for row in handle:
            row = row.strip()
            if not row:
                # A blank line (often the last one in a file) has no gene_id
                # column; without this guard fields[1] would raise IndexError.
                continue
            fields = row.split(',')
            gene_set.add(fields[1])
    return gene_set
def subset(dict_gene_to_rci,gene_list):
    '''
    Return the entries of dict_gene_to_rci whose key is in gene_list.

    The result is a new dict in the same key order as the input. Values
    are not copied: each one is the same object held by the input dict.
    gene_list may be any container supporting the "in" test.
    '''
    return {
        gene: values
        for gene, values in dict_gene_to_rci.items()
        if gene in gene_list
    }

In [5]:
def save_to_csv(gene_to_RCI_map,fn):
    '''
    Write a gene-to-RCI mapping to fn as comma-separated text.

    The first line is 'gene_id' followed by the names returned by
    Cell_Lines.get_ordered_list(). Each later line is one gene: its ID
    followed by str() of every value in its list, in mapping order.
    The file at fn is overwritten.
    '''
    header_line = 'gene_id,' + ','.join(Cell_Lines.get_ordered_list()) + '\n'
    with open(fn, 'w') as out:
        out.write(header_line)
        for gene_id, values in gene_to_RCI_map.items():
            cells = [gene_id]
            cells.extend(str(v) for v in values)
            out.write(','.join(cells) + '\n')

In [6]:
# For each of the four gene lists from the train/test split: load its gene
# IDs, keep only the atlas_data entries for those genes, and write the result
# to the matching output file in SPLIT_DIR.
print('split')
# Coding genes, training set.
coding_train_genes = load_ids(SPLIT_DIR+CODING_TRAIN_INFILE)
coding_train_RCI   = subset(atlas_data, coding_train_genes)
save_to_csv(coding_train_RCI,    SPLIT_DIR+CODING_TRAIN_OUTFILE)

# Coding genes, test set.
coding_test_genes = load_ids(SPLIT_DIR+CODING_TEST_INFILE)
coding_test_RCI   = subset(atlas_data, coding_test_genes)
save_to_csv(coding_test_RCI,     SPLIT_DIR+CODING_TEST_OUTFILE)

# Noncoding genes, training set.
noncoding_train_genes = load_ids(SPLIT_DIR+NONCODING_TRAIN_INFILE)
noncoding_train_RCI   = subset(atlas_data, noncoding_train_genes)
save_to_csv(noncoding_train_RCI, SPLIT_DIR+NONCODING_TRAIN_OUTFILE)

# Noncoding genes, test set.
noncoding_test_genes = load_ids(SPLIT_DIR+NONCODING_TEST_INFILE)
noncoding_test_RCI   = subset(atlas_data, noncoding_test_genes)
save_to_csv(noncoding_test_RCI,  SPLIT_DIR+NONCODING_TEST_OUTFILE)

# Print a timestamp once all four files have been written.
print(datetime.now())

split
2023-03-03 17:28:45.417869
