# Train/Test Split
Read the CNRCI data for lncRNA.  
Include only cell line H1.hESC.  
Because only H1.hESC is kept, each gene's mean CNRCI is simply its H1.hESC value; genes whose H1.hESC value is nan are dropped.   
Sort the genes by mean CNRCI.   
To ensure the test set is distributed like the train set, the 2nd of every 5 sorted genes moves to test.   
Make an 80%/20% partition into train/test subsets.    
Run on lncRNA only.

In [1]:
import numpy as np
from datetime import datetime
# Log the wall-clock time at which the notebook run starts.
print(datetime.now())

2023-05-01 15:32:00.996586


In [2]:
# Input: transcript sequence files (generated by ProcessGenCode notebook).
# Three selections per gene: all transcripts, longest transcript, canonical transcript.
GENCODE_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/GenCode43/'
NONCODING_ALL = 'gencode_v43.all_lncRNA_transcripts.csv'
NONCODING_LONG = 'gencode_v43.longest_lncRNA_transcripts.csv'
NONCODING_CANON = 'gencode_v43.canon_lncRNA_transcripts.csv'
# Output: directory and file names for the train/test sequence subsets,
# one train file and one test file per input selection above.
DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'
NONCODING_ALL_TEST = 'test.all_H1_transcripts.csv'
NONCODING_LONG_TEST = 'test.longest_H1_transcripts.csv'
NONCODING_CANON_TEST = 'test.canon_H1_transcripts.csv'
NONCODING_ALL_TRAIN = 'train.all_H1_transcripts.csv'
NONCODING_LONG_TRAIN = 'train.longest_H1_transcripts.csv'
NONCODING_CANON_TRAIN = 'train.canon_H1_transcripts.csv'

In [3]:
def load_sequence_data(filepath):
    '''
    Load transcript sequences.
    Expect csv file with this header line:
    transcript_id,gene_id,biotype,length,sequence

    The first line is always treated as the header and skipped.
    Blank lines (for example a trailing empty line) are ignored.

    Returns a list with one entry per data row; each entry is the
    list of string fields obtained by splitting the row on commas.

    Raises ValueError if the declared length of a row does not equal
    the length of its sequence (or if the length field is not an int).
    '''
    sequence_data = []
    with open(filepath) as handle:
        next(handle, None)  # skip the header line; no-op on an empty file
        # Data rows start on line 2 of the file; the number is only used
        # to make the error message actionable.
        for line_number, line in enumerate(handle, start=2):
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            length = int(fields[3])
            sequence = fields[4]
            if length != len(sequence):
                # Report identifiers rather than echoing the whole row,
                # because the sequence field can be very long.
                raise ValueError(
                    'Lengths do not match: '
                    f'{filepath} line {line_number}, transcript {fields[0]}, '
                    f'declared {length}, actual {len(sequence)}'
                )
            sequence_data.append(fields)
    return sequence_data

In [4]:
def save_seq(gene_list,sequence_data,filepath):
    '''
    Write a train set or test set given the gene list.

    gene_list: iterable of gene ids whose transcripts should be kept.
    sequence_data: iterable of rows, each a list of string fields with
        the gene id at index 1 (the layout named in the header below).
    filepath: output csv path; an existing file is overwritten.

    Rows are written in their input order, re-joined with commas,
    under a fixed header line.
    '''
    # Build the lookup set before opening the output, so a bad gene_list
    # fails without truncating an existing file.
    valid_ids = set(gene_list)
    with open(filepath,'w') as handle:
        handle.write('transcript_id,gene_id,biotype,length,sequence\n')
        for fields in sequence_data:
            if fields[1] in valid_ids:
                handle.write(','.join(fields) + '\n')

In [5]:
def get_sequenced_genes(filename):
    '''
    Return the set of gene ids found in a csv file.
    The first line is skipped as a header; on every other line
    the gene id is taken from the second comma-separated field.
    '''
    with open(filename, 'r') as fin:
        next(fin, None)  # discard the header line, if any
        return {row.strip().split(',')[1] for row in fin}

In [6]:
def gene_numbers(line):
    '''
    Split one csv row into its gene id and a list of RCI values.
    The gene id is the first field. Only the second of the remaining
    fields (the H1.hESC column) is read: the list holds that value as
    a float, or is empty when the field is the literal string 'nan'.
    '''
    gene_id, *cell_lines = line.split(',')
    h1 = cell_lines[1]    # H1.hESC
    return gene_id, ([] if h1 == 'nan' else [float(h1)])

In [7]:
def load_gene_averages(filename):
    '''
    Read a csv of per-gene RCI values and return a list of
    (gene_id, mean_rci) tuples in file order.
    The first line is skipped as a header. Each remaining line is
    parsed by gene_numbers(); genes that yield no numbers, or whose
    mean is nan, are left out.
    '''
    averages = []
    with open(filename, 'r') as fin:
        next(fin, None)  # discard the header line, if any
        for row in fin:
            gene_id, numbers = gene_numbers(row.strip())
            if len(numbers) == 0:
                continue
            avg_rci = np.mean(numbers)
            if np.isnan(avg_rci):
                continue
            averages.append((gene_id, avg_rci))
    return averages

In [8]:
def sort_genes(list_of_tuple):
    '''
    Return a new list of the given tuples ordered by their second
    element, ascending. The input is not modified; ties keep their
    original relative order.
    '''
    ordered = list(list_of_tuple)
    ordered.sort(key=lambda pair: pair[1])
    return ordered

In [9]:
def filter_genes(data,good_genes):
    '''
    Return the records of data, in order, whose first element
    is a member of good_genes.
    '''
    return list(filter(lambda record: record[0] in good_genes, data))

In [10]:
MODULUS = 5       # genes are dealt out in consecutive groups of this size
MIDDLE_INDEX = 1  # position within each group that goes to test; the rest go to train
def train_test_split(genes,train,test):
    '''
    Deal the genes, in the order given, into a train file and a test file.
    genes: iterable of (gene_id, mean_rci) tuples
    train,test: output filenames (overwritten)
    Each gene is written as one "gene_id,mean_rci" line. Within every run
    of MODULUS consecutive genes, the one at MIDDLE_INDEX goes to test.
    '''
    with open(train, 'w') as train_out, open(test, 'w') as test_out:
        for position, (gene_id, mean_rci) in enumerate(genes):
            goes_to_test = (position % MODULUS == MIDDLE_INDEX)
            destination = test_out if goes_to_test else train_out
            print(f"{gene_id},{str(mean_rci)}" , file=destination)

In [11]:
def file_average(filename):
    '''
    Print summary statistics for a "gene_id,rci" file (no header):
    first the filename and the number of lines, then the mean and
    standard deviation of the rci values.
    '''
    rci_values = []
    with open(filename, 'r') as fin:
        for row in fin:
            # Every line must hold exactly two comma-separated fields.
            _gene, rci_text = row.strip().split(',')
            rci_values.append(float(rci_text))
    print(filename, len(rci_values))
    print(np.mean(rci_values), np.std(rci_values))

In [12]:
def process_all():
    '''
    Build the train/test gene lists from the module-level file names
    seq_file, rci_file, train_file and test_file.
    Steps: collect genes that have sequence, load per-gene RCI values,
    keep only genes with sequence, sort by RCI, then split the sorted
    list into the train and test files. Counts are printed per step.
    '''
    have_sequence = get_sequenced_genes(seq_file)
    print('sequenced genes:', len(have_sequence))
    per_gene = load_gene_averages(rci_file)
    print('gene averages:', len(per_gene))
    per_gene = filter_genes(per_gene,have_sequence)
    print('filtered:', len(per_gene))
    ranked = sort_genes(per_gene)
    print('sorted:', len(ranked))
    # Show both extremes of the sorted list.
    print('First five sorted:',ranked[:5])
    print('Last five sorted:',ranked[-5:])
    train_test_split(ranked,train_file,test_file)

In [13]:
def reload_genes(filename):
    '''
    Return the set of gene ids (first comma-separated field of each
    line) in the file. No header line is expected. A gene id that
    appears more than once is reported on stdout with the filename.
    '''
    seen = set()
    with open(filename, 'r') as fin:
        for row in fin:
            gene_id = row.strip().split(',')[0]
            if gene_id not in seen:
                seen.add(gene_id)
                continue
            print('ERROR! Gene listed twice:',gene_id)
            print(filename)
    return seen

## lncRNA

In [14]:
# input: per-gene RCI table, and a sequence file used to keep only genes that have sequence
rci_file = DATA_DIR+'all.lncRNA_RCI.csv'
seq_file = DATA_DIR+'all.canon.lncRNA.csv' # gencode canonical - make sure we have sequence
# output: "gene_id,RCI" lists for the train and test subsets
train_file = DATA_DIR+'train.H1_RCI.csv'
test_file =  DATA_DIR+'test.H1_RCI.csv'

In [15]:
# Filter, sort and split the genes; writes train_file and test_file.
process_all()

sequenced genes: 6423
gene averages: 4669
filtered: 4669
sorted: 4669
First five sorted: [('ENSG00000239906', -6.47018), ('ENSG00000250135', -6.39803), ('ENSG00000272657', -5.87447), ('ENSG00000267436', -5.58496), ('ENSG00000232164', -5.55459)]
Last five sorted: [('ENSG00000245910', 4.36309), ('ENSG00000273061', 4.50711), ('ENSG00000196951', 4.54432), ('ENSG00000252690', 4.90689), ('ENSG00000275097', 5.58139)]


In [16]:
# Sanity check: print count, mean and std of RCI for each subset so they can be compared.
file_average(train_file)
file_average(test_file)

/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/train.H1_RCI.csv 3735
-0.45900134819812577 1.5491603068286695
/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/test.H1_RCI.csv 934
-0.46124182260171304 1.5521712350761632


In [17]:
train_set = reload_genes(train_file)
test_set =  reload_genes(test_file)
# The two subsets must not share any gene id; report the shared ids if they do.
overlap = train_set.intersection(test_set)
if overlap:
    print('ERROR: Train/test intersection is not empty')
    print(overlap)

In [18]:
# For each transcript selection: load the GenCode sequences once, then write
# the rows belonging to train genes and to test genes into separate files.
# Entries are (progress message, input file, train output, test output).
for label, source_name, train_name, test_name in (
    ('Write noncoding sequence, canonical transcripts',
     NONCODING_CANON, NONCODING_CANON_TRAIN, NONCODING_CANON_TEST),
    ('Write noncoding sequence, longest transcripts',
     NONCODING_LONG, NONCODING_LONG_TRAIN, NONCODING_LONG_TEST),
    ('Write noncoding sequence, all transcripts',
     NONCODING_ALL, NONCODING_ALL_TRAIN, NONCODING_ALL_TEST),
):
    print(label)
    sequences = load_sequence_data(GENCODE_DIR+source_name)
    save_seq(train_set,sequences,DATA_DIR+train_name)
    save_seq(test_set,sequences,DATA_DIR+test_name)

Write noncoding sequence, canonical transcripts
Write noncoding sequence, longest transcripts
Write noncoding sequence, all transcripts


In [19]:
# Mark completion and log the finish time.
print('Done')
print(datetime.now())

Done
2023-05-01 15:32:03.544879
