In [1]:
import numpy as np
# Directory holding every input and output file of this notebook.
# The trailing slash is required: file paths below are built by plain string
# concatenation (DATA_DIR + 'name.csv').
# NOTE(review): this is an absolute path on one machine; change it before
# running elsewhere.
DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'

In [2]:
def get_sequenced_genes(filename):
    '''
    Collect the distinct gene IDs listed in a comma-separated file.

    The first line of the file is treated as a header and skipped.
    On every following non-blank line, the second comma-separated
    field is taken as the gene ID; the first field is ignored.

    filename: path of the CSV file to read
    returns: set of gene ID strings (empty if the file has no data lines)
    raises: IndexError if a non-blank data line has fewer than two fields
    '''
    genes = set()
    with open(filename, 'r') as fin:
        # Discard the header; the default keeps an empty file from raising.
        next(fin, None)
        for line in fin:
            line = line.strip()
            if not line:
                # A blank (for example trailing) line carries no gene field;
                # skip it rather than fail with IndexError on fields[1].
                continue
            fields = line.split(',')
            genes.add(fields[1])
    return genes

In [3]:
def gene_numbers(line):
    '''
    Split one comma-separated record into its gene ID and numeric values.

    line: text of the form "gene_id,v1,v2,..." with no trailing newline
    returns: (gene_id, numbers) where numbers is the list of float values,
             leaving out every field equal to 'nan' and always leaving out
             the second value field
    raises: IndexError if fewer than two value fields follow the gene ID
    '''
    gene_id, *values = line.split(',')
    # Force the second value column to be ignored
    # (the original code labels this column H1.hESC).
    values[1] = 'nan'
    numbers = []
    for text in values:
        if text == 'nan':
            continue
        numbers.append(float(text))
    return gene_id, numbers

In [4]:
def load_gene_averages(filename):
    '''
    Read a per-gene value table and return the mean value of each gene.

    The first line of the file is treated as a header and skipped.
    Each following non-blank line is parsed with gene_numbers(); a gene
    is kept only if it has at least one usable number and the mean of
    those numbers is not NaN.

    filename: path of the CSV file to read
    returns: list of (gene_id, mean_value) tuples in file order
    '''
    genes = []
    with open(filename, 'r') as fin:
        # Discard the header; the default keeps an empty file from raising.
        next(fin, None)
        for line in fin:
            line = line.strip()
            if not line:
                # A blank (for example trailing) line would make
                # gene_numbers() fail with IndexError; skip it.
                continue
            gene_id, numbers = gene_numbers(line)
            if not numbers:
                # No usable values for this gene, so there is nothing to average.
                continue
            avg_rci = np.mean(numbers)
            # A value field such as ' nan' passes the text filter in
            # gene_numbers() but still parses to NaN; drop the gene then.
            if not np.isnan(avg_rci):
                genes.append((gene_id, avg_rci))
    return genes

In [5]:
def sort_genes(list_of_tuple):
    '''
    Return a new list of the given tuples ordered by their second element,
    smallest first. The input is not modified; ties keep their input order.
    '''
    ordered = list(list_of_tuple)
    ordered.sort(key=lambda pair: pair[1])
    return ordered

In [6]:
def filter_genes(data,good_genes):
    '''
    Keep only the records whose first element is a member of good_genes.

    data: iterable of indexable records (gene ID at position 0)
    good_genes: container supporting the `in` test
    returns: list of the retained records, in their original order
    '''
    keepers = []
    for record in data:
        if record[0] in good_genes:
            keepers.append(record)
    return keepers

In [7]:
MODULUS = 5        # the input is consumed in consecutive groups of this size
MIDDLE_INDEX = 2   # position within each group that is written to the test file
def train_test_split(genes,train,test):
    '''
    Write (gene_id, mean_rci) pairs to a train file and a test file.

    Walking the input in order, the item at position MIDDLE_INDEX of every
    run of MODULUS items goes to the test file; all other items go to the
    train file. Each item is written as one "gene_id,mean_rci" line.

    genes: iterable of (gene_id, mean_rci) pairs
    train, test: output filenames (both are overwritten)
    '''
    with open(train, 'w') as train_out, open(test, 'w') as test_out:
        for position, (gene_id, mean_rci) in enumerate(genes):
            in_test = position % MODULUS == MIDDLE_INDEX
            destination = test_out if in_test else train_out
            print(f"{gene_id},{str(mean_rci)}", file=destination)

In [8]:
def file_average(filename):
    '''
    Print summary statistics for a two-column "gene_id,value" file.

    Every line must contain exactly two comma-separated fields (there is
    no header). Two lines are printed: the filename with the number of
    rows, then the mean and standard deviation of the value column.
    Nothing is returned.
    '''
    rci_values = []
    with open(filename, 'r') as handle:
        for record in handle:
            # Unpacking enforces exactly two fields per line.
            _, rci_text = record.strip().split(',')
            rci_values.append(float(rci_text))
    print(filename, len(rci_values))
    print(np.mean(rci_values), np.std(rci_values))

In [9]:
def process_all():
    '''
    Run the whole pipeline for the currently configured files.

    Takes no arguments: the paths come from the module-level names
    seq_file, rci_file, train_file and test_file, which must be assigned
    before calling. Steps: load the set of sequenced gene IDs, load the
    per-gene averages, keep only genes that are in the sequenced set,
    sort by average, then write the train and test files. A count is
    printed after each step.
    '''
    with_sequence = get_sequenced_genes(seq_file)
    print('sequenced genes:', len(with_sequence))

    averaged = load_gene_averages(rci_file)
    print('gene averages:', len(averaged))

    kept = filter_genes(averaged, with_sequence)
    print('filtered:', len(kept))

    ranked = sort_genes(kept)
    print('sorted:', len(ranked))
    print('First five sorted:', ranked[:5])

    # Sorting first means the every-MODULUS-th pick in train_test_split
    # draws test genes evenly across the range of averages.
    train_test_split(ranked, train_file, test_file)

## lncRNA

In [10]:
# Input files, read by process_all() through these module-level names.
rci_file = DATA_DIR+'all.lncRNA_RCI.csv'
seq_file = DATA_DIR+'all.canon.lncRNA.csv' # gencode canonical - make sure we have sequence
# Output files, overwritten by process_all() and then read by file_average().
train_file = DATA_DIR+'train.lncRNA_RCI.csv'
test_file =  DATA_DIR+'test.lncRNA_RCI.csv'

In [11]:
# Build the lncRNA train/test files from the paths currently assigned to
# rci_file, seq_file, train_file and test_file.
process_all()

sequenced genes: 6423
gene averages: 5465
filtered: 5465
sorted: 5465
First five sorted: [('ENSG00000229807', -8.639225), ('ENSG00000279289', -8.23002), ('ENSG00000279439', -7.54689), ('ENSG00000279166', -7.5025), ('ENSG00000174171', -7.17991)]


In [12]:
# Print row count, mean and standard deviation of each lncRNA output file
# so the two splits can be compared.
file_average(train_file)
file_average(test_file)

/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/train.lncRNA_RCI.csv 4372
-1.2731183624623068 1.856242200100356
/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/test.lncRNA_RCI.csv 1093
-1.272938599268234 1.8548936684759063


## mRNA

In [13]:
# Input files, read by process_all() through these module-level names.
# These assignments replace the values set for the lncRNA run.
rci_file = DATA_DIR+'all.pc_RCI.csv'
seq_file = DATA_DIR+'all.canon.pc.csv' # gencode canonical - make sure we have sequence
# Output files, overwritten by process_all() and then read by file_average().
train_file = DATA_DIR+'train.pc_RCI.csv'
test_file =  DATA_DIR+'test.pc_RCI.csv'

In [14]:
# Build the mRNA train/test files from the paths currently assigned to
# rci_file, seq_file, train_file and test_file.
process_all()

sequenced genes: 17668
gene averages: 17079
filtered: 17079
sorted: 17079
First five sorted: [('ENSG00000254995', -6.290376), ('ENSG00000169885', -6.2700175), ('ENSG00000255837', -5.998693333333333), ('ENSG00000179698', -5.992748571428571), ('ENSG00000271698', -5.950271428571428)]


In [15]:
# Print row count, mean and standard deviation of each mRNA output file
# so the two splits can be compared.
file_average(train_file)
file_average(test_file)

/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/train.pc_RCI.csv 13663
-0.23192734972412932 1.3475606256526398
/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/test.pc_RCI.csv 3416
-0.23132543092706284 1.348940976495518
