# Train Test Split
Version 2 ran Oct 28.

Choose 20% of genes to be the unseen test set.   
For each test gene, remove all its transcripts from the training set.  

The task is simplified by our upstream data processing.   
In ProcessGenCode, we moved sequences to csv files and filtered:

* Retained only those genes for which there is at least one RCI value in LncAtlas.
* Retained only those non-coding transcripts with gene_type=transcript_type=lncRNA.
* Retained only those coding transcripts with gene_type=transcript_type=protein_coding.

In [1]:
from random import Random
from datetime import datetime
print(datetime.now())

TEST_PORTION = 0.2
# Inputs (generated by ProcessGenCode notebook)
GENCODE_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/'
CODING_ALL = 'gencode.v42.lncRNA_transcripts.csv'
NONCODING_ALL = 'gencode.v42.pc_transcripts.csv'
# Outputs
ATLAS_DIR = '/Users/jasonmiller/WVU/Localization/LncAtlas/'
CODING_TEST = 'CNRCI_coding_test_genes.gc42.csv'
CODING_TRAIN = 'CNRCI_coding_train_genes.gc42.csv'
NONCODING_TEST = 'CNRCI_noncoding_test_genes.gc42.csv'
NONCODING_TRAIN = 'CNRCI_noncoding_train_genes.gc42.csv'

2022-10-28 07:40:02.565984


In [2]:
def load_available_genes(filepath):
    '''
    Load IDs of the genes for which we have sequence.
    The long RNA strings preclude the use of csv.reader utility.
    Expect csv file with this header line:
    transcript_id,gene_id,biotype,length,sequence
    '''
    gene_set=set()
    with open (filepath) as handle:
        header = None
        for line in handle:
            if header is None:
                header = line
            else:
                line = line.strip()
                fields = line.split(',')
                transcript_id = fields[0]
                gene_id       = fields[1]
                biotype       = fields[2]
                length        = int(fields[3])
                sequence      = fields[4]
                if length != len(sequence):
                    print(line)
                    raise Exception('Lengths do not match')
                gene_set.add(gene_id)
    gene_list = sorted(list(gene_set))
    return gene_list

In [3]:
def inplace_shuffle(rows):
    generator = Random()
    generator.seed(42)
    generator.shuffle(rows)  # in-place

In [4]:
def train_test_split(rows):
    length = len(rows)
    divider = int(length*TEST_PORTION)
    train_set = rows[divider:]
    test_set = rows[:divider]
    return (train_set,test_set)

In [5]:
def save_csv(rows,filepath):
    with open(filepath,'w') as handle:
        header = 'gene_id'
        handle.write(header)
        for line in rows:
            handle.write(line)
            handle.write('\n')

In [6]:
def assert_exclusivity(list1,list2):
    set1=set(list1)
    set2=set(list2)
    if len(set1)!=len(list1) or len(set2)!=len(list2):
        raise Exception('Lists contained duplicates')
    intersection = set1 & set2
    if len(intersection)!=0:
        raise Exception('Lists are not exclusive')

## Processing

In [7]:
print(datetime.now())
coding_genes    = load_available_genes(GENCODE_DIR+CODING_ALL)
noncoding_genes = load_available_genes(GENCODE_DIR+NONCODING_ALL)

print('First few coding genes:')
print(coding_genes[:5])
print('First few noncoding genes:')
print(noncoding_genes[:5])
print(datetime.now())

2022-10-28 07:40:02.688307
First few coding genes:
['ENSG00000082929', 'ENSG00000099869', 'ENSG00000105501', 'ENSG00000115934', 'ENSG00000116652']
First few noncoding genes:
['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460']
2022-10-28 07:40:03.425540


In [8]:
print(datetime.now())
inplace_shuffle(coding_genes)
inplace_shuffle(noncoding_genes)

print('First few coding genes:')
print(coding_genes[:5])
print('First few noncoding genes:')
print(noncoding_genes[:5])
print(datetime.now())

2022-10-28 07:40:03.434703
First few coding genes:
['ENSG00000249476', 'ENSG00000249267', 'ENSG00000248309', 'ENSG00000261754', 'ENSG00000248980']
First few noncoding genes:
['ENSG00000163412', 'ENSG00000196972', 'ENSG00000230601', 'ENSG00000090615', 'ENSG00000070371']
2022-10-28 07:40:03.464884


In [9]:
print(datetime.now())
coding_train_set,   coding_test_set    = train_test_split(coding_genes)
noncoding_train_set,noncoding_test_set = train_test_split(noncoding_genes)

print('First few coding train genes:')
print(coding_train_set[:5])
print('First few coding test genes:')
print(coding_test_set[:5])
print('First few noncoding train genes:')
print(noncoding_train_set[:5])
print('First few noncoding test genes:')
print(noncoding_test_set[:5])
print(datetime.now())

2022-10-28 07:40:03.483296
First few coding train genes:
['ENSG00000227165', 'ENSG00000277662', 'ENSG00000265458', 'ENSG00000282961', 'ENSG00000228791']
First few coding test genes:
['ENSG00000249476', 'ENSG00000249267', 'ENSG00000248309', 'ENSG00000261754', 'ENSG00000248980']
First few noncoding train genes:
['ENSG00000215251', 'ENSG00000100926', 'ENSG00000117600', 'ENSG00000156502', 'ENSG00000241563']
First few noncoding test genes:
['ENSG00000163412', 'ENSG00000196972', 'ENSG00000230601', 'ENSG00000090615', 'ENSG00000070371']
2022-10-28 07:40:03.486367


In [10]:
print(datetime.now())
coding_train_sort    = sorted(coding_train_set)
coding_test_sort     = sorted(coding_test_set)
noncoding_train_sort = sorted(noncoding_train_set)
noncoding_test_sort  = sorted(noncoding_test_set)
coding_train_set     = None
coding_test_set      = None
noncoding_train_set  = None
noncoding_test_set   = None

print('First few coding train genes:')
print(coding_train_sort[:5])
print('First few coding test genes:')
print(coding_test_sort[:5])
print('First few noncoding train genes:')
print(noncoding_train_sort[:5])
print('First few noncoding test genes:')
print(noncoding_test_sort[:5])
print(datetime.now())

2022-10-28 07:40:03.511259
First few coding train genes:
['ENSG00000082929', 'ENSG00000105501', 'ENSG00000115934', 'ENSG00000116652', 'ENSG00000117242']
First few coding test genes:
['ENSG00000099869', 'ENSG00000120664', 'ENSG00000128254', 'ENSG00000136275', 'ENSG00000146521']
First few noncoding train genes:
['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460']
First few noncoding test genes:
['ENSG00000000938', 'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001460', 'ENSG00000001626']
2022-10-28 07:40:03.528878


In [11]:
print(datetime.now())
filename = ATLAS_DIR + CODING_TEST
print('Save to',filename)
save_csv( coding_test_sort, filename)

filename = ATLAS_DIR + CODING_TRAIN
print('Save to',filename)
save_csv( coding_train_sort, filename)

filename = ATLAS_DIR + NONCODING_TEST
print('Save to',filename)
save_csv( noncoding_test_sort, filename)

filename = ATLAS_DIR + NONCODING_TRAIN
print('Save to',filename)
save_csv( noncoding_train_sort, filename)

2022-10-28 07:40:03.546556
Save to /Users/jasonmiller/WVU/Localization/LncAtlas/CNRCI_coding_test_genes.gc42.csv
Save to /Users/jasonmiller/WVU/Localization/LncAtlas/CNRCI_coding_train_genes.gc42.csv
Save to /Users/jasonmiller/WVU/Localization/LncAtlas/CNRCI_noncoding_test_genes.gc42.csv
Save to /Users/jasonmiller/WVU/Localization/LncAtlas/CNRCI_noncoding_train_genes.gc42.csv


In [12]:
print(datetime.now())
print('   Coding total, train, test:',
      len(coding_genes), len(coding_train_sort), len(coding_test_sort))
print('Noncoding total, filtered, train, test:',
      len(noncoding_genes), len(noncoding_train_sort), len(noncoding_test_sort))


2022-10-28 07:40:03.579187
   Coding total, train, test: 6270 5016 1254
Noncoding total, filtered, train, test: 17587 14070 3517


In [13]:
print(datetime.now())
assert_exclusivity(coding_genes,        noncoding_genes)
assert_exclusivity(coding_train_sort,   coding_test_sort)
assert_exclusivity(coding_train_sort,   noncoding_train_sort)
assert_exclusivity(coding_test_sort,    noncoding_test_sort)
assert_exclusivity(coding_test_sort,    noncoding_train_sort)
assert_exclusivity(coding_train_sort,   noncoding_test_sort)
assert_exclusivity(noncoding_train_sort,noncoding_test_sort)

2022-10-28 07:40:03.605119


In [14]:
print('done')
print(datetime.now())

done
2022-10-28 07:40:03.640012
