# Train Test Split

Choose 20% of genes to be the unseen test set.   
For each test gene, remove all its transcripts from the training set.  

The task is simplified by our upstream data processing.   
Instead of one csv row per (gene_id,cell_line), we have one row per (gene_id).  

Analyze the results for stratification.   
Not all genes are expressed in every cell line.   
We must avoid a test set with zero genes for some cell line.  

In [15]:
from csv import reader
from random import Random

# Fraction of rows that train_test_split() assigns to the test set.
TEST_PORTION = 0.2

# NOTE(review): the directories below are absolute, machine-specific paths;
# adjust them before running this notebook elsewhere.

# LncAtlas inputs: one CSV for coding genes, one for non-coding genes.
ATLAS_DIR = '/Users/jasonmiller/WVU/Localization/LncAtlas/'
ATLAS_CODING = 'CNRCI_coding_genes.csv'
ATLAS_NONCODING = 'CNRCI_noncoding_genes.csv'

# GenCode inputs: one CSV per file name below.
GENCODE_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/'
GENCODE_CODING = 'Homo_sapiens.GRCh38.cds.csv'
GENCODE_NONCODING = 'Homo_sapiens.GRCh38.ncrna.csv'

In [9]:
def load_all_rows(filepath):
    """Read a CSV file and return its data rows, skipping the header row.

    The first row is treated as the header: it is printed (prefixed with
    'File headers:') and not returned. For an empty file the printed header
    is None and the result is an empty list.

    Args:
        filepath: path of the CSV file to read.

    Returns:
        A list with one entry per data row; each entry is the list of
        string fields produced by csv.reader.
    """
    # newline='' hands line-ending handling to the csv module, so line
    # breaks embedded in quoted fields come through unchanged.
    with open(filepath, 'r', newline='') as handle:
        r = reader(handle)
        headers = next(r, None)
        print('File headers:', headers)
        return list(r)

In [10]:
# Load the data rows of both LncAtlas tables; each call prints its header row.
all_coding_rows = load_all_rows(f'{ATLAS_DIR}{ATLAS_CODING}')
all_noncoding_rows = load_all_rows(f'{ATLAS_DIR}{ATLAS_NONCODING}')

File headers: ['gene_id', 'A549', 'H1.hESC', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']
File headers: ['gene_id', 'A549', 'H1.hESC', 'HeLa.S3', 'HepG2', 'HT1080', 'HUVEC', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH', 'GM12878', 'K562', 'IMR.90']


In [14]:
def inplace_shuffle(rows):
    """Shuffle ``rows`` in place with a generator seeded to 42.

    A fresh generator is created on every call, so the permutation applied
    depends only on the length of ``rows``. Nothing is returned.
    """
    Random(42).shuffle(rows)  # mutates the caller's list
# Shuffle each table once so the slice-based split below draws from a random
# order. Both tables must be shuffled: the split takes the leading rows as the
# test set, so an unshuffled table would be split in file order.
# Note: the shuffle is in place, so re-running this cell shuffles again.
inplace_shuffle(all_coding_rows)
inplace_shuffle(all_noncoding_rows)

[['ENSG00000271447', '0.127354', '1.0', 'nan', 'nan', 'nan', '0.775065', '0.263034', 'nan', '0.3856', 'nan', 'nan', '0.517848', 'nan', 'nan', 'nan'], ['ENSG00000160181', 'nan', 'nan', '1.34792', 'nan', 'nan', 'nan', '2.41504', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'], ['ENSG00000169871', '-0.94296', '-1.47213', '-1.94407', '-2.22531', '-0.735123', '-1.07007', '-0.700933', '-1.10321', '-1.94916', '0.0279369', '0.970047', '-1.27965', '-0.993582', 'nan', '-1.19281'], ['ENSG00000168826', '-0.370687', '-0.0471651', '-0.613863', '-1.18634', '-0.152703', '-0.771731', '-1.26649', '-0.851999', '-1.78061', 'nan', 'nan', '-1.44322', '-0.408392', '-0.770518', '-1.73548'], ['ENSG00000117602', 'nan', '0.451492', 'nan', '1.15609', '1.26212', '0.648738', '0.193042', '1.07091', '0.745835', 'nan', 'nan', '0.533392', 'nan', '-0.520832', 'nan']]
[['ENSG00000148824', '-1.25525', '-0.11483', '-1.34377', '-2.94074', '-0.889159', '-0.93743', '-1.95416', '-1.24373', '-1.4205', '-2.45207', '-0.96

In [16]:
def train_test_split(rows):
    """Cut ``rows`` into a train part and a test part by position.

    The first int(len(rows) * TEST_PORTION) rows become the test set and the
    remaining rows become the train set. No shuffling happens here; the
    order of ``rows`` decides which rows land in which part. The total,
    train and test counts are printed.

    Args:
        rows: sequence of rows to split.

    Returns:
        A (train_set, test_set) tuple of slices of ``rows``.
    """
    n_test = int(len(rows) * TEST_PORTION)
    test_set, train_set = rows[:n_test], rows[n_test:]
    print('Total, train, test:', len(rows), len(train_set), len(test_set))
    return (train_set, test_set)
# Split each table separately; every call prints its total/train/test counts.
coding_train_set, coding_test_set = train_test_split(all_coding_rows)
noncoding_train_set, noncoding_test_set = train_test_split(all_noncoding_rows)

Total, train, test: 17770 14216 3554
Total, train, test: 6768 5415 1353
