# Train Test Split

Choose 20% of genes to be the unseen test set.   
For each test gene, remove all its transcripts from the training set.  

The task is simplified by our upstream data processing.   
Instead of one csv row per (gene_id,cell_line), we have one row per (gene_id).  

Analyze the results for stratification.   
Not all genes are expressed in every cell line.   
We must avoid a test set with zero genes for some cell line.  

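This notebook reads lines of text, shuffles them, partitions them, and writes them back out.
The code ignores the CSV structure of each line, so a header line (if the input file has one) is shuffled and partitioned like any other row.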

In [1]:
from random import Random

# Fraction of the rows assigned to the held-out test set; the rest is training.
TEST_PORTION = 0.2
# Input directory and file names for the LncAtlas gene tables.
# The directory string ends with '/' because paths are built by plain
# string concatenation (directory + file name).
# NOTE(review): absolute, machine-specific path -- must be edited before
# running this notebook on another machine.
ATLAS_DIR='/Users/jasonmiller/WVU/Localization/LncAtlas/'
ATLAS_CODING='CNRCI_coding_genes.csv'
ATLAS_NONCODING='CNRCI_noncoding_genes.csv'
# GenCode locations. These names are not referenced by any code visible in
# this notebook; presumably kept for parity with sibling notebooks -- confirm.
GENCODE_DIR='/Users/jasonmiller/WVU/Localization/GenCode/'
GENCODE_CODING='Homo_sapiens.GRCh38.cds.csv'
GENCODE_NONCODING='Homo_sapiens.GRCh38.ncrna.csv'

# Output file names for the four partitions (written into ATLAS_DIR).
CODING_TEST = 'CNRCI_coding_test_genes.csv'
CODING_TRAIN = 'CNRCI_coding_train_genes.csv'
NONCODING_TEST = 'CNRCI_noncoding_test_genes.csv'
NONCODING_TRAIN = 'CNRCI_noncoding_train_genes.csv'

In [2]:
#def load_all_rows(filepath):
#    all_rows=[]
#    with open (filepath,'r') as handle:
#        r = reader(handle)
#        headers = next(r,None)
#        print('File headers:',headers)
#        for line in r:
#            all_rows.append(line)
#    return all_rows
def load_all_rows(filepath):
    """Return every line of the text file at ``filepath`` as a list of strings.

    The file is read as plain text: no CSV parsing is performed and each
    element keeps its line terminator.

    NOTE(review): the first line is returned like any other line, so a
    header row (if the file has one) is not separated from the data --
    confirm that callers expect this.
    """
    with open(filepath, 'r') as source:
        return list(source)

In [3]:
# Read both LncAtlas gene files fully into memory as lists of raw text lines.
# Paths are formed by concatenating the directory and file-name constants.
all_coding_rows = load_all_rows(ATLAS_DIR+ATLAS_CODING)
all_noncoding_rows = load_all_rows(ATLAS_DIR+ATLAS_NONCODING)

In [4]:
def inplace_shuffle(rows):
    """Reorder ``rows`` in place using a fixed-seed random generator.

    A new ``Random`` seeded with 42 is created on every call, so the
    result is reproducible from run to run. Nothing is returned; the
    caller's list object itself is modified.
    """
    Random(42).shuffle(rows)  # mutates the argument; returns None
# Shuffle BOTH gene lists, each exactly once. The split takes the head of
# each list as its test set, so a list left in file order would yield a
# non-random test set.
inplace_shuffle(all_coding_rows)
inplace_shuffle(all_noncoding_rows)

In [5]:
def train_test_split(rows):
    """Partition ``rows`` into ``(train_set, test_set)`` by position.

    The first ``int(len(rows) * TEST_PORTION)`` elements become the test
    set and the remainder the training set. No shuffling happens here, so
    the caller is responsible for randomizing ``rows`` beforehand. Both
    results are new lists (slices); ``rows`` itself is left unmodified.
    """
    cut = int(TEST_PORTION * len(rows))  # truncates toward zero
    return (rows[cut:], rows[:cut])
# Split each gene list independently; train_test_split returns (train, test).
coding_train_set,coding_test_set = train_test_split(all_coding_rows)
noncoding_train_set,noncoding_test_set = train_test_split(all_noncoding_rows)

In [6]:
def save_csv(rows,filepath):
    """Write the strings in ``rows`` to ``filepath``, replacing any existing file.

    Each string is written verbatim, in order, with no separator or line
    terminator added, so every element must already end with its own
    newline if one is wanted.
    """
    with open(filepath, 'w') as sink:
        sink.writelines(rows)

In [7]:
# Write the four partitions into ATLAS_DIR, one file per (gene type, split).
# Each output file name must be paired with the matching row list.
filename = ATLAS_DIR + CODING_TEST
save_csv(coding_test_set,filename)

filename = ATLAS_DIR + CODING_TRAIN
save_csv(coding_train_set,filename)

filename = ATLAS_DIR + NONCODING_TEST
save_csv(noncoding_test_set,filename)

# The noncoding training file receives the noncoding training rows.
filename = ATLAS_DIR + NONCODING_TRAIN
save_csv(noncoding_train_set,filename)

In [8]:
# Completion marker: in a top-to-bottom run, reaching this cell means the
# preceding cells finished without raising.
print('done')

done


In [9]:
# Report row counts per gene type: total, training rows, test rows.
# The leading spaces in the first label align it with the 'Noncoding' label.
print('   Coding total, train, test:',
      len(all_coding_rows),len(coding_train_set),len(coding_test_set))
print('Noncoding total, train, test:',
      len(all_noncoding_rows),len(noncoding_train_set),len(noncoding_test_set))


   Coding total, train, test: 17771 14217 3554
Noncoding total, train, test: 6769 5416 1353
