# Train Test Split
Using GenCode 43.

Choose 20% of genes to be the unseen test set.   
For each test gene, remove all its transcripts from the training set.  

The task is simplified by our upstream data processing.   
In ProcessGenCode, we moved sequences to csv files and filtered to
retain only those genes for which there is at least one RCI value in LncAtlas.
We also removed the version suffixes from the gene and transcript IDs.

In [1]:
from random import Random
from datetime import datetime
print(datetime.now())  # record the start time of this run

# Fraction of genes held out as the unseen test set.
TEST_PORTION = 0.2
# Input sequence files (generated by the ProcessGenCode notebook).
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
GENCODE_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/GenCode43/'
# Coding inputs: the "all", "longest" and "canon" transcript files.
CODING_ALL = 'gencode_v43.all_pc_transcripts.csv'
CODING_LONG = 'gencode_v43.longest_pc_transcripts.csv'
CODING_CANON = 'gencode_v43.canon_pc_transcripts.csv'
# Noncoding (lncRNA) inputs: the "all", "longest" and "canon" transcript files.
NONCODING_ALL = 'gencode_v43.all_lncRNA_transcripts.csv'
NONCODING_LONG = 'gencode_v43.longest_lncRNA_transcripts.csv'
NONCODING_CANON = 'gencode_v43.canon_lncRNA_transcripts.csv'
# Output sequence files. Every input file gets one test.* and one train.* counterpart.
# NOTE(review): absolute, user-specific path; the files are opened for writing
# directly, so this directory presumably has to exist already -- confirm.
TRAIN_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'
CODING_ALL_TEST = 'test.all_pc_transcripts.csv'
CODING_LONG_TEST = 'test.longest_pc_transcripts.csv'
CODING_CANON_TEST = 'test.canon_pc_transcripts.csv'
NONCODING_ALL_TEST = 'test.all_lncRNA_transcripts.csv'
NONCODING_LONG_TEST = 'test.longest_lncRNA_transcripts.csv'
NONCODING_CANON_TEST = 'test.canon_lncRNA_transcripts.csv'
CODING_ALL_TRAIN = 'train.all_pc_transcripts.csv'
CODING_LONG_TRAIN = 'train.longest_pc_transcripts.csv'
CODING_CANON_TRAIN = 'train.canon_pc_transcripts.csv'
NONCODING_ALL_TRAIN = 'train.all_lncRNA_transcripts.csv'
NONCODING_LONG_TRAIN = 'train.longest_lncRNA_transcripts.csv'
NONCODING_CANON_TRAIN = 'train.canon_lncRNA_transcripts.csv'

2023-03-02 18:21:47.009880


In [2]:
def load_sequence_data(filepath):
    '''
    Load transcript sequences from a csv file.

    Expect csv file with this header line:
    transcript_id,gene_id,biotype,length,sequence
    The first line is always treated as the header and skipped.
    Blank lines (for example a trailing empty line) are ignored.

    Parameters:
        filepath: path of the csv file to read.

    Returns:
        (gene_list, sequence_data) where gene_list is the sorted list of
        unique gene IDs and sequence_data is a list, in file order, of the
        comma-split string fields of every data row.

    Raises:
        ValueError: if a row has fewer than five fields, if its length field
            is not an integer, or if the length field disagrees with the
            actual sequence length.
    '''
    gene_set = set()
    sequence_data = []
    with open(filepath) as handle:
        next(handle, None)  # skip the header; an empty file yields no rows
        # Data rows start on line 2 of the file.
        for line_number, line in enumerate(handle, start=2):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing on fields[1]
            fields = line.split(',')
            if len(fields) < 5:
                raise ValueError(
                    'Expected at least 5 fields on line %d of %s but found %d'
                    % (line_number, filepath, len(fields)))
            gene_id = fields[1]
            length = int(fields[3])
            sequence = fields[4]
            if length != len(sequence):
                # Identify the row by ID rather than echoing the whole sequence.
                raise ValueError(
                    'Lengths do not match for transcript %s on line %d: '
                    'declared %d, actual %d'
                    % (fields[0], line_number, length, len(sequence)))
            gene_set.add(gene_id)
            sequence_data.append(fields)
    gene_list = sorted(gene_set)
    return gene_list, sequence_data

In [3]:
def inplace_shuffle(rows):
    '''
    Shuffle the given list in place and return nothing.
    A private generator with the fixed seed 42 is used, so the
    module-level random state is not touched and repeated calls
    on equal inputs produce equal orderings.
    '''
    seeded_rng = Random(42)  # constructor argument seeds the generator
    seeded_rng.shuffle(rows)

In [4]:
def train_test_split(rows, test_portion=None):
    '''
    Split rows into a train set and a test set.

    The first int(len(rows)*test_portion) items become the test set and
    the remaining items become the train set. No shuffling happens here;
    the input is only sliced, never modified.

    Parameters:
        rows: sequence to split (the notebook passes a list of gene IDs).
        test_portion: fraction of rows to put in the test set, between
            0 and 1 inclusive. Defaults to the notebook-level TEST_PORTION.

    Returns:
        (train_set, test_set)

    Raises:
        ValueError: if the fraction is outside the range 0 to 1.
    '''
    if test_portion is None:
        test_portion = TEST_PORTION
    # A negative or >1 fraction would silently produce misleading slices.
    if not 0 <= test_portion <= 1:
        raise ValueError('test_portion must be between 0 and 1')
    divider = int(len(rows) * test_portion)
    train_set = rows[divider:]
    test_set = rows[:divider]
    return (train_set, test_set)

In [5]:
def assert_exclusivity(list1,list2):
    '''
    Verify that two lists have no item in common.
    Raises Exception if either list repeats an item, or if
    any item appears in both lists. Prints a confirmation
    message when both checks pass.
    '''
    unique1, unique2 = set(list1), set(list2)
    # A set smaller than its source list means the list had repeats.
    has_repeats = len(unique1) != len(list1) or len(unique2) != len(list2)
    if has_repeats:
        raise Exception('Lists contained duplicates')
    if not unique1.isdisjoint(unique2):
        raise Exception('Lists are not exclusive')
    print('These sets are mutually exclusive.')

In [6]:
def save_seq(gene_list,sequence_data,filepath):
    '''
    Write a train set or test set given the gene list.

    Parameters:
        gene_list: iterable of gene IDs whose transcripts should be kept.
        sequence_data: list of rows, each a list of string fields with the
            gene ID at index 1 (the rows returned by load_sequence_data).
        filepath: csv file to create or overwrite.

    The output starts with the header line
    transcript_id,gene_id,biotype,length,sequence
    followed by every row, in input order, whose gene ID is in gene_list.
    '''
    # Build the lookup set before opening the file, so that an invalid
    # gene_list raises before an existing output file gets truncated.
    valid_ids = set(gene_list)
    with open(filepath,'w') as handle:
        handle.write('transcript_id,gene_id,biotype,length,sequence')
        handle.write('\n')
        for fields in sequence_data:
            if fields[1] in valid_ids:
                handle.write(','.join(fields))
                handle.write('\n')

## Coding sequence
Use the gene list in the CANON files. 
(Result would be the same using ALL or LONG.)

In [7]:
# Sanity check: no gene ID may appear in both the coding and the noncoding input.
# Only the gene lists are used here; the sequence rows are discarded.
coding_genes, ignore = load_sequence_data(GENCODE_DIR+CODING_CANON)
noncoding_genes, ignore = load_sequence_data(GENCODE_DIR+NONCODING_CANON)
assert_exclusivity(coding_genes, noncoding_genes)
# Drop the references; the cells that perform each split reload their own gene list.
coding_genes = None
noncoding_genes = None

These sets are mutually exclusive.


In [8]:
print('Coding sequence')
# Only the gene list is needed to decide the split; sequences are loaded again when writing.
genes, ignore = load_sequence_data(GENCODE_DIR+CODING_CANON)
# The gene list arrives sorted and the shuffle uses a fixed seed, so the split is repeatable.
inplace_shuffle(genes)
train_set, test_set = train_test_split(genes)
print('Total genes',len(genes))
print('Train genes',len(train_set))
print(' Test genes',len(test_set))
# Confirm that no gene landed in both the train set and the test set.
assert_exclusivity(train_set, test_set)

Coding sequence
Total genes 17668
Train genes 14135
 Test genes 3533
These sets are mutually exclusive.


In [9]:
# Apply the coding gene split to each coding transcript file in turn.
# Each entry: (progress message, input file, train output file, test output file).
coding_jobs = (
    ('Write coding sequence, canonical transcripts',
     CODING_CANON, CODING_CANON_TRAIN, CODING_CANON_TEST),
    ('Write coding sequence, longest transcripts',
     CODING_LONG, CODING_LONG_TRAIN, CODING_LONG_TEST),
    ('Write coding sequence, all transcripts',
     CODING_ALL, CODING_ALL_TRAIN, CODING_ALL_TEST),
)
for message, source_name, train_name, test_name in coding_jobs:
    print(message)
    # The gene list returned here is not needed; the split was decided earlier.
    ignore, sequences = load_sequence_data(GENCODE_DIR + source_name)
    save_seq(train_set, sequences, TRAIN_DIR + train_name)
    save_seq(test_set, sequences, TRAIN_DIR + test_name)

Write coding sequence, canonical transcripts
Write coding sequence, longest transcripts
Write coding sequence, all transcripts


## Noncoding sequence
Use the gene list in the CANON files. 
(Result would be the same using ALL or LONG.)

In [10]:
# Label fixed: this previously printed the misspelled 'Nonoding sequence'.
print('Noncoding sequence')
# Only the gene list is needed to decide the split; sequences are loaded again when writing.
genes, ignore = load_sequence_data(GENCODE_DIR+NONCODING_CANON)
# The gene list arrives sorted and the shuffle uses a fixed seed, so the split is repeatable.
inplace_shuffle(genes)
train_set, test_set = train_test_split(genes)
print('Total genes',len(genes))
print('Train genes',len(train_set))
print(' Test genes',len(test_set))
# Confirm that no gene landed in both the train set and the test set.
assert_exclusivity(train_set, test_set)

Nonoding sequence
Total genes 6423
Train genes 5139
 Test genes 1284
These sets are mutually exclusive.


In [11]:
# Apply the noncoding gene split to each lncRNA transcript file in turn.
# Each entry: (progress message, input file, train output file, test output file).
noncoding_jobs = (
    ('Write noncoding sequence, canonical transcripts',
     NONCODING_CANON, NONCODING_CANON_TRAIN, NONCODING_CANON_TEST),
    ('Write noncoding sequence, longest transcripts',
     NONCODING_LONG, NONCODING_LONG_TRAIN, NONCODING_LONG_TEST),
    ('Write noncoding sequence, all transcripts',
     NONCODING_ALL, NONCODING_ALL_TRAIN, NONCODING_ALL_TEST),
)
for message, source_name, train_name, test_name in noncoding_jobs:
    print(message)
    # The gene list returned here is not needed; the split was decided earlier.
    ignore, sequences = load_sequence_data(GENCODE_DIR + source_name)
    save_seq(train_set, sequences, TRAIN_DIR + train_name)
    save_seq(test_set, sequences, TRAIN_DIR + test_name)

Write noncoding sequence, canonical transcripts
Write noncoding sequence, longest transcripts
Write noncoding sequence, all transcripts


In [12]:
# Completion marker and end timestamp; compare with the start time printed by the first cell.
print('done')
print(datetime.now())

done
2023-03-02 18:21:53.876967
