# Train/Valid split
Create files with 80/20 splits.
The files contain just the gene IDs.
This is for mRNA only.

In [1]:
from datetime import datetime
print(datetime.now())
import random
import copy

2023-04-08 13:08:12.349950


In [2]:
print('This is repeatable.')
original = [x for x in range(10)]
print('Original',original)
copy1 = copy.deepcopy(original)
copy2 = copy.deepcopy(original)

random.seed(10)
random.shuffle(copy1)  # in-place
print('Shuffled',copy1)

random.seed(10)
random.shuffle(copy2)  # in-place
print('Shuffled',copy2)

This is repeatable.
Original [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Shuffled [5, 2, 7, 1, 8, 4, 3, 6, 0, 9]
Shuffled [5, 2, 7, 1, 8, 4, 3, 6, 0, 9]


In [3]:
DATA_DIR='/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'
SEQ_FILE='train.all_pc_transcripts.csv'
OUTPUT_DIR='/Users/jasonmiller/WVU/Localization/TrainTest/TrainTest_ver43/'

In [4]:
def load_genes():
    gene_set = set()
    with open (DATA_DIR+SEQ_FILE,'r') as fin:
        header = None
        for line in fin:
            if header is None:
                header = line
            else:
                fields=line.strip().split(',')
                gene_id=fields[1]
                gene_set.add(gene_id)
    return list(gene_set)

In [5]:
print('This is repeatable.')
original = load_genes()
print('Original',original[:5])
copy1 = copy.deepcopy(original)
again = copy.deepcopy(original)
copy2 = copy.deepcopy(original)

random.seed(10)
random.shuffle(copy1)  # in-place
print('Shuffled',copy1[:5])

random.seed(10)
random.shuffle(again)  # in-place
print('Shuffled',again[:5])

# Shuffled ['ENSG00000275191', 'ENSG00000228495', 'ENSG00000272155', 'ENSG00000235890', 'ENSG00000274020']

This is repeatable.
Original ['ENSG00000118482', 'ENSG00000178467', 'ENSG00000177613', 'ENSG00000101463', 'ENSG00000198948']
Shuffled ['ENSG00000140451', 'ENSG00000166170', 'ENSG00000102974', 'ENSG00000268089', 'ENSG00000186806']
Shuffled ['ENSG00000140451', 'ENSG00000166170', 'ENSG00000102974', 'ENSG00000268089', 'ENSG00000186806']


In [6]:
print('This is different from above but also repeatable.')
random.shuffle(copy2)  # in-place
print('Shuffled',copy2[:5])

# Shuffled ['ENSG00000254290', 'ENSG00000231133', 'ENSG00000250514', 'ENSG00000273998', 'ENSG00000268001']

This is different from above but also repeatable.
Shuffled ['ENSG00000183831', 'ENSG00000145362', 'ENSG00000006459', 'ENSG00000128513', 'ENSG00000039560']


In [7]:
def make_valids(gene_list):
    valids=[ [],[],[],[],[] ]
    robin = 0
    for i in range(len(gene_list)):
        gene = gene_list[i]
        valids[robin].append(gene)
        robin = (robin+1)%5
    return valids

In [8]:
def write_valids(valids,repetition):
    r = repetition
    for fold in range(5):
        valid_list = valids[fold]
        f = fold + 1
        VALID_FILE = 'pc.{}.{}.validation_genes.txt'.format(r,f)
        filename = OUTPUT_DIR+VALID_FILE
        with open (filename,'w') as fout:
            for gene in valid_list:
                print(gene,file=fout)

In [9]:
valids = make_valids(copy1)
write_valids(valids,1)
valids = make_valids(copy2)
write_valids(valids,2)

In [10]:
print(datetime.now())
print('done')

2023-04-08 13:08:13.465321
done
