In [16]:
# Imports

import numpy as np
import pandas
import os, os.path
from itertools import *
%matplotlib inline
import matplotlib.pyplot as plt

# Idempotent data retrieval script

chromosomes = [1, 2, 6, 7, 11]
def chromosome_files(n):
    base = 'intersected_final_chr'
    spec = '_cutoff_20_'
    suffixes = ['train.bed', 'sample_partial.bed', 'sample_full.bed']
    return [base + str(n) + spec + suffix for suffix in suffixes]
all_files = set(chain.from_iterable(chromosome_files(n) for n in chromosomes))

if 'methylation_imputation' not in [x for x in os.listdir('.') if os.path.isdir(x)]:
    raise Exception('Missing assignment repository in cwd')

if not os.path.exists('data'):
    os.mkdir('data')

def all_files_present(): return all_files.issubset(set(os.listdir('data')))
if not all_files_present():
    ! gunzip -k methylation_imputation/data/*.bed.gz
    ! mv methylation_imputation/data/*.bed data/
    if not all_files_present():
        raise Exception('Error unpacking data')

def read_tsv(name): return pandas.read_csv(name, sep='\t', header=None)
train_chr1 = read_tsv('data/' + chromosome_files(1)[0])
test_chr1_partial = read_tsv('data/' + chromosome_files(1)[1])
test_chr1_full = read_tsv('data/' + chromosome_files(1)[2])

SITE_START = 1
SITE_END = 2
STRAND_TYPE = 3
SAMPLE_OFFSET = 4

In [29]:
train_chr1.loc[range(267, 273)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
267,chr1,121511,121513,+,0.714286,0.73913,0.8,0.782609,0.794872,0.75,...,0.860465,0.789474,,0.727273,0.896552,0.810811,0.809524,0.842105,0.88,0
268,chr1,121516,121518,+,0.857143,0.8,0.833333,0.913043,0.909091,0.806452,...,0.931818,0.888889,,0.84,0.862069,0.756757,0.916667,0.833333,0.916667,0
269,chr1,122736,122738,-,0.545455,0.384615,0.472727,,0.436364,0.58,...,0.487179,,,,0.413043,0.462963,0.533333,0.659574,0.682927,0
270,chr1,122869,122871,-,0.837838,0.814815,0.725,0.807692,0.710526,0.565217,...,0.62963,,,,0.755556,0.590909,0.681818,0.657143,,0
271,chr1,122884,122886,-,0.842105,0.821429,0.72093,0.8,0.795455,0.478261,...,0.617021,,,,0.682927,0.5625,0.73913,0.666667,0.756098,0
272,chr1,127749,127751,+,0.633333,0.473684,0.703704,0.611111,0.827586,0.666667,...,0.782609,0.8,,0.8125,0.5,0.8125,0.840909,0.6875,0.807692,0


In [35]:
unknown_chr1_ix = np.where(test_chr1_partial[5] == 0)[0]
assert len(train_chr1) == len(test_chr1_partial) and len(train_chr1) == len(test_chr1_full)
def count_unique(col):
    return {x:sum(col == x) for x in set(col)}

print('Chromosome 1')
print('Total samples', len(train_chr1))
print('Site lengths', count_unique(train_chr1[SITE_END] - train_chr1[SITE_START]))
print('Strand types', count_unique(train_chr1[STRAND_TYPE]))
print('Unknown sample site counts', len(unknown_chr1_ix))

Chromosome 1
Total samples 379551
Site lengths {2: 379551}
Strand types {'-': 1464, '+': 378087}
Unknown sample site counts 371941


In [37]:
count_unique(test_chr1_partial[unknown_chr1_ix[STRAND_TYPE]])

{'+': 378087, '-': 1464}