In [1]:
import os
import errno

from labels import populations
from common import open_infile, get_conserved, get_chromosomes_from_args
from mutation_counter import MutationCounter, IncludedRegion

import gzip
from IPython.display import clear_output
import os
import tempfile


In [2]:

# Chromosome to process; stored as a string because it is concatenated into file paths below.
chrom = str(22)

class IncludedRegion:
    """Counts mutations that land inside a list of ``(start, end)`` intervals.

    The interval list is walked with a single forward-moving cursor
    (``conserved_ind``).  This presumably relies on the intervals being sorted
    and on positions arriving in ascending order -- TODO confirm with the caller.

    NOTE(review): this definition shadows the ``IncludedRegion`` imported from
    ``mutation_counter`` at the top of the notebook -- confirm which one is
    meant to be used.
    """

    def __init__(self, chrom, output, outfile_path, conserved):
        """Store the region description.

        Args:
            chrom: chromosome identifier.
            output: per-population output text (the notebook passes a dict of
                population -> header string).
            outfile_path: output path template handed to ``write_output``.
            conserved: sequence of intervals; element ``[0]`` is read as the
                start and ``[1]`` as the end, both treated as inclusive.
        """
        self.chrom = chrom
        self.output = output
        self.outfile_path = outfile_path
        self.conserved = conserved

        # Cursor into ``self.conserved``; only ever moves forward.
        self.conserved_ind = 0

    def configure(self, column_labels):
        """Build the per-run lookup tables from the header column labels.

        Sets ``indices``, ``column_index_to_population`` and ``mut_count`` using
        helpers defined elsewhere in the notebook.
        """
        self.indices = get_column_indices(column_labels)
        self.column_index_to_population = get_column_index_to_population(column_labels)
        self.mut_count = initialize_mut_count(self.indices)

    def update_position(self, pos):
        """Advance the cursor past every interval that ends before ``pos``.

        The cursor stops at the last interval at the latest, so it always stays
        a valid index for a non-empty interval list.
        """
        last_index = len(self.conserved) - 1
        while (
            self.conserved_ind < last_index and
            pos > self.conserved[self.conserved_ind][1]
        ):
            self.conserved_ind += 1

    def update_counts(self, position, mutation, population, count):
        """Increment the tally for ``(mutation, population, count)`` if
        ``position`` lies inside the interval the cursor currently points at.
        """
        # An empty interval list includes nothing; without this guard the
        # lookup below would raise IndexError.
        if len(self.conserved) == 0:
            return

        current = self.conserved[self.conserved_ind]
        if current[0] <= position <= current[1]:
            self.mut_count[(mutation, population, count)] += 1

    def write_output(self):
        """Delegate to the module-level ``write_output`` with this region's state."""
        write_output(self.output, self.outfile_path, self.indices, self.mut_count)




In [15]:
# Region set 1: the whole chromosome, expressed as a single catch-all interval.
# NOTE(review): this header is 'Mut\n' while the other two use 'Ref Alt \n' -- confirm that is intended.
output = dict.fromkeys(populations, 'Mut\n')
outfile_path = '../finescale_mut_spectra/mut_type_v_allele_freq_%s_chr' + chrom + '_nosingle.txt'
conserved = [[0, 1e12]]
included_regions = [IncludedRegion(chrom, output, outfile_path, conserved)]

# Region set 2: intervals loaded from the nested-repeats file.
output = dict.fromkeys(populations, 'Ref Alt \n')
outfile_path = '../finescale_mut_spectra/inrepeats_mut_type_v_allele_freq_%s_chr' + chrom + '_nosingle.txt'
conserved = get_conserved('../data/bed_files/nestedRepeats.txt.gz', chrom)
included_regions.append(IncludedRegion(chrom, output, outfile_path, conserved))

# Region set 3: intervals loaded from the phastCons elements file.
# `conserved` keeps this value after the cell finishes.
output = dict.fromkeys(populations, 'Ref Alt \n')
outfile_path = '../finescale_mut_spectra/phyloP_conserved_mut_type_v_allele_freq_%s_chr' + chrom + '_nosingle.txt'
conserved = get_conserved('../data/bed_files/phastConsElements100way.txt.gz', chrom)
included_regions.append(IncludedRegion(chrom, output, outfile_path, conserved))

# One counter drives all three region sets.
mutation_counter = MutationCounter(chrom, included_regions)


In [22]:
# Peek at the first ten intervals in `conserved`
# (presumably the phastCons elements, the last value assigned in the setup cell -- depends on execution order).
conserved[:10]

[(16055136, 16055172),
 (16055444, 16055466),
 (16055553, 16055602),
 (16063545, 16063572),
 (16074725, 16074836),
 (16074882, 16074956),
 (16074980, 16075098),
 (16075110, 16075132),
 (16075175, 16075214),
 (16075268, 16075296)]

### Counting mutations

Defining helper functions

In [13]:
import itertools as it

def get_human_chimp_differences(chromosome_number):
    """Load the chimp allele at every SNP position listed for one chromosome.

    Reads ``../data/hg19_chimp_align/human_chimp_diffs_chr<N>.txt``.  The first
    line is always skipped; every later line containing the text ``SNP`` is
    split on whitespace into exactly four fields, of which the first is the
    position and the last is the chimp allele.

    Args:
        chromosome_number: chromosome identifier; converted with ``str`` so
            both ``22`` and ``'22'`` are accepted.

    Returns:
        dict mapping ``int`` position -> chimp allele string.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a ``SNP`` line does not have exactly four fields or its
            first field is not an integer.
    """
    file_path = (
        '../data/hg19_chimp_align/human_chimp_diffs_chr' +
        str(chromosome_number) +
        '.txt'
    )

    human_chimp_differences = {}

    # The context manager guarantees the handle is closed, even on a parse error.
    with open(file_path, 'r') as infile:
        infile.readline()  # discard the first line unconditionally

        for line in infile:
            if 'SNP' in line:
                position, _, _, chimp_allele = line.split()
                human_chimp_differences[int(position)] = chimp_allele

    return human_chimp_differences


def get_column_indices(column_labels):
    """Group sample column indices by population.

    Entries of ``column_labels`` from index 9 onward are treated as sample ids.
    Each id is looked up in ``sample_id_to_population`` (whose values are
    decoded from bytes) and its column index is appended to that population's
    list.

    Returns:
        dict mapping every name in ``populations`` to a list of column indices
        (empty when no sample belongs to that population).
    """
    columns_by_population = {name: [] for name in populations}

    for column, sample_id in enumerate(column_labels[9:], start=9):
        population = sample_id_to_population[sample_id].decode()
        columns_by_population[population].append(column)

    return columns_by_population


In [14]:

def get_finescale(mutation_counter):
    """Run one complete counting pass for ``mutation_counter``.

    Opens the input for ``mutation_counter.chrom`` via ``open_infile``,
    configures the counter from the line that ``open_infile`` returns, feeds
    every remaining line to ``process_line`` and finally calls
    ``write_output``.  A progress message is printed before each stage.
    """
    stream, header_line = open_infile(mutation_counter.chrom)

    print('configuring..')
    mutation_counter.configure(header_line)

    print('processing lines.')
    for record in stream:
        mutation_counter.process_line(record)

    print('writing.')
    mutation_counter.write_output()

    
# Run the full counting pass for the counter configured above.
get_finescale(mutation_counter)

# The format string needs a placeholder for the chromosome to appear in the message.
print('finished chrom {}'.format(chrom))


configuring..
	2513 columns
	column index to pop
len of reference seq: 51304566
	differences to human
	configure regions
processing lines.
writing.
../finescale_mut_spectra/phyloP_conserved_mut_type_v_allele_freq_%s_chr22_nosingle.txt
finished chrom 


In [9]:
# Re-open the input for manual stepping; `line` is the line that open_infile hands back
# alongside the stream (it is split into column labels in the next cell).
infile, line = open_infile(mutation_counter.chrom)


In [10]:
# Split the header line on whitespace and build the column-index -> population mapping from it.
column_labels = line.split()
t= get_column_index_to_population(column_labels)

In [11]:
# Read the next line from the stream.
# NOTE(review): this rebinds `t`, which previously held the column-index -> population mapping.
t= infile.readline()

In [12]:
# Push the single line read above through the counter to inspect how it is handled.
mutation_counter.process_line(t)

[b'22', b'16050075', b'rs587697622', b'A', b'G', b'100', b'PASS', b'AC=1;AF=0.000199681;AN=5008;NS=2504;DP=8012;EAS_AF=0;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0.001;AA=.|||;VT=SNP', b'GT', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0', b'0|0',

In [13]:
# Split the raw bytes line into tab-separated fields; field 1 is parsed as the integer position.
s=t.strip(b'\n').split(b'\t')
pos=int(s[1])

# Three-element slice of the reference covering indices pos-2 .. pos.
# NOTE(review): if `pos` is 1-based, the variant base is the middle element -- confirm.
context = mutation_counter.refseq[pos-2:pos+1]


In [18]:
# Third character of field 40 after decoding; for a genotype such as '0|0' this is the allele separator.
s[40].decode()[2]

'|'

In [23]:
# Is this position recorded among the human/chimp differences?
pos in mutation_counter.human_chimp_differences

True

In [24]:
# Does the recorded chimp allele equal field 4 of the line?
# NOTE(review): s[4] is bytes; if the stored alleles are str, this comparison
# can never be True in Python 3 -- confirm the stored value type.
mutation_counter.human_chimp_differences.get(pos) == s[4]

False

In [25]:
# Show the reference slice taken around `pos`.
context

b'CGG'

In [26]:
# Check that fields 3 and 4 are drawn from A/C/G/T.
# NOTE(review): `bytes in bytes` is a substring test, so multi-base values such as b'CG'
# (and the empty value b'') also pass -- confirm that only single bases are expected here.
s[3] in b'ACGT' and s[4] in b'ACGT'

True

In [27]:
# Display the full column-index -> population mapping held by the counter (large output).
mutation_counter.column_index_to_population

{9: b'GBR',
 10: b'GBR',
 11: b'GBR',
 12: b'GBR',
 13: b'GBR',
 14: b'GBR',
 15: b'GBR',
 16: b'GBR',
 17: b'GBR',
 18: b'GBR',
 19: b'GBR',
 20: b'GBR',
 21: b'GBR',
 22: b'GBR',
 23: b'GBR',
 24: b'GBR',
 25: b'GBR',
 26: b'GBR',
 27: b'GBR',
 28: b'GBR',
 29: b'GBR',
 30: b'GBR',
 31: b'GBR',
 32: b'GBR',
 33: b'GBR',
 34: b'GBR',
 35: b'GBR',
 36: b'GBR',
 37: b'GBR',
 38: b'GBR',
 39: b'GBR',
 40: b'GBR',
 41: b'GBR',
 42: b'GBR',
 43: b'GBR',
 44: b'GBR',
 45: b'GBR',
 46: b'GBR',
 47: b'GBR',
 48: b'GBR',
 49: b'GBR',
 50: b'GBR',
 51: b'GBR',
 52: b'GBR',
 53: b'GBR',
 54: b'GBR',
 55: b'GBR',
 56: b'GBR',
 57: b'GBR',
 58: b'GBR',
 59: b'GBR',
 60: b'GBR',
 61: b'GBR',
 62: b'GBR',
 63: b'GBR',
 64: b'FIN',
 65: b'FIN',
 66: b'FIN',
 67: b'FIN',
 68: b'FIN',
 69: b'FIN',
 70: b'FIN',
 71: b'FIN',
 72: b'FIN',
 73: b'FIN',
 74: b'FIN',
 75: b'FIN',
 76: b'FIN',
 77: b'FIN',
 78: b'FIN',
 79: b'FIN',
 80: b'FIN',
 81: b'GBR',
 82: b'GBR',
 83: b'GBR',
 84: b'GBR',
 85: b'GBR',
