In [9]:
import sys
# Make the local dsprint checkout importable so that `from dsprint.core import ...`
# in the following cell resolves.
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
sys.path.append('/home/vineetb/git_checkouts/dsprint')

In [10]:
import bisect
import os
import os.path
import gzip
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np

from dsprint.core import CHROMOSOMES


# Directory with the per-chromosome phastCons100way wigFix files, and the name
# of one sample file (chromosome 4) inside it.
# NOTE(review): neither constant is referenced by the cells visible here; the
# same directory string is repeated literally where ConservationScore is built.
DIRNAME = '/media/vineetb/t5-vineetb/dsprint/in/hgdownload/hg19/phastCons100way/hg19.100way.phastCons'
FILENAME = 'chr4.phastCons100way.wigFix.gz'


class WigFix:
    """Position -> score lookup over a gzip-compressed fixedStep wiggle file.

    The whole file is parsed once at construction time. Every ``fixedStep``
    header line opens a new block; the numeric lines that follow it are the
    block's scores, stored as one NumPy array per block.

    Attributes:
        filename: Path of the file that was parsed.
        positions: Sorted list of block start coordinates.
        values: Dict mapping a block start to the array of its scores.
        steps: Dict mapping a block start to the block's ``step`` (default 1).
        spans: Dict mapping a block start to the block's ``span`` (default 1).
    """

    def __init__(self, filename):
        """Parse ``filename`` into per-block score arrays.

        Args:
            filename: Path to a gzip-compressed fixedStep wiggle file.

        Raises:
            ValueError: If a header has no ``start`` field, declares a
                non-positive ``step``/``span``, if a score line appears before
                the first header, or if a score line is not a number.
            OSError: If the file cannot be opened or is not valid gzip data.
        """
        self.filename = filename
        values = {}
        steps = {}
        spans = {}
        block_start = None
        block_scores = []

        with gzip.open(filename, 'rt') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                if line.startswith('fixedStep'):
                    # Close the previous block right away so that only one
                    # block at a time is held as a list of Python floats.
                    if block_start is not None:
                        values[block_start] = np.array(block_scores)
                    # Parse key=value fields by name rather than by position,
                    # so extra whitespace or a different field order is fine.
                    fields = dict(
                        token.split('=', 1)
                        for token in line.split()[1:]
                        if '=' in token
                    )
                    if 'start' not in fields:
                        raise ValueError(
                            f'fixedStep header without start= in {filename}: {line!r}')
                    step = int(fields.get('step', 1))
                    span = int(fields.get('span', 1))
                    if step < 1 or span < 1:
                        raise ValueError(
                            f'Invalid step/span in {filename}: {line!r}')
                    block_start = int(fields['start'])
                    steps[block_start] = step
                    spans[block_start] = span
                    block_scores = []
                elif block_start is None:
                    raise ValueError(
                        f'Score line before any fixedStep header in {filename}: {line!r}')
                else:
                    block_scores.append(float(line))

        if block_start is not None:
            values[block_start] = np.array(block_scores)

        # bisect in __getitem__ needs the starts in ascending order.
        self.positions = sorted(values)
        self.values = values
        self.steps = steps
        self.spans = spans

    def __getitem__(self, item):
        """Return the score covering position ``item``, or None if there is none.

        Args:
            item: Coordinate in the same numbering used by the ``start=``
                fields of the file.

        Returns:
            The stored score, or None when ``item`` lies before the first
            block, past the end of the block that precedes it, or in a gap
            between two data points of a block whose step exceeds its span.
        """
        position = bisect.bisect_right(self.positions, item)
        # The return position from bisect_right is the insert position
        # This is 0 for elements < the first that we have, 1 between [<first>, <second>)
        # Subtract 1 to get the index where we can start our forward search
        if position < 1:
            return None
        start_position = self.positions[position - 1]
        # Data point i of a block covers [start + i*step, start + i*step + span).
        index, remainder = divmod(item - start_position, self.steps[start_position])
        if remainder >= self.spans[start_position]:
            return None
        try:
            return self.values[start_position][index]
        except IndexError:
            return None


class ConservationScore:
    """Per-chromosome collection of WigFix score tracks.

    Nothing is read at construction time; call ``load`` to parse the files.
    Indexing with a chromosome name returns that chromosome's ``WigFix``.
    """

    def __init__(self, folder, file_pattern):
        """Remember where the per-chromosome files live.

        Args:
            folder: Directory containing the score files.
            file_pattern: File-name template with one ``{}`` placeholder that
                is filled with the chromosome name via ``str.format``.
        """
        self.folder = folder
        self.file_pattern = file_pattern
        # chromosome name -> WigFix, filled in by load()
        self.wigfixes = {}

    def _load1(self, chromosome):
        """Parse the file of one chromosome and store it in ``self.wigfixes``."""
        file_path = os.path.join(self.folder, self.file_pattern.format(chromosome))
        self.wigfixes[chromosome] = WigFix(file_path)

    def load(self, chromosomes=None, max_workers=7):
        """Load the score files of the requested chromosomes using a thread pool.

        Args:
            chromosomes: Chromosome names to load. ``None`` (or any empty
                value) means every entry of ``CHROMOSOMES``.
            max_workers: Upper bound on the number of worker threads; the pool
                is never larger than the number of chromosomes.

        Raises:
            Exception: Whatever ``_load1`` raised for the first failed
                chromosome that is observed (e.g. a missing file). Chromosomes
                that loaded successfully remain available in ``self.wigfixes``.
        """
        chromosomes = chromosomes or CHROMOSOMES

        with ThreadPoolExecutor(max_workers=min(max_workers, len(chromosomes))) as executor:
            futures = {executor.submit(self._load1, chromosome): chromosome
                       for chromosome in chromosomes}
            # Iterate while the pool is still running so progress is reported
            # as each chromosome finishes, not only after all of them are done.
            for future in as_completed(futures):
                # result() re-raises any exception from the worker, so a failed
                # load is no longer silently reported as finished.
                future.result()
                print(f'Finished processing chromosome {futures[future]}')

    def __getitem__(self, item):
        """Return the loaded ``WigFix`` for chromosome ``item`` (KeyError if not loaded)."""
        return self.wigfixes[item]

In [11]:
# Describe where the phastCons100way files live. The `{}` in the pattern is
# filled with the chromosome name when a file is loaded; no file is read here.
# NOTE(review): machine-specific absolute path.
p = ConservationScore(
    '/media/vineetb/t5-vineetb/dsprint/in/hgdownload/hg19/phastCons100way/hg19.100way.phastCons',
    'chr{}.phastCons100way.wigFix.gz'
)

In [12]:
%%time

# Parse the files for chromosomes 4 and 8 (one worker thread per chromosome).
p.load(['4','8'])

Finished processing chromosome 8
Finished processing chromosome 4
CPU times: user 3min 7s, sys: 4.38 s, total: 3min 12s
Wall time: 3min 5s


In [13]:
# Total number of scores held across all loaded chromosomes, expressed in
# units of 2**20 (i.e. "Mi" values, not bytes).
sum(len(x) for pN in p.wigfixes for x in p[pN].values.values())/1024./1024.

311.33763313293457

In [15]:
# Scratch check: size in bytes of a single Python float object.
x = 2.0
sys.getsizeof(x)

24

In [1]:
import numpy as np

In [2]:
x = np.array([1,2,3,5])

In [3]:
x[0]

1

In [4]:
x[1]

2

In [6]:
x[3]

5

In [7]:
type(x)

numpy.ndarray

In [14]:
# Spot-check lookups on chromosome 4. A position that is before the first
# block, or past the end of the block preceding it, yields None; otherwise the
# stored score is returned.
o = p['4']
print(o[10528])
print(o[11528])
print(o[191041279])
print(o[191041280])
print(o[191041281])
print(o[192041281])

None
0.062
0.01
0.019
0.028
None
