In [3]:
import math
import cooler
import numpy as np
import pandas as pd
import h5py
import dask
import dask.array as da
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from dask.diagnostics import ProgressBar
from scipy import sparse

In [4]:
# Idea 1: read the pixel table straight out of the HDF5 container and expose
# it as a lazy dask dataframe.
cool = 's38_221203_Granta519EBF1KI_cl27_0hr_MicroC_25U_Nova.mcool'
res = 5000
# 'r+' gives the same read/write access as 'a' but requires the file to exist,
# so a mistyped path raises immediately instead of creating an empty file.
f = h5py.File(cool, 'r+')

# One dask array per pixel column, each cut into 3,000,000-element chunks.
bin1, bin2, count = (
    da.from_array(f['/resolutions/{}/pixels/{}'.format(res, field)], chunks=(3000000,))
    for field in ('bin1_id', 'bin2_id', 'count')
)

# Place the three columns side by side, then give them readable names.
df = dd.concat([dd.from_dask_array(d) for d in (bin1, bin2, count)], axis=1)
cols = ['bin1', 'bin2', 'counts']
df_bins = df.rename(columns=dict(zip(df.columns, cols)))
df_bins

Unnamed: 0_level_0,bin1,bin2,counts
npartitions=70,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,int64,int64,int32
3000000,...,...,...
...,...,...,...
207000000,...,...,...
207915424,...,...,...


In [5]:
def correct_upper_triangle(x: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of a pixel partition with off-diagonal counts multiplied by 4.

    Parameters
    ----------
    x : pandas.DataFrame
        Pixel table partition with at least the columns ``bin1``, ``bin2``
        and ``counts``.

    Returns
    -------
    pandas.DataFrame
        A new frame, same shape and index as ``x``, in which every row with
        ``bin1 != bin2`` has its ``counts`` value multiplied by 4. Rows on the
        diagonal (``bin1 == bin2``) are unchanged.

    Notes
    -----
    The input is never modified: functions handed to ``map_partitions`` must
    not mutate the partition they receive, otherwise re-evaluating the graph
    (or re-running the cell) can apply the correction more than once.
    NOTE(review): the rationale for the factor of 4 is not stated anywhere in
    this notebook -- confirm it against the intended normalisation.
    """
    off_diagonal = x['bin1'].values != x['bin2'].values
    corrected = x.copy()
    corrected.loc[off_diagonal, 'counts'] *= 4
    return corrected
# Run correct_upper_triangle on every partition of the lazy pixel table and
# materialise the result with compute(); the ProgressBar context reports
# progress while the computation runs.
with ProgressBar():
    corrected_pixels = df_bins.map_partitions(correct_upper_triangle).compute()

[########################################] | 100% Completed | 29.04 s


In [6]:
#corrected_pixels = df_bins.compute()

In [7]:
# Store full matrix size: the number of entries in the bins/chrom dataset.
# The length is read from the HDF5 dataset's shape directly; wrapping the
# dataset in a dask array only to ask for its size is unnecessary.
mat_size = f['/resolutions/{}/bins/chrom'.format(res)].shape[0]

In [8]:
# Open the chosen resolution through cooler, addressing the group inside the
# file with a "<file>::<group path>" URI.
clr = cooler.Cooler(f"{cool}::resolutions/{res}")
#clr.bins()[5000:5010]

In [9]:
# Fetch the entire contact matrix (all rows, all columns) with raw, unbalanced
# counts as a sparse matrix, then convert it to CSR format.
mat = clr.matrix(balance=False, sparse=True)[:,:].tocsr()

In [10]:
# Sum each matrix row and flatten the result to a 1-D array (one value per
# bin), then total those row sums.
rowsums = np.asarray(mat.sum(axis=1)).ravel()
raw_sum = np.sum(rowsums)

In [11]:
# For every pixel, look up the row sum of its first bin (i) and of its second
# bin (j); both arrays are aligned with the rows of corrected_pixels.
i = rowsums[corrected_pixels['bin1'].values]
j = rowsums[corrected_pixels['bin2'].values]
print(i, j)

[  27   27   27 ... 7068 7068 2891] [1581 1104 1221 ... 7068 2891 2891]


In [12]:
# Divide each corrected count by the product of its two bins' row sums and
# add up the quotients over all pixels.
scaled_sum = corrected_pixels['counts'].div(i * j).sum()
scaled_sum

588.8480385814228

In [13]:
# Total of all entries in the raw matrix. This reassigns raw_sum, which an
# earlier cell had already set from rowsums.sum().
raw_sum = mat.sum()
raw_sum

1143064853

In [14]:
# Global scale factor: square root of the ratio between the normalised pixel
# sum (scaled_sum) and the raw matrix total (raw_sum).
scale = np.sqrt(scaled_sum/raw_sum)
scale

0.0007177383976095415

In [16]:
# Multiply every per-bin row sum by the global scale factor; the resulting
# array is what gets written to the file as the 'test_weights' column.
scaled_rowsums = rowsums * scale
scaled_rowsums

array([0.        , 0.        , 0.01937894, ..., 0.        , 0.        ,
       0.        ])

In [18]:
# Store the scaled row sums in the bin table as the 'test_weights' column.
# create_dataset raises if the name already exists, so any copy left behind by
# a previous run is removed first; this keeps the cell re-runnable after a
# kernel restart without manually deleting the dataset.
weights_path = 'resolutions/{}/bins/test_weights'.format(res)
if weights_path in f:
    del f[weights_path]
f.create_dataset(weights_path, data=scaled_rowsums)
# Close the handle once the write is done.
f.close()

In [19]:
# Display bins 1000-1049 of the bin table, including its weight columns.
clr.bins()[1000:1050]

Unnamed: 0,chrom,start,end,KR,VC,VC_SQRT,test_weights
1000,1,5000000,5005000,0.786037,0.816663,0.785934,0.82181
1001,1,5005000,5010000,0.790358,0.810701,0.78306,0.816786
1002,1,5010000,5015000,0.772805,0.797076,0.776452,0.793101
1003,1,5015000,5020000,0.886132,0.952915,0.848969,0.93306
1004,1,5020000,5025000,0.770715,0.820069,0.787571,0.807456
1005,1,5025000,5030000,0.835658,0.880531,0.816088,0.867746
1006,1,5030000,5035000,0.796115,0.843913,0.798939,0.832577
1007,1,5035000,5040000,0.814779,0.847319,0.80055,0.839754
1008,1,5040000,5045000,0.797997,0.844765,0.799342,0.854109
1009,1,5045000,5050000,0.805485,0.839655,0.796921,0.82181


In [20]:
# Pixels for matrix rows 5000-5049, balanced with the custom 'test_weights'
# column. divisive_weights=True makes cooler treat the weights as divisors,
# join=True attaches genomic coordinates, and as_pixels=True returns a pixel
# table instead of a 2-D array.
clr.matrix(balance='test_weights', divisive_weights=True, join=True, as_pixels=True)[5000:5050]

Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,count,balanced
0,1,25000000,25005000,1,25000000,25005000,266,511.230966
1,1,25000000,25005000,1,25005000,25010000,107,194.608066
2,1,25000000,25005000,1,25010000,25015000,20,40.792619
3,1,25000000,25005000,1,25015000,25020000,8,13.796647
4,1,25000000,25005000,1,25020000,25025000,3,5.272604
...,...,...,...,...,...,...,...,...
14374,1,25245000,25250000,21,35180000,35185000,1,0.664150
14375,1,25245000,25250000,21,37045000,37050000,1,0.552520
14376,1,25245000,25250000,21,42685000,42690000,1,0.620956
14377,1,25245000,25250000,21,42870000,42875000,1,0.555180


In [21]:
# Same query as for 'test_weights', but balanced with the file's existing 'VC'
# weight column, so the two 'balanced' outputs can be compared side by side.
clr.matrix(balance='VC', divisive_weights=True, join=True, as_pixels=True)[5000:5050]

Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2,count,balanced
0,1,25000000,25005000,1,25000000,25005000,266,453.851945
1,1,25000000,25005000,1,25005000,25010000,107,174.416035
2,1,25000000,25005000,1,25010000,25015000,20,37.320756
3,1,25000000,25005000,1,25015000,25020000,8,12.598629
4,1,25000000,25005000,1,25020000,25025000,3,4.953336
...,...,...,...,...,...,...,...,...
14374,1,25245000,25250000,21,35180000,35185000,1,0.634450
14375,1,25245000,25250000,21,37045000,37050000,1,0.508811
14376,1,25245000,25250000,21,42685000,42690000,1,0.585045
14377,1,25245000,25250000,21,42870000,42875000,1,0.506640
