This notebook weaves together the haplotype scaffold at biallelic sites phased via shapeit2 with the multiallelic and other extra (e.g., non-PASS) variants phased via mvncall.

In [1]:
# Execute the shared setup notebook in this kernel's namespace.
# NOTE(review): the cells below use `np`, `allel` and `phase2_ar1` without
# defining them, so they are presumably provided by setup.ipynb -- confirm.
%run setup.ipynb

In [2]:
# Haplotype scaffold (the shapeit2-phased biallelic sites), '2L' entry only.
callset_phased = phase2_ar1.callset_phased
scaffold_2L = callset_phased['2L']

# Genotypes are wrapped as a dask-backed array; positions as a sorted index
# so that range lookups can be done on them later.
gt_phased = allel.GenotypeDaskArray(scaffold_2L['calldata']['genotype'])
pos_phased = allel.SortedIndex(scaffold_2L['variants']['POS'])

# Show both shapes side by side; the leading (variants) dimension should agree.
pos_phased.shape, gt_phased.shape

((8906423,), (8906423, 1164, 2))

In [3]:
# Region of interest, expressed as genomic coordinates on the scaffold.
region_start, region_stop = 0, 6000000

# Translate the coordinate range into an index range over `pos_phased`.
loc_region = pos_phased.locate_range(region_start, region_stop)
loc_region

slice(0, 390585, None)

In [4]:
# Restrict the scaffold to the region of interest.
pos_phased_region = pos_phased[loc_region]
gt_region_lazy = gt_phased[loc_region]

# Remove the colony parents by dropping the last 22 samples, then
# materialise the dask-backed selection in memory.
gt_phased_region = gt_region_lazy[:, :-22].compute()

pos_phased_region.shape, gt_phased_region.shape

((390585,), (390585, 1142, 2))

In [5]:
# Open the npz archive holding the mvncall-phased extra variants.
extras_path = '../data/phasing_extra_phase2.mvncall.200.npz'
callset_extras = np.load(extras_path)

In [6]:
# Pull positions and genotype calls for the extra variants out of the archive.
pos_extras, gt_extras = (
    callset_extras['variants/POS'],
    callset_extras['calldata/GT'],
)

# Show both shapes; the leading (variants) dimension should agree.
pos_extras.shape, gt_extras.shape

((3,), (3, 1142, 2))

In [7]:
# Merge the haplotype scaffold with the mvncall extras into one callset.
# Guard first: genotype rows and positions must pair up one-to-one within
# each input, otherwise the shared sort index below would silently
# misalign genotypes and positions.
assert gt_phased_region.shape[0] == pos_phased_region.shape[0]
assert gt_extras.shape[0] == pos_extras.shape[0]

# concatenate along the variants axis (scaffold first, then extras)
gt_combined = np.concatenate([gt_phased_region, gt_extras], axis=0)
pos_combined = np.concatenate([pos_phased_region, pos_extras], axis=0)

# sort by position; a stable sort keeps records that share a position in
# their concatenation order (scaffold before extras) rather than the
# arbitrary tie order the default sort kind may produce.
# 'mergesort' is the stable kind accepted across numpy versions.
idx_sorted = np.argsort(pos_combined, kind='mergesort')
gt_combined = gt_combined[idx_sorted]
pos_combined = pos_combined[idx_sorted]

In [8]:
# Convert the combined genotype calls to haplotypes, then persist the
# haplotypes together with their positions as a compressed npz archive.
genotypes_combined = allel.GenotypeArray(gt_combined)
haps_combined = genotypes_combined.to_haplotypes()

np.savez_compressed(
    '../data/haps_phase2.npz',
    haplotypes=haps_combined,
    POS=pos_combined,
)

In [9]:
# in-memory size of the combined haplotype array, in bytes
# (compare with the on-disk size of the compressed archive listed below)
haps_combined.nbytes

892102992

In [10]:
# confirm the archive was written and show its size on disk
!ls -lh ../data/haps_phase2.npz

-rw-rw-r-- 1 chris chris 16M Aug 19 16:04 ../data/haps_phase2.npz
