This notebook weaves together the haplotype scaffold at biallelic sites phased via shapeit2 with the multiallelic and other extra (e.g., non-PASS) variants phased via mvncall.

In [8]:
# Run the shared setup notebook. Names used below but never defined in this
# notebook (np, allel, phase1_ar31, phase2_ar1) presumably come from it - TODO confirm.
%run setup.ipynb

In [23]:
# Phased haplotype scaffold for chromosome arm 2L.
# NOTE(review): this reads the phase2_ar1 release, whereas the extra variants
# loaded further down come from a file named "phasing_extra_phase1" - confirm
# that both describe the same set of samples.
callset_phased = phase2_ar1.callset_phased
scaffold_2L = callset_phased['2L']
# variant positions, wrapped as a sorted index so ranges can be located
pos_phased = allel.SortedIndex(scaffold_2L['variants']['POS'])
# genotype calls, wrapped as a dask-backed genotype array
gt_phased = allel.GenotypeDaskArray(scaffold_2L['calldata']['genotype'])
pos_phased.shape, gt_phased.shape

((8906423,), (8906423, 1164, 2))

In [24]:
# Region of the scaffold to analyse, expressed as a pair of position bounds;
# locate_range turns the bounds into an index slice over pos_phased.
region_start, region_stop = 0, 6000000
loc_region = pos_phased.locate_range(region_start, region_stop)
loc_region

slice(0, 390585, None)

In [28]:
# Subset the scaffold to the region and drop the colony parents, which this
# cell takes to be the last 22 samples along the sample axis.
n_colony_parents = 22
pos_phased_region = pos_phased[loc_region]
gt_region_lazy = gt_phased[loc_region]
# materialise the dask-backed selection as an in-memory array
gt_phased_region = gt_region_lazy[:, :-n_colony_parents].compute()
pos_phased_region.shape, gt_phased_region.shape

((390585,), (390585, 1142, 2))

## awaiting mvn haplotypes...

In [5]:
# load mvn haplotypes
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open
# until it is closed, so read the two arrays we need inside a context manager.
# The arrays are read into memory on access and remain valid after the
# archive is closed.
with np.load('../data/phasing_extra_phase1.mvncall.200.npz') as callset_extras:
    pos_extras = callset_extras['variants']['POS']
    gt_extras = callset_extras['calldata']['genotype']
pos_extras.shape, gt_extras.shape

((3,), (3, 765, 2))

In [6]:
# concatenate
# Stacking along the variants axis only makes sense when both callsets have
# the same sample and ploidy dimensions. Check this up front so that a
# mismatch (e.g. scaffold and extras taken from different releases) fails with
# a clear message.
# NOTE(review): equal shapes do not prove the samples are in the same order -
# confirm against the sample lists of both callsets.
if gt_phased_region.shape[1:] != gt_extras.shape[1:]:
    raise ValueError(
        'cannot combine scaffold genotypes %r with extra genotypes %r: '
        'sample/ploidy dimensions differ'
        % (gt_phased_region.shape, gt_extras.shape)
    )
gt_combined = np.concatenate([gt_phased_region, gt_extras], axis=0)
pos_combined = np.concatenate([pos_phased_region, pos_extras], axis=0)

# sort by position
# Use a stable sort so that variants sharing a position keep a deterministic
# relative order (scaffold rows before extras) across runs.
idx_sorted = np.argsort(pos_combined, kind='mergesort')
gt_combined = gt_combined[idx_sorted]
pos_combined = pos_combined[idx_sorted]

In [7]:
# The unphased callset is consulted only for its variant annotations.
callset = phase1_ar31.callset
pos_all = allel.SortedIndex(callset['2L/variants/POS'])
# read the whole ANN array, then keep just the fields that get saved later
ann_fields = ['Annotation', 'HGVS_p', 'HGVS_c']
ann_all = callset['2L/variants/ANN'][:][ann_fields]
ann_all

array([(b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'), ...,
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.')], 
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14'), ('HGVS_c', 'S12')])

In [8]:
# locate the intersection with unphased callset - needed to tie in annotations
# loc1 marks the rows of pos_all that occur in pos_combined; loc2 marks the
# rows of pos_combined that occur in pos_all.
loc1, loc2 = pos_all.locate_intersection(pos_combined)
n_found = np.count_nonzero(loc1)
# Annotations are later taken as ann_all[loc1] and stored alongside
# pos_combined, so every combined position must be matched exactly once;
# otherwise annotations and haplotypes would silently end up misaligned.
if not np.all(loc2) or n_found != len(pos_combined):
    raise ValueError(
        'combined positions do not map one-to-one onto the unphased callset: '
        '%d combined positions, %d present in the unphased callset, '
        '%d unphased rows selected'
        % (len(pos_combined), np.count_nonzero(loc2), n_found)
    )
n_found

341998

In [9]:
# extract annotations for the phased variants
# loc1 (from pos_all.locate_intersection) selects the rows of ann_all whose
# position also occurs in pos_combined. This assumes the selection has
# one row per combined variant, in the same order - TODO confirm.
ann_combined = ann_all[loc1]
ann_combined

array([(b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'), ...,
       (b'upstream_gene_variant', b'.', b'n.-9G>A'),
       (b'upstream_gene_variant', b'.', b'n.-9T>G'),
       (b'upstream_gene_variant', b'.', b'n.-9G>T')], 
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14'), ('HGVS_c', 'S12')])

In [10]:
# save
# view the genotypes as haplotypes before writing them out
haps_combined = allel.GenotypeArray(gt_combined).to_haplotypes()
# The three arrays are stored side by side and are only meaningful if row i of
# each refers to the same variant, so refuse to write a misaligned archive.
n_variants = haps_combined.shape[0]
if not (len(pos_combined) == n_variants and len(ann_combined) == n_variants):
    raise ValueError(
        'refusing to save misaligned arrays: %d haplotype rows, '
        '%d positions, %d annotations'
        % (n_variants, len(pos_combined), len(ann_combined))
    )
np.savez_compressed('../data/haps_phase1.npz', haplotypes=haps_combined, POS=pos_combined, ANN=ann_combined)

In [11]:
# size in bytes reported by the combined haplotype array, for comparison with
# the size of the compressed file on disk
haps_combined.nbytes

523256940

In [12]:
# show the on-disk size of the compressed archive written by the save cell
!ls -lh ../data/haps_phase1.npz

-rw-rw-r-- 1 chris chris 12M Dec  1 14:26 ../data/haps_phase1.npz
