In [1]:
def calcAndAppendStatValForScan(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, hapsInSubWin, statVals):
    """Compute one summary statistic for a sub-window and append it to ``statVals[statName]``.

    Adapted from https://github.com/kern-lab/diploSHIC/blob/master/fvTools.py

    Parameters
    ----------
    alleleCounts : allele counts handed to the scikit-allel diversity functions.
    snpLocs      : variant positions handed to the scikit-allel diversity functions.
    statName     : one of "tajD", "pi", "thetaW", "H12" or "H2/H1".
    subWinStart, subWinEnd : passed as ``start`` / ``stop`` to the diversity functions.
    hapsInSubWin : haplotypes of the sub-window; only used for "H12" and "H2/H1".
    statVals     : dict mapping statistic name -> list; mutated in place.

    Returns
    -------
    None. For an unrecognised ``statName`` a message is printed and nothing is appended.
    """
    # Both haplotype-homozygosity statistics come out of the same garud_h call,
    # which returns (H1, H12, H123, H2/H1); only two of the four are needed here.
    if statName in ('H2/H1', 'H12'):
        _, h12, _, h21 = allel.stats.selection.garud_h(hapsInSubWin)
        statVals[statName].append(h21 if statName == 'H2/H1' else h12)
        return

    # The three diversity statistics share the same window bounds.
    window = {"start": subWinStart, "stop": subWinEnd}

    if statName == "tajD":
        # tajima_d takes the allele counts first and the positions as a keyword.
        results = statVals[statName]
        results.append(allel.stats.diversity.tajima_d(alleleCounts, pos=snpLocs, **window))
        return

    if statName == "pi":
        results = statVals[statName]
        results.append(allel.stats.diversity.sequence_diversity(snpLocs, alleleCounts, **window))
        return

    if statName == "thetaW":
        results = statVals[statName]
        results.append(allel.stats.diversity.watterson_theta(snpLocs, alleleCounts, **window))
        return

    print(statName + " not found")

In [2]:
import glob
import os.path
import re
import subprocess

import allel
import numpy as np
import pandas as pd
from allel.model.ndarray import SortedIndex
from allel.util import asarray_ndim

# Directory holding the simulation outputs; the VCFs are read from here and the
# scan-result tables are read from / written back to the same place.
datadir = "/media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/"

# Collect every *.vcf.gz directly inside datadir.
# glob expands the pattern in-process, so no shell is spawned and directory names
# containing spaces or shell metacharacters are handled correctly. glob.escape keeps
# any wildcard characters in datadir itself literal. sorted() reproduces the
# alphabetical order that `ls` used to give.
fileList = sorted(glob.glob(glob.escape(datadir) + "*.vcf.gz"))

winsize = 1000     # how big should the sliding window be around each SNP?

In [3]:
# For every VCF: compute windowed summary statistics centred on each SNP, then append
# them as new columns to that simulation's existing scan-results table.

# Half-width of the window placed around each SNP; it does not change between SNPs or files.
halfWin = int(winsize) // 2

for vcfPath in fileList:
    print("Processing " + vcfPath)
    vcf = allel.read_vcf(vcfPath, fields = ["CHROM", "POS", "GT"])
    if vcf is None:
        # read_vcf returns None when the file contains no variant records.
        print("No variants found in " + vcfPath + ", skipping")
        continue

    # The sim number is the first run of 5 digits in the VCF *file name*. Only the base
    # name is searched so that digits in a parent directory can never be picked up.
    m = re.search('[0-9]{5}', os.path.basename(vcfPath))      # m is a regex match object
    if m is None:
        raise ValueError("Could not find a 5-digit simulation number in " + vcfPath)
    simNum = m.group(0)

    g    = allel.GenotypeArray(vcf["calldata/GT"])
    ac   = g.count_alleles()
    haps = g.to_haplotypes()
    pos  = vcf['variants/POS']      # assumed sorted ascending (searchsorted below relies on it)

    # One list per statistic; each gets one value per SNP, in the order of `pos`.
    statvals = { "tajD"   : [],
                 "pi"     : [],
                 "thetaW" : [],
                 "H12"    : [],
                 "H2/H1"  : []}

    for SNP in pos:
        # Window of +/- halfWin around the focal SNP, with the start clamped to the first SNP.
        # NOTE(review): the end is not clamped to the last SNP, so windows at the two ends of
        # the sequence are treated asymmetrically - confirm this is intended.
        winStart = max(SNP - halfWin, pos[0])
        winEnd   = SNP + halfWin

        ####### subset the haplotype array ##############
        # Rows startInd .. stopInd-1 are exactly the variants with winStart <= position <= winEnd:
        # side='right' makes stopInd the index just past the last position <= winEnd.
        startInd = np.searchsorted(pos, winStart)
        stopInd  = np.searchsorted(pos, winEnd, side = 'right')

        # Slicing a HaplotypeArray yields a HaplotypeArray without first building an index list.
        hapsInSubWin = haps[startInd:stopInd]

        for statName in statvals:
            calcAndAppendStatValForScan(ac, pos, statName, winStart, winEnd, hapsInSubWin, statvals)

    scanResultsFile = datadir + simNum + "_Invers_ScanResults.txt"
    outfile         = datadir + simNum + "_Invers_ScanResults_new.txt"

    # Whitespace-delimited table whose fields may be wrapped in double quotes; first row is the
    # header. A distinct handle name keeps the VCF path variable from being clobbered.
    with open(scanResultsFile, 'r') as scanFile:
        allFeatures = [[field.strip('"') for field in line.split()] for line in scanFile]

    if not allFeatures:
        raise ValueError(scanResultsFile + " is empty")

    header = allFeatures[0]
    stats  = allFeatures[1:]
    featDf = pd.DataFrame(stats, columns = header)
    featDf = featDf[featDf.keep_loci == "TRUE"].reset_index(drop = True)

    # The join below is purely positional (row i of the table <-> SNP i of the VCF).
    # Presumably the kept loci correspond one-to-one, in order, to the VCF records - TODO confirm.
    # A length mismatch means the new columns would be misaligned or padded with NaN, so flag it.
    statDf = pd.DataFrame.from_dict(statvals)
    if len(statDf) != len(featDf):
        print("WARNING: " + str(len(featDf)) + " kept loci in " + scanResultsFile +
              " but " + str(len(statDf)) + " SNPs in " + vcfPath +
              "; joined statistics may be misaligned")

    featDf = featDf.join(statDf)
    featDf.to_csv(outfile, sep = " ", index = False)

Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10900_Invers_VCFallFILT.vcf.gz
Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10901_Invers_VCFallFILT.vcf.gz
Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10902_Invers_VCFallFILT.vcf.gz
Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10903_Invers_VCFallFILT.vcf.gz
Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10904_Invers_VCFallFILT.vcf.gz
Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10905_Invers_VCFallFILT.vcf.gz
Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10906_Invers_VCFallFILT.vcf.gz
Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10907_Invers_VCFallFILT.vcf.gz
Processing /media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/results_final/10908_Invers_VCFallFILT.vcf.gz
Processing