In [1]:
import io
import os
import csv
import gzip
import time
import multiprocessing
import resource
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import statsmodels.api as sm
import random
from collections import Counter
import matplotlib.colors as mcolors
from scipy.stats import poisson
import itertools
import collections
import scipy
from scipy.stats import chi2
from scipy.stats import friedmanchisquare
from scipy.stats import studentized_range
pd.options.mode.chained_assignment = None

In [2]:
def get_mem():
    current_memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    current_memory_usage_mb = current_memory_usage / 1024
    print(f"Current memory usage: {current_memory_usage_mb:.2f} MB")
def read_vcf(file):
    with io.TextIOWrapper(gzip.open(file,'r')) as f:
        lines =[l for l in f if not l.startswith('##')]
        dynamic_header_as_key = []
        for liness in f:
            if liness.startswith("#CHROM"):
                dynamic_header_as_key.append(liness)
        values = [str,int,str,str,str,int,str,str,str,str]
        columns2detype = dict(zip(dynamic_header_as_key,values))
        df = pd.read_csv(
            io.StringIO(''.join(lines)),
            dtype=columns2detype,
            sep='\t'
        ).rename(columns={'#CHROM':'CHROM'})
    df['CHROM'] = df['CHROM'].str.extract(r'(\d+)')
    df['CHROM'] = df['CHROM'].astype(int)
    return df
def extract_info(df, info_cols = ['EAF', 'INFO_SCORE'], attribute = 'INFO', drop_attribute = True):
    for i in info_cols:
        df[i] = df[attribute].str.extract( i + '=([^;]+)' )
        df[i] = df[i].astype(float)
    if drop_attribute:
        df = df.drop(columns = [attribute])
    return df
def extract_format(df, sample, fmt = 'FORMAT'):
    fields = df[fmt].values[0].split(':')
    try:
        df[fields] = df[sample].str.split(':', expand=True)
        df[df.columns[-1]] = df[df.columns[-1]].astype(float)
        if len(fields) != len(df[sample].values[0].split(':')):
            raise ValueError("Mismatching fields in FORMAT and Imputed results.")
    except ValueError as e:
        print(f"Error: {e}")
    return df.drop(columns = [fmt, sample])
def drop_cols(df, drop_lst = ['ID', 'QUAL', 'FILTER']):
    return df.drop(columns = drop_lst)

def parse_vcf(file, sample, q = None, 
              info_cols = ['EAF', 'INFO_SCORE'], attribute = 'INFO', fmt = 'FORMAT', drop_attribute = True, drop_lst = ['ID', 'QUAL', 'FILTER']):
    df = read_vcf(file)
    df = extract_info(df, info_cols = info_cols, attribute = attribute, drop_attribute = drop_attribute)
    df = extract_format(df, sample, fmt = fmt)
    df = drop_cols(df, drop_lst = drop_lst)
    if q is None:
        return df
    else:
        q.put(df)
def concat_vcf(lst):
    df = lst[0]
    for i in range(1, len(lst)):
        df = pd.concat([df, lst[i]])
    return df.reset_index(drop = True)

def multi_parse(chromosomes, files, sample, n_processes = 22,
               info_cols = ['EAF', 'INFO_SCORE'], attribute = 'INFO', fmt = 'FORMAT', drop_attribute = True, drop_lst = ['ID', 'QUAL', 'FILTER']):
    manager = multiprocessing.Manager()
    q = manager.Queue()
    processes = []
    for i in range(len(chromosomes)):
        tmp = multiprocessing.Process(target=parse_vcf, args=(files[i], sample, q, info_cols, attribute, fmt, drop_attribute, drop_lst))
        tmp.start()
        processes.append(tmp)
    for process in processes:
        process.join()
    res_lst = []
    while not q.empty():
        res_lst.append(q.get())
    return res_lst

In [3]:
chromosomes = [i for i in range(1,23)]
sample = 'GM8'
files = ['../../GAMCC_oneKG/test/chr' + str(i) + '.vcf.gz' for i in chromosomes]
res = multi_parse(chromosomes, files, sample)
df = concat_vcf(res).sort_values(by = ['CHROM', 'POS'])
df

In [4]:
start = time.time()
df = df.sort_values(by = ['CHROM', 'POS'])
end = time.time()
print(end - start)
df

57.08745741844177


Unnamed: 0,CHROM,POS,REF,ALT,EAF,INFO_SCORE,GT,GP,DS
50840781,1,10397,C,A,0.00174,1.00000,0|0,"0.998,0.002,0",0.002
50840782,1,10420,A,C,0.00103,1.00000,0|0,"0.998,0.002,0",0.002
50840783,1,10437,T,C,0.00181,1.00000,0|0,"0.998,0.002,0",0.002
50840784,1,10438,A,T,0.00270,1.00000,0|0,"0.998,0.002,0",0.002
50840785,1,10440,C,A,0.02091,0.35439,0|0,"0.998,0.002,0",0.002
...,...,...,...,...,...,...,...,...,...
1785595,22,50807823,T,G,0.00102,1.00000,0|0,"0.998,0.002,0",0.002
1785596,22,50807834,T,G,0.00100,1.00000,0|0,"0.998,0.002,0",0.002
1785597,22,50807836,C,A,0.00522,0.54168,0|0,"0.998,0.002,0",0.002
1785598,22,50807841,T,G,0.00115,1.00000,0|0,"0.998,0.002,0",0.002


In [5]:
df.dtypes

CHROM           int64
POS             int64
REF            object
ALT            object
EAF           float64
INFO_SCORE    float64
GT             object
GP             object
DS            float64
dtype: object