In [46]:
import io
import os
import csv
import gzip
import time
import multiprocessing
import resource
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import statsmodels.api as sm
import random
from collections import Counter
import matplotlib.colors as mcolors
from scipy.stats import poisson
import itertools
import collections
import scipy
from scipy.stats import chi2
from scipy.stats import friedmanchisquare
from scipy.stats import studentized_range
pd.options.mode.chained_assignment = None

In [99]:
def get_mem():
    current_memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    current_memory_usage_mb = current_memory_usage / 1024
    print(f"Current memory usage: {current_memory_usage_mb:.2f} MB")
def read_vcf(file):
    with io.TextIOWrapper(gzip.open(file,'r')) as f:
        lines =[l for l in f if not l.startswith('##')]
        dynamic_header_as_key = []
        for liness in f:
            if liness.startswith("#CHROM"):
                dynamic_header_as_key.append(liness)
        values = [str,int,str,str,str,int,str,str,str,str]
        columns2detype = dict(zip(dynamic_header_as_key,values))
        df = pd.read_csv(
            io.StringIO(''.join(lines)),
            dtype=columns2detype,
            sep='\t'
        ).rename(columns={'#CHROM':'CHROM'})
    if df.iloc[:, 0].dtype != int:
        df['CHROM'] = df['CHROM'].str.extract(r'(\d+)')
    return df
def extract_info(df, info_cols = ['EAF', 'INFO_SCORE'], attribute = 'INFO', drop_attribute = True):
    for i in info_cols:
        df[i] = df[attribute].str.extract( i + '=([^;]+)' )
    if drop_attribute:
        df = df.drop(columns = [attribute])
    return df
def extract_format(df, sample, fmt = 'FORMAT'):
    fields = df[fmt].values[0].split(':')
    try:
        df[fields] = df[sample].str.split(':', expand=True)
        if len(fields) != len(df[sample].values[0].split(':')):
            raise ValueError("Mismatching fields in FORMAT and Imputed results.")
    except ValueError as e:
        print(f"Error: {e}")
    return df.drop(columns = [fmt, sample])
def drop_cols(df, drop_lst = ['ID', 'QUAL', 'FILTER']):
    return df.drop(columns = drop_lst)

def parse_vcf(file, sample, q = None, 
              info_cols = ['EAF', 'INFO_SCORE'], attribute = 'INFO', fmt = 'FORMAT', drop_attribute = True, drop_lst = ['ID', 'QUAL', 'FILTER']):
    df = read_vcf(file)
    print('reading: ', file)
    df = extract_info(df, info_cols = info_cols, attribute = attribute, drop_attribute = drop_attribute)
    df = extract_format(df, sample, fmt = fmt)
    df = drop_cols(df, drop_lst = drop_lst)
    if q is None:
        return df
    else:
        q.put(df)
def concat_vcf(lst):
    df = lst[0]
    for i in range(1, len(lst)):
        df = pd.concat([df, lst[i]])
    return df.reset_index(drop = True)

def multi_parse(chromosomes, files, sample, n_processes = 22,
               info_cols = ['EAF', 'INFO_SCORE'], attribute = 'INFO', fmt = 'FORMAT', drop_attribute = True, drop_lst = ['ID', 'QUAL', 'FILTER']):
    manager = multiprocessing.Manager()
    q = manager.Queue()
    processes = []
    for i in range(len(chromosomes)):
        tmp = multiprocessing.Process(target=parse_vcf, args=(files[i], sample, q, info_cols, attribute, fmt, drop_attribute, drop_lst))
        print('Process created:', str(i+1))
        tmp.start()
        processes.append(tmp)
    for process in processes:
        process.join()
    res_lst = []
    while not q.empty():
        res_lst.append(q.get())
    return res_lst

In [100]:
chromosomes = [i for i in range(1,23)]
sample = 'GM8'
files = ['../../GAMCC_oneKG/test/chr' + str(i) + '.vcf.gz' for i in chromosomes]
start = time.time()
res = multi_parse(chromosomes, files, sample)
end = time.time()
print(end - start)

Process created: 1
Process created: 2
Process created: 3
Process created: 4
Process created: 5
Process created: 6
Process created: 7
Process created: 8
Process created: 9
Process created: 10
Process created: 11
Process created: 12
Process created: 13
Process created: 14
Process created: 15
Process created: 16
Process created: 17
Process created: 18
Process created: 19
Process created: 20
Process created: 21
Process created: 22
reading:  ../../GAMCC_oneKG/test/chr19.vcf.gz
reading:  ../../GAMCC_oneKG/test/chr22.vcf.gz
reading:  ../../GAMCC_oneKG/test/chr20.vcf.gz
reading:  ../../GAMCC_oneKG/test/chr18.vcf.gz
reading:  ../../GAMCC_oneKG/test/chr15.vcf.gz
reading:  ../../GAMCC_oneKG/test/chr10.vcf.gz
reading:  ../../GAMCC_oneKG/test/chr11.vcf.gz
reading:  ../../GAMCC_oneKG/test/chr4.vcf.gz
151.6055736541748


In [101]:
get_mem()

Current memory usage: 39276.40 MB
