In [2]:
import concurrent.futures
import datetime
import os

import pandas as pd
from pandarallel import pandarallel

# Set up pandarallel so DataFrames expose `parallel_apply`; progress bars are switched off.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [47]:
# Upper bound on the threads that df_to_vcf's ThreadPoolExecutor uses to format samples.
NUM_WORKERS = 8
def get_current_date_ddmmyy():
    """Return the current local date as a six-character ``DDMMYY`` string."""
    return datetime.datetime.now().strftime("%d%m%y")


def create_info_lines(file,col_info_file):
    """Build the VCF meta-information (``##``) lines.

    Parameters
    ----------
    file : str
        Path of the source table; it is only recorded in the ``##sourcefile`` line.
    col_info_file : str
        CSV read with ``pd.read_csv``; each row must provide the columns
        ``type``, ``ID``, ``Number``, ``Dtype`` and ``Description``.

    Returns
    -------
    pandas.Series
        Four fixed lines (file format, date, source file, reference) followed
        by one structured line per row of ``col_info_file``.
    """
    def _meta_line(row):
        # One structured header line per row of the column-description table.
        return (
            f'##{row.type}=<ID={row.ID},Number={row.Number},'
            f'Type={row.Dtype},Description={row.Description}>'
        )

    fixed_lines = pd.Series([
        '##fileformat=VCFv4.2',
        f'##date={get_current_date_ddmmyy()}',
        f'##sourcefile={file}',
        '##reference=hg38',
    ])
    # Cells holding exactly one space are treated as missing and become '.'.
    column_table = pd.read_csv(col_info_file).replace(' ','.')
    column_lines = column_table.apply(_meta_line, axis=1)
    return pd.concat([fixed_lines, column_lines])

def prepare_sample(df,id):
    """Format one sample's genotype fields as a ``GT:DP:GQ:AB`` string per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``{id}:GT``, ``{id}:DP``, ``{id}:GQ``
        and ``{id}:AB``.
    id : str
        Sample identifier; used as the column prefix and as the name of the
        returned Series.

    Returns
    -------
    pandas.Series
        One colon-joined string per row of ``df``, named ``id``. Missing
        values are written as ``./.`` for GT and ``.`` for the other fields.

    Raises
    ------
    KeyError
        If any of the four sample columns is absent from ``df``.

    Notes
    -----
    The string is assembled with vectorised column operations instead of a
    row-wise ``parallel_apply``, so the function no longer spawns worker
    processes when it is itself called from a thread pool.
    """
    def _count_field(column):
        # Blank or non-numeric cells become missing instead of crashing the
        # integer cast; -1 is the missing-value sentinel and is printed as '.'.
        numbers = pd.to_numeric(df[column], errors='coerce')
        whole = numbers.fillna(-1).astype(int)
        return whole.astype(str).mask(whole == -1, '.')

    genotype = df[f'{id}:GT'].replace(' ','./.').fillna('./.').astype(str)

    balance = df[f'{id}:AB']
    balance_text = balance.astype(str)
    # AB values of 0 and -1 are treated as missing, as are NaN and blank cells.
    balance_missing = (
        balance.isna()
        | balance.isin([0, -1])
        | balance_text.str.strip().eq('')
    )
    balance_text = balance_text.mask(balance_missing, '.')

    fields = [genotype, _count_field(f'{id}:DP'), _count_field(f'{id}:GQ'), balance_text]
    combined = fields[0]
    for field in fields[1:]:
        combined = combined + ':' + field
    return combined.rename(id)

def df_to_vcf(file,info_col):
    """Convert a variant table (CSV) into a DataFrame laid out like a VCF body.

    Parameters
    ----------
    file : str
        CSV with the columns ``CHROM``, ``POS``, ``REF``, ``ALT``, ``FILTER``,
        every INFO column listed in ``info_col``, and per-sample columns named
        ``<sample>:GT``, ``<sample>:DP``, ``<sample>:GQ`` and ``<sample>:AB``.
    info_col : str
        CSV with at least the columns ``type`` and ``ID``. Rows whose ``type``
        is ``INFO`` select the INFO columns; rows whose ``type`` is ``FORMAT``
        supply the keys of the FORMAT string.

    Returns
    -------
    pandas.DataFrame
        Columns ``#CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT``
        followed by one column per sample, in the order the samples first
        appear in ``file``.
    """
    info_df = pd.read_csv(info_col)
    df = pd.read_csv(file, low_memory=False)

    res_df = pd.DataFrame()
    res_df['#CHROM'] = df.CHROM.str.replace('chr','', regex=False)
    res_df['POS'] = df.POS
    res_df['ID'] = '.'
    res_df['REF'] = df.REF
    res_df['ALT'] = df.ALT
    res_df['QUAL'] = 0
    # Both blank and missing filters are written as the VCF missing value.
    res_df['FILTER'] = df.FILTER.replace(' ','.').fillna('.')

    # Build the INFO field from a cleaned copy of just the INFO columns, so the
    # input frame is not mutated and only the relevant columns are processed.
    info_ids = info_df.loc[info_df['type'] == 'INFO', 'ID'].tolist()
    if info_ids:
        info_values = df[info_ids].replace(' ','.').fillna('.')
        pieces = [f'{name}=' + info_values[name].astype(str) for name in info_ids]
        info_field = pieces[0]
        for piece in pieces[1:]:
            info_field = info_field + ';' + piece
        res_df['INFO'] = info_field
    else:
        # No INFO annotations configured: use the VCF missing value.
        res_df['INFO'] = '.'

    # NOTE(review): prepare_sample always emits GT:DP:GQ:AB in that order; the
    # FORMAT rows of the description file are assumed to match - confirm.
    format_ids = info_df.loc[info_df['type'] == 'FORMAT', 'ID'].tolist()
    res_df['FORMAT'] = ':'.join(format_ids)

    # Sample ids in order of first appearance (a set would shuffle the columns
    # from run to run).
    samples = list(dict.fromkeys(
        column.split(':')[0] for column in df.columns if ':' in column
    ))
    if not samples:
        # Sites-only table: nothing to append (pd.concat would raise on []).
        return res_df

    # Format the samples concurrently; executor.map yields results in input
    # order, which keeps the sample columns deterministic.
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        results = list(executor.map(lambda sample_id: prepare_sample(df, sample_id), samples))

    return pd.concat([res_df] + results, axis=1)


In [49]:
# Variant table (CSV) to convert to VCF.
file = 'data/pipeline_outputs/variants_with_layers/2023-09-24-new_sinclair/qualityDSD_variants.csv'
# CSV describing the INFO/FORMAT fields (columns: type, ID, Number, Dtype, Description).
col_info_file = 'cols_info.csv'
def main(file, col_info_file, output_name=None):
    """Convert a variant CSV to a VCF file on disk.

    Parameters
    ----------
    file : str
        Path of the variant table passed to ``create_info_lines`` and
        ``df_to_vcf``.
    col_info_file : str
        Path of the CSV describing the INFO/FORMAT fields.
    output_name : str, optional
        Destination path. When omitted, the base name of ``file`` with its
        extension replaced by ``.vcf`` is used, relative to the working
        directory.

    Returns
    -------
    None
        The VCF is written to ``output_name`` and progress is printed.
    """
    print("Prepare info")
    info_s = create_info_lines(file, col_info_file)
    print("Prepare data")
    vcf_df = df_to_vcf(file, col_info_file)
    if output_name is None:
        # Swap only the extension; str.replace('csv', 'vcf') would also rewrite
        # any 'csv' that happens to occur elsewhere in the name.
        stem, _ = os.path.splitext(os.path.basename(file))
        output_name = f'{stem}.vcf'
    print("saving")
    # Write the '##' lines verbatim first. Concatenating them with the wide
    # data frame pads every header line with empty tab-separated cells and
    # routes them through CSV quoting.
    with open(output_name, 'w', newline='') as handle:
        for line in info_s:
            handle.write(f'{line}{os.linesep}')
        # The frame's own column names become the '#CHROM ...' header row.
        vcf_df.to_csv(handle, sep='\t', index=False)
    print(f'Saved as {output_name}')
# Run the CSV-to-VCF conversion for the input paths configured in this cell.
main(file, col_info_file)

Prepare info
Prepare data
saving
Saved as data/pipeline_outputs/variants_with_layers/2023-09-24-new_sinclair/qualityDSD_variants.vcf


In [10]:
# Load the variant table on its own for interactive inspection.
df = pd.read_csv(file)
# Show the first 51 column names.
df.columns[:51]

Index(['CHROM', 'POS', 'REF', 'ALT', 'FILTER', 'AF', 'AF_popmax', 'GHid',
       'GH_is_elite', 'GH_type', 'geneHancer', 'repeatsMasker', 'DSDgenes_1mb',
       'DSDgenes_1.5mb', 'distance_from_nearest_DSD_TSS', 'INTERVAL_ID',
       'from', 'to', 'length', 'median_DP', 'median_GQ', 'total_probands',
       'sinclair_probands', 'AF_sinclair', 'local_AF_overall', 'stringent_AF',
       'quality', 'AS22WG001:GT', 'AS22WG001:DP', 'AS22WG001:GQ',
       'AS22WG001:AB', 'AS22WG002:GT', 'AS22WG002:DP', 'AS22WG002:GQ',
       'AS22WG002:AB', 'AS22WG003:GT', 'AS22WG003:DP', 'AS22WG003:GQ',
       'AS22WG003:AB', 'AS22WG004:GT', 'AS22WG004:DP', 'AS22WG004:GQ',
       'AS22WG004:AB', 'AS22WG005:GT', 'AS22WG005:DP', 'AS22WG005:GQ',
       'AS22WG005:AB', 'AS22WG006:GT', 'AS22WG006:DP', 'AS22WG006:GQ',
       'AS22WG006:AB'],
      dtype='object')

In [39]:
def prepare_sample(df,id):
    """Format one sample's genotype fields as a ``GT:DP:GQ:AB`` string per row.

    This cell re-defines the ``prepare_sample`` that appears earlier in the
    notebook; keep the two definitions in sync.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``{id}:GT``, ``{id}:DP``, ``{id}:GQ``
        and ``{id}:AB``.
    id : str
        Sample identifier; used as the column prefix and as the name of the
        returned Series.

    Returns
    -------
    pandas.Series
        One colon-joined string per row of ``df``, named ``id``. Missing
        values are written as ``./.`` for GT and ``.`` for the other fields.

    Raises
    ------
    KeyError
        If any of the four sample columns is absent from ``df``.
    """
    def _count_field(column):
        # Blank or non-numeric cells become missing instead of crashing the
        # integer cast; -1 is the missing-value sentinel and is printed as '.'.
        numbers = pd.to_numeric(df[column], errors='coerce')
        whole = numbers.fillna(-1).astype(int)
        return whole.astype(str).mask(whole == -1, '.')

    genotype = df[f'{id}:GT'].replace(' ','./.').fillna('./.').astype(str)

    balance = df[f'{id}:AB']
    balance_text = balance.astype(str)
    # AB values of 0 and -1 are treated as missing, as are NaN and blank cells.
    balance_missing = (
        balance.isna()
        | balance.isin([0, -1])
        | balance_text.str.strip().eq('')
    )
    balance_text = balance_text.mask(balance_missing, '.')

    fields = [genotype, _count_field(f'{id}:DP'), _count_field(f'{id}:GQ'), balance_text]
    combined = fields[0]
    for field in fields[1:]:
        combined = combined + ':' + field
    return combined.rename(id)
# Spot-check the formatter on a single sample of the loaded table.
sid = 'AS23WG023'
prepare_sample(df,sid)
# Uncomment to look at the raw DP column for the same sample:
# df[f'{sid}:DP']

0         ./.:.:.:.
1         ./.:.:.:.
2         ./.:.:.:.
3         ./.:.:.:.
4         ./.:.:.:.
            ...    
166742    ./.:.:.:.
166743    ./.:.:.:.
166744    ./.:.:.:.
166745    ./.:.:.:.
166746    ./.:.:.:.
Name: AS23WG023, Length: 166747, dtype: object

In [21]:
# Inspect the raw GT column of one sample; the values display as blank strings
# (presumably single spaces, which prepare_sample maps to './.' - confirm).
df['H7FCVALXX_3_170116_FD02524749_161220:GT']

0          
1          
2          
3          
4          
         ..
166742     
166743     
166744     
166745     
166746     
Name: H7FCVALXX_3_170116_FD02524749_161220:GT, Length: 166747, dtype: object