In [1]:
import os
from multiprocessing import Pool, Manager, cpu_count

import pandas as pd
from Bio import Align

from pyutils.constants import (
    OUTPUT_DIR,
    REF_SEQS,
    SRA_SUMMARY_FILE,
)
from pyutils.downstream_analysis import update_mutation_summary

In [2]:
# Load the SRA run summary table; the processing loop below reads its
# 'Experiment Title' and 'Run' columns.
sra_summary = pd.read_csv(SRA_SUMMARY_FILE)

In [3]:
# For every SRA run whose experiment title has an entry in REF_SEQS, tally
# the per-site nucleotide counts from the run's SAM alignments in parallel and
# write them as `<run>_base_freq.csv` inside the run's working directory.

# Leave one core free, but never request zero workers: Pool(0) raises
# ValueError on a single-core machine.
n_workers = max(1, cpu_count() - 1)

for _, row in sra_summary.iterrows():
    exp_name = row['Experiment Title']
    if exp_name not in REF_SEQS:
        continue

    sra_id = row['Run']
    experiment_dir = os.path.join(OUTPUT_DIR, exp_name.replace(' ', '_'))
    # experiment_dir already starts with OUTPUT_DIR; joining OUTPUT_DIR a
    # second time duplicates the prefix whenever OUTPUT_DIR is relative.
    work_dir = os.path.join(experiment_dir, sra_id)
    if not os.path.exists(work_dir):
        # Run has no working directory: nothing to summarise.
        continue

    sam_path = os.path.join(work_dir, f'{sra_id}.sam')
    with Pool(n_workers) as p, Manager() as m:
        # Manager-backed dict so every worker process updates the same tally.
        mutation_summary = m.dict()
        params = (
            (mutation_summary, alignment)
            for alignment in Align.parse(sam_path, 'sam')
        )
        # NOTE(review): update_mutation_summary is defined in
        # pyutils.downstream_analysis; it presumably fills the shared dict
        # with (site, nucleotide) -> count entries, which is the shape the
        # unpacking below expects -- confirm against its implementation.
        p.starmap(func=update_mutation_summary, iterable=params)
        # Copy to a plain dict while the manager is still alive; the proxy
        # becomes unusable once the `with` block exits.
        mutation_counts = dict(mutation_summary)

    mutation_records = [
        (ref_index, mut, count)
        for (ref_index, mut), count in mutation_counts.items()
    ]
    # Rows are nucleotides, columns are reference sites; combinations that
    # were never observed are written as 0.
    base_freq_df = (
        pd.DataFrame(mutation_records, columns=('site', 'nucleo', 'freq'))
        .sort_values('site')
        .pivot(index='nucleo', columns='site', values='freq')
        .fillna(0)
    )
    base_freq_df.to_csv(os.path.join(work_dir, f'{sra_id}_base_freq.csv'))