## Import packages and define some functions

In [1]:
import matplotlib.pyplot as plt
import matplotlib as matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import scipy as scipy

from collections import Counter
import os

# report the version of every analysis library, in import order
tuple(pkg.__version__ for pkg in (matplotlib, np, pd, sns, scipy))

('3.9.2', '1.26.4', '2.2.2', '0.13.2', '1.13.1')

In [2]:

# lookup table: reference sequence name -> GenBank/RefSeq accession
chr_ID_to_genbank = dict([
    ('Ca22chr1A_C_albicans_SC5314', 'NC_032089.1'),
    ('Ca22chr2A_C_albicans_SC5314', 'NC_032090.1'),
    ('Ca22chr3A_C_albicans_SC5314', 'NC_032091.1'),
    ('Ca22chr4A_C_albicans_SC5314', 'NC_032092.1'),
    ('Ca22chr5A_C_albicans_SC5314', 'NC_032093.1'),
    ('Ca22chr6A_C_albicans_SC5314', 'NC_032094.1'),
    ('Ca22chr7A_C_albicans_SC5314', 'NC_032095.1'),
    ('Ca22chrRA_C_albicans_SC5314', 'NC_032096.1'),
])

# lookup table: accession -> short chromosome label;
# the extra 'genome' entry maps to itself so that label passes through unchanged
genbank_to_chr_no = dict([
    ('NC_032089.1', 'Chr1'),
    ('NC_032090.1', 'Chr2'),
    ('NC_032091.1', 'Chr3'),
    ('NC_032092.1', 'Chr4'),
    ('NC_032093.1', 'Chr5'),
    ('NC_032094.1', 'Chr6'),
    ('NC_032095.1', 'Chr7'),
    ('NC_032096.1', 'ChrR'),
    ('genome', 'genome'),
])

In [3]:
cov_file_list = sorted(
    name for name in os.listdir('./coverage_hists/') if name.endswith('.cov.txt')
)
# list of coverage histograms generated by Bedtools genomecov
# os.listdir returns entries in arbitrary order and also reports any unrelated
# file in the folder (e.g. hidden OS files), so keep only *.cov.txt and sort
# to make the downstream table order reproducible
cov_file_list

['HIS4_AA_1.cov.txt',
 'HIS4_AA_2.cov.txt',
 'HIS4_AA_3.cov.txt',
 'HIS4_AA_4.cov.txt',
 'HIS4_BB_1.cov.txt',
 'HIS4_BB_2.cov.txt',
 'HIS4_BB_3.cov.txt',
 'HIS4_BB_4.cov.txt',
 'KSR1_1.cov.txt',
 'KSR1_2.cov.txt',
 'KSR1_3.cov.txt',
 'KSR1_4.cov.txt']

In [4]:
median_cov_dict_of_dict = {}
# dict of dict container for median coverage values:
# {histogram file name: {chromosome label: median coverage}}

for file in cov_file_list:
    hist_file = pd.read_csv(os.path.join('./coverage_hists/', file), sep='\t',
                            names=['chr_ID', 'coverage', 'n_obs', 'chr_len', 'density'])
    hist_file['chr_name'] = hist_file['chr_ID'].map(genbank_to_chr_no)
    # loop through files and get coverage info
    # sequence IDs that are not keys of genbank_to_chr_no become NaN in chr_name

    median_dict = {}
    # container for this file's per-chromosome medians

    for chrom, sub_df in hist_file.groupby('chr_name', sort=False):
        # loop through chromosomes and slice df
        # groupby skips NaN labels, so unmapped sequences are ignored instead of
        # producing an empty slice (np.median([]) -> nan plus a RuntimeWarning);
        # sort=False keeps the chromosomes in their order of appearance in the file

        raw_cov = np.repeat(sub_df['coverage'].to_numpy(), sub_df['n_obs'].to_numpy())
        # convert format to generate full distributions:
        # each coverage value is repeated n_obs times, in one vectorized call
        # rather than row-by-row Python list multiplication

        median_dict[chrom] = np.median(raw_cov)
        # compute median

    median_cov_dict_of_dict[file] = median_dict
    # add column info to dict of dicts



In [5]:
# make df: build with one column per file, then flip so that
# each file is a row and each chromosome label is a column
median_cov_df = pd.DataFrame(median_cov_dict_of_dict).transpose()

In [6]:
# add relative coverage: each chromosome's median divided by the
# 'genome' median of the same sample, stored as '<chromosome>_rel'
chrom_cols = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'Chr6', 'Chr7', 'ChrR']
genome_median = median_cov_df['genome']

for chrom_col in chrom_cols:
    median_cov_df[chrom_col + '_rel'] = median_cov_df[chrom_col].div(genome_median)

median_cov_df

Unnamed: 0,Chr1,Chr2,Chr3,Chr4,Chr5,Chr6,Chr7,ChrR,genome,Chr1_rel,Chr2_rel,Chr3_rel,Chr4_rel,Chr5_rel,Chr6_rel,Chr7_rel,ChrR_rel
HIS4_AA_1.cov.txt,130.0,136.0,136.0,136.0,133.0,133.0,140.0,135.0,134.0,0.970149,1.014925,1.014925,1.014925,0.992537,0.992537,1.044776,1.007463
HIS4_AA_2.cov.txt,21.0,23.0,21.0,21.0,21.0,22.0,23.0,23.0,22.0,0.954545,1.045455,0.954545,0.954545,0.954545,1.0,1.045455,1.045455
HIS4_AA_3.cov.txt,25.0,26.0,25.0,25.0,24.0,23.0,25.0,27.0,25.0,1.0,1.04,1.0,1.0,0.96,0.92,1.0,1.08
HIS4_AA_4.cov.txt,10.0,10.0,9.0,10.0,9.0,9.0,9.0,10.0,10.0,1.0,1.0,0.9,1.0,0.9,0.9,0.9,1.0
HIS4_BB_1.cov.txt,98.0,99.0,96.0,96.0,90.0,91.0,90.0,98.0,96.0,1.020833,1.03125,1.0,1.0,0.9375,0.947917,0.9375,1.020833
HIS4_BB_2.cov.txt,117.0,120.0,118.0,119.0,112.0,111.0,117.0,121.0,117.0,1.0,1.025641,1.008547,1.017094,0.957265,0.948718,1.0,1.034188
HIS4_BB_3.cov.txt,201.0,202.0,201.0,296.0,296.0,196.0,198.0,202.0,209.0,0.961722,0.966507,0.961722,1.416268,1.416268,0.937799,0.947368,0.966507
HIS4_BB_4.cov.txt,25.0,26.0,26.0,26.0,25.0,23.0,26.0,27.0,26.0,0.961538,1.0,1.0,1.0,0.961538,0.884615,1.0,1.038462
KSR1_1.cov.txt,94.0,96.0,96.0,93.0,131.0,90.0,92.0,98.0,98.0,0.959184,0.979592,0.979592,0.94898,1.336735,0.918367,0.938776,1.0
KSR1_2.cov.txt,31.0,33.0,32.0,33.0,30.0,31.0,31.0,33.0,32.0,0.96875,1.03125,1.0,1.03125,0.9375,0.96875,0.96875,1.03125


In [7]:
# export table
# column order: each chromosome's median followed by its relative value,
# with the 'genome' median last
export_cols = []
for chrom_col in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'Chr6', 'Chr7', 'ChrR']:
    export_cols.extend([chrom_col, chrom_col + '_rel'])
export_cols.append('genome')

median_cov_df[export_cols].to_csv('./genome_cov_table.csv')