#### Presence absence analysis of the DK0911 genome based on orthology



#### Useful folders

- Warrior TE analysis: /home/benjamin/genome_assembly/Warrior/TE_analysis
- Pst_104_TE analysis: /home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/Warrior_comp_runs/REPET/TE_analysis
- comparative coverage: /home/benjamin/genome_assembly/Warrior/TE_analysis


## Start with filtering the mapping files at the appropriate level

In [1]:
%matplotlib inline

In [2]:
import os
from pybedtools import BedTool
import pandas as pd
import scipy
import pandas as pd
import numpy as np # need for  stats
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy.stats import mannwhitneyu
from scipy.stats import fisher_exact
from scipy.stats import spearmanr
from scipy.stats import wilcoxon
from scipy.stats import kruskal
from statsmodels.stats.multitest import multipletests
from seaborn import boxenplot
import pybedtools

In [7]:
# Orthology result directories, one per comparison direction (presumably OrthoFinder output — confirm).
DK0911_ortho_dir = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/comp_orthology/orthofinder/DK0911_vs_Ps104E_v13/'
Pst_104E_ortho_dir = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/comp_orthology/orthofinder/Ps104E_v13_vs_DK0911/'
# Tab-separated genome files: the second column is summed below to get the genome size,
# and the files are handed to bedtools shuffle as the genome definition.
DK0911_genome_file_fn = '/home/benjamin/genome_assembly/Warrior/genome_v04/DK_0911_v04_ph_ctg.genome_file'
Pst_104E_genome_file_fn = \
'/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/092017_assembly/Pst_104E_v13_ph_ctg.genome_file'
# Directory that receives every table and figure written by the cells below.
OUT_dir = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/comp_orthology/orthofinder/downstream_analysis/'
# NOTE(review): SRM_cov_dir, SRM_cov_outdir, SRM_cov_rand_outdir, WGA_cov_dir and
# WGA_cov_rand_outdir are used further down but are not assigned in the visible cells;
# confirm where they are defined before running the notebook top to bottom.

* do simple checks of whether any gene category is more likely to lack orthologs than others, e.g. BUSCOs vs effectors and such
* check if the missing orthologs overlap with other missing genes
* pull in the Synet file for synteny and ask if effectors are in longer blocks, or ask what is special about shorter blocks or genes with no hits


In [5]:
# Annotation tracks for Pst_104E: label -> absolute path of the BED/GFF file.
# Insertion order matters because later cells iterate over the keys.
Pst_104E_annotation_dir = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/comp_orthology/Pst104E_annotations/'
Pst_104E_dict = {
    'Busco': os.path.join(Pst_104E_annotation_dir, 'Pst_104E_v13_ph_ctg.busco.gene.bed'),
    'All_genes': os.path.join(Pst_104E_annotation_dir, 'Pst_104E_v13_ph_ctg.genes.gene.bed'),
    'Secretome': os.path.join(Pst_104E_annotation_dir, 'Pst_104E_v13_ph_ctg.secretome.gene.bed'),
    'Ceffectors': os.path.join(Pst_104E_annotation_dir, 'Pst_104E_v13_ph_ctg.ceffectors.gene.bed'),
    'EffectorP': os.path.join(Pst_104E_annotation_dir, 'Pst_104E_v13_ph_ctg.effectorp.gene.bed'),
    'TE_superfamily': os.path.join(Pst_104E_annotation_dir, 'Pst_104E_v13_ph_ctg.104Ep_DK0911p.REPET.superfamily.gff'),
}

In [6]:
# Annotation tracks for DK0911: label -> absolute path of the BED/GFF file.
# Insertion order matters because later cells iterate over the keys.
DK0911_annotation_dir = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/comp_orthology/DK0911_annotations/'
DK0911_dict = {
    'Busco': os.path.join(DK0911_annotation_dir, 'DK_0911_v04_ph_ctg.busco.gene.bed'),
    'All_genes': os.path.join(DK0911_annotation_dir, 'DK_0911_v04_ph_ctg.genes.gene.bed'),
    'Secretome': os.path.join(DK0911_annotation_dir, 'DK_0911_v04_ph_ctg.secretome.gene.bed'),
    # NOTE(review): 'Ceffectors' points at the same effectorp20 file as 'EffectorP',
    # whereas the Pst_104E dictionary uses a separate *.ceffectors.gene.bed file.
    # Confirm whether this is intended or a copy-paste slip.
    'Ceffectors': os.path.join(DK0911_annotation_dir, 'DK_0911_v04_ph_ctg.effectorp20.gene.bed'),
    'EffectorP': os.path.join(DK0911_annotation_dir, 'DK_0911_v04_ph_ctg.effectorp20.gene.bed'),
    'TE_superfamily': os.path.join(DK0911_annotation_dir, 'DK_0911_v04_ph_ctg.104Ep_DK0911p.REPET.superfamily.gff'),
}

In [None]:
def filter_bed(pairing):
    """Subtract a reference BED file from a target BED file and save the remainder.

    Parameters
    ----------
    pairing : sequence of three str
        ``(target, reference, out)`` file paths: the intervals to filter, the
        intervals to subtract from them, and the file the result is written to.

    Returns
    -------
    pandas.DataFrame
        The intervals written to the output file.

    Notes
    -----
    A mismatch between the two input file extensions is only reported, not
    treated as an error; the subtraction is still carried out.
    """
    target_bed_fn = pairing[0]
    ref_bed_fn = pairing[1]
    out_fn = pairing[2]
    if target_bed_fn.split('.')[-1] != ref_bed_fn.split('.')[-1]:
        # Say which files disagree so a wrong pairing can actually be tracked down.
        print('Warning: file extensions differ between target %s and reference %s'
              % (target_bed_fn, ref_bed_fn))
    target_bed = BedTool(target_bed_fn)
    ref_bed = BedTool(ref_bed_fn)
    filtered_bed = target_bed.subtract(ref_bed).saveas(out_fn)
    # The result lives in out_fn, so pybedtools' temporary files can be removed
    # before the frame is read back.
    pybedtools.cleanup()
    return filtered_bed.to_dataframe()

In [None]:
def non_covered_bases(df):
    """Return the summed interval length (``end - start``) over all rows of *df*.

    *df* must provide ``start`` and ``end`` columns, as a BED file loaded into a
    dataframe does.
    """
    interval_lengths = df['end'] - df['start']
    return sum(interval_lengths)

In [None]:
def randomize_bed(bed_in, genome_file_fn, out_dir, n=10):
    """Write *n* shuffled copies of a BED file using bedtools shuffle.

    Each copy is saved in *out_dir* under the input's base name with ``.bed``
    replaced by ``.random_<i>.bed`` (``i`` running from 0 to ``n - 1``).
    *genome_file_fn* is passed to shuffle as the genome definition.
    """
    name_template = os.path.basename(bed_in).replace('.bed', '.random_%s.bed')
    out_template = os.path.join(out_dir, name_template)
    source_bed = BedTool(bed_in)
    for replicate in range(n):
        shuffled = source_bed.shuffle(g=genome_file_fn)
        shuffled.saveas(out_template % replicate)
    # Drop the temporary files produced by the shuffles.
    pybedtools.cleanup()

In [None]:
def count_overlap_features(feature_bed, absence_bed, min_f_overlap):
    """Count the unique feature names that intersect the absence BED file.

    Parameters
    ----------
    feature_bed : str
        Path of the BED file holding the features (genes) to count.
    absence_bed : str
        Path of the BED file holding the absent / low-coverage regions.
    min_f_overlap : float
        Passed to bedtools intersect as ``f``, the minimum fraction of a
        feature that has to be overlapped.

    Returns
    -------
    int
        Number of distinct values in the ``name`` column of the intersection;
        0 when nothing intersects.
    """
    filtered_bed = BedTool(absence_bed)
    target_bed = BedTool(feature_bed)
    overlap_df = target_bed.intersect(filtered_bed, f=min_f_overlap).to_dataframe()
    # An empty intersection yields a frame without a 'name' column, which would
    # otherwise raise on attribute access (e.g. for a sparse shuffled replicate).
    if overlap_df.empty:
        return 0
    return overlap_df['name'].unique().shape[0]

In [None]:
def summary_dict_to_df(summary_dict, outer_index, inner_index):
    """Turn a summary dict of equally long lists into a (Cut-off, Type) indexed frame.

    *outer_index* (cut-off labels) and *inner_index* (feature types) each hold
    the labels twice, once per genome, so their first and second halves must be
    equal. Cut-off labels are converted to floats; if any label starts with
    ``'0'`` every ``'0'`` character is read as a decimal point (``'03'`` -> 0.3).
    The result is sorted by cut-off and type and contains only the first two
    columns of the frame built from *summary_dict*.
    """
    outer_half = int(len(outer_index) / 2)
    inner_half = int(len(inner_index) / 2)
    assert outer_index[:outer_half] == outer_index[outer_half:]
    assert inner_index[:inner_half] == inner_index[inner_half:]

    table = pd.DataFrame.from_dict(summary_dict)
    cut_off_labels = outer_index[:outer_half]
    zero_encoded = any(label.startswith('0') for label in outer_index)
    if zero_encoded:
        table['Cut_off'] = [float(label.replace('0', '.')) for label in cut_off_labels]
    else:
        table['Cut_off'] = [float(label) for label in cut_off_labels]
    table['Type'] = inner_index[:inner_half]

    table = table.sort_values(['Cut_off', 'Type'])
    index_tuples = list(zip(table['Cut_off'], table['Type']))
    table.index = pd.MultiIndex.from_tuples(index_tuples, names=['Cut-off', 'Type'])
    value_columns = table.columns[:2]
    return table.loc[:, value_columns].copy()

In [None]:
def get_TE_sf_df(fn):
    """Return the summed interval length per TE superfamily of a nine-column GFF.

    The file is read tab-separated without a header row; the ninth column is
    taken as the superfamily label and each interval counts
    ``stop - start + 1`` bases. The result is a Series indexed by superfamily.
    """
    gff_columns = ['Chrom', 'source', 'type', 'start', 'stop', 'score', 'strand', 'blank', 'superfamily']
    annotations = pd.read_csv(fn, sep='\t', header=None, names=gff_columns)
    annotations = annotations.assign(interval=annotations['stop'] - annotations['start'] + 1)
    return annotations.groupby('superfamily')['interval'].sum()

In [None]:
def count_base_overlap_TE(feature_bed, absence_bed):
    """Sum, per GFF attribute, the bases of a GFF file that intersect an absence BED file.

    Parameters
    ----------
    feature_bed : str
        Path of the GFF file whose intervals are intersected (here the TE
        superfamily annotation).
    absence_bed : str
        Path of the BED file holding the absent / low-coverage regions.

    Returns
    -------
    pandas.Series
        Indexed by the ``attributes`` column of the intersection, holding the
        summed ``end - start + 1`` of the intersected intervals. Empty when
        nothing intersects.
    """
    filtered_bed = BedTool(absence_bed)
    target_bed = BedTool(feature_bed)
    try:
        overlap_df = target_bed.intersect(filtered_bed).to_dataframe()
        # An empty intersection yields a frame without the expected columns;
        # return an empty result rather than failing on attribute access.
        if overlap_df.empty:
            return pd.Series(dtype='int64', name='interval')
        overlap_df['interval'] = overlap_df.end - overlap_df.start + 1
        return overlap_df.groupby('attributes')['interval'].sum()
    finally:
        # Remove pybedtools' temporary files whether or not the above succeeded.
        pybedtools.cleanup()

In [None]:
def plot_TE_cov_expect(genome, TE_random_dict, color, out_fn):
    """Plot per-superfamily distributions of shuffled TE overlaps against a reference value.

    Parameters
    ----------
    genome : str
        Key into the module-level ``TE_sf_cov_dict`` that supplies the
        reference value drawn as a red line in each panel.
    TE_random_dict : list or dict of pandas.Series
        One Series per shuffled replicate (superfamily -> overlapping bases);
        combined column-wise with ``pd.concat``, missing superfamilies count as 0.
    color : matplotlib colour
        Fill colour of the violins.
    out_fn : str
        File the figure is saved to.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding one panel per superfamily, five panels per row.
    """
    TE_random_df = pd.concat(TE_random_dict, axis=1, sort=True)
    TE_random_df.fillna(0, inplace=True)

    n_cols = 5
    superfamilies = list(TE_random_df.index)
    # Round up so a superfamily count that is not a multiple of five still gets
    # a panel for every superfamily; squeeze=False keeps the axes array 2-D
    # even when there is a single row.
    rows = max(1, -(-len(superfamilies) // n_cols))
    f1, ax = plt.subplots(rows, n_cols, figsize=(20, 30), squeeze=False)
    panels = ax.ravel()

    for position, key in enumerate(superfamilies):
        panel = panels[position]
        # NOTE(review): the reference value is the genome-wide coverage of the
        # superfamily from TE_sf_cov_dict, not its overlap with the observed
        # regions; confirm this is the intended comparison.
        overlap = TE_sf_cov_dict[genome][key]
        random_values = TE_random_df.loc[key, :]
        # Share of replicates strictly above the reference, doubled and folded
        # back into [0, 1].
        p = (random_values > overlap).sum() / len(random_values)
        two_side_p = 2 * p
        if two_side_p > 1:
            two_side_p = round(2 - two_side_p, 3)
        else:
            two_side_p = round(two_side_p, 3)
        sns.violinplot(y=random_values, color=color, ax=panel)
        panel.axhline(y=overlap, color='r', linestyle='-')
        panel.set_title(key, fontsize=16)
        panel.set_yticklabels([])
        panel.set_ylabel('')
        panel.set_xlabel('p=' + str(two_side_p), fontsize=16)
        # Only the first panel of each row carries the y-axis label.
        # NOTE(review): the plotted values are base counts, so the label text
        # may need updating.
        ax[position // n_cols, 0].set_ylabel('Number of overlapping genes', fontsize=16)

    # Hide the panels of the last row that have no superfamily assigned.
    for unused_panel in panels[len(superfamilies):]:
        unused_panel.set_visible(False)
    plt.tight_layout()
    plt.savefig(out_fn, dpi =300)
    return f1

In [None]:
from itertools import product

### Get some input values sorted

In [None]:
# Genome size per isolate: sum of the second column of the tab-separated genome file.
genome_size_dict = {
    'DK0911': pd.read_csv(DK0911_genome_file_fn, sep='\t', header=None)[1].sum(),
    'Pst_104E': pd.read_csv(Pst_104E_genome_file_fn, sep='\t', header=None)[1].sum(),
}
# Genome-wide bases per TE superfamily for each isolate.
TE_sf_cov_dict = {
    'DK0911': get_TE_sf_df(DK0911_dict['TE_superfamily']),
    'Pst_104E': get_TE_sf_df(Pst_104E_dict['TE_superfamily']),
}

### This part looks at generating filtered bed files and random shuffled bed files for 0.3 cut-off

The 0.3 coverage cut-off is chosen because the DK0911 BUSCOs remain fairly stable up to this coverage cut-off.

In [None]:
all_bed_fns = [os.path.join(SRM_cov_dir, x) for x in os.listdir(SRM_cov_dir) if x.endswith('bed') ] 

In [None]:
# Self-mapping files (reads mapped onto their own assembly) serve as references;
# every other BED file is a target that gets the matching reference subtracted.
reference_prefixes = ('DK_0911_v04_ph_ctg.bwamem.DK0911_gDNA.RG.',
                      'Pst_104E_v13_ph_ctg.bwamem.Pst79_folder5.')
references = [x for x in all_bed_fns if os.path.basename(x).startswith(reference_prefixes)]
references.sort()

targets = [x for x in all_bed_fns if x not in references]
targets.sort()

# Swap only the trailing 'bed' for 'filtered.bed'. Replacing every 'bed'
# substring with '.filtered.bed' produced a double dot ('..filtered.bed'), which
# breaks the later split('.')[-3] cut-off parsing, and would also rewrite any
# other 'bed' occurring inside the name. Every entry of all_bed_fns ends with
# 'bed', so slicing off the suffix is safe.
out_fns = [os.path.join(SRM_cov_outdir, os.path.basename(x)[:-len('bed')] + 'filtered.bed')
           for x in targets]

# Sorted targets and references are paired positionally.
pairings = list(zip(targets, references, out_fns))

In [None]:
#filter the parings
for pair in pairings:
    filter_bed(pair)

In [None]:
print("This is the file that got randomized: %s\n\n" % pairings[0][2])
print("This is the file that got randomized: %s\n\n" % pairings[-8][2])

In [None]:
#randomize the bedfiles of the lowcov targted. Yet first look at the results of fishers exact tests and
#pick the appropriate coveraged cut off
randomize_bed(pairings[-8][2], Pst_104E_genome_file_fn, SRM_cov_rand_outdir, n=5000)
randomize_bed(pairings[0][2], DK0911_genome_file_fn, SRM_cov_rand_outdir, n=5000)

### The next section compares the observed low coverage regions with different gene types in the two isolates
First the results are saved in a dictionary and then converted to a multiindex dataframe

In [None]:
Pst_104E_dict.keys()

In [None]:
# For every filtered low-coverage file, compare each gene category against all
# genes with Fisher's exact test and record the number of affected genes.
Fishers_summary_dict = {'DK0911': [], 'Pst_104E': []}
Absence_summary_dict = {'DK0911': [], 'Pst_104E': []}
outer_index = [] #being the cut off
inner_index = [] #being the type of gene to look at
min_overlap = 1
keys = ['All_genes','Busco', 'Ceffectors',  'EffectorP',   'Secretome']
# (file-name prefix, summary key, annotation dictionary) for each isolate
lowcov_isolates = (('DK_0911', 'DK0911', DK0911_dict),
                   ('Pst_104E_v13_ph_ctg', 'Pst_104E', Pst_104E_dict))
#pair[2] is the filtered lowcov file as reference
for pair in pairings:
    lowcov_fn = pair[2]
    lowcov_name = os.path.basename(lowcov_fn)
    for prefix, isolate, annotation_dict in lowcov_isolates:
        if not lowcov_name.startswith(prefix):
            continue
        cut_off = lowcov_name.split('.')[-3].replace('lowcov', '')
        # header=None: BED files carry no header row, so the default would
        # swallow the first record and under-count every total by one.
        expect_list = [count_overlap_features(annotation_dict['All_genes'], lowcov_fn, min_overlap),
                       pd.read_csv(annotation_dict['All_genes'], sep='\t', header=None, comment='#').shape[0]]
        for key in keys:
            inner_index.append(key)
            outer_index.append(cut_off)
            # Computed once and reused for both the test and the absence table.
            n_overlapping = count_overlap_features(annotation_dict[key], lowcov_fn, min_overlap)
            test_list = [n_overlapping,
                         pd.read_csv(annotation_dict[key], sep='\t', header=None, comment='#').shape[0]]
            # NOTE(review): the table rows are [overlapping, total] rather than
            # [overlapping, non-overlapping]; confirm this is the intended test.
            Fishers_summary_dict[isolate].append(fisher_exact([test_list, expect_list])[1])
            Absence_summary_dict[isolate].append(n_overlapping)

In [None]:
Fishers_df = summary_dict_to_df(Fishers_summary_dict, outer_index, inner_index)
Absence_df = summary_dict_to_df(Absence_summary_dict, outer_index, inner_index)

In [None]:
out_fn = os.path.join(OUT_dir, 'Lowcov_fisher_mo%s_df.tsv' % min_overlap)
Fishers_df.to_csv(out_fn, sep='\t')
Fishers_df

In [None]:
!wc -l {DK0911_dict['Secretome']}

In [None]:
out_fn = os.path.join(OUT_dir, 'Lowcov_absence_mo%s_df.tsv' % min_overlap)
Absence_df.to_csv(out_fn, sep='\t')
Absence_df

### Look over the different filtered lowcov regions and caculate the amount of uncovered bases

In [None]:
#loop over the different lowcoverage file, filter them and count the total bases of lowcoverage
# Loop over the low-coverage pairings, filter each one and sum the bases that
# remain uncovered after subtracting the self-mapping reference.
summary_dict = {'DK0911': [], 'Pst_104E': []}
index = []
for pair in pairings:
    non_cov_bases = non_covered_bases(filter_bed(pair))
    # Parse the file name only, so dots in directory names cannot shift the fields.
    name_fields = os.path.basename(pair[2]).split('.')
    cut_off = name_fields[-3].replace('lowcov','')
    index.append(cut_off)
    # The third name field identifies the mapped read set. Files with Pst79
    # reads are stored under 'DK0911' and files with DK0911 reads under
    # 'Pst_104E', exactly as before.
    if name_fields[2].startswith('Pst79_'):
        summary_dict['DK0911'].append(non_cov_bases)
    elif name_fields[2].startswith('DK0911_'):
        summary_dict['Pst_104E'].append(non_cov_bases)

length = len(index)
half = int(length/2)
# Both isolates must have been processed at the same cut-offs in the same order,
# otherwise the shared index below would mislabel rows.
if index[0:half] != index[half:]:
    raise ValueError('Cut-offs differ between the two isolates: %s vs %s' % (index[0:half], index[half:]))
df = pd.DataFrame.from_dict(summary_dict)
# Cut-off labels encode the decimal point as '0' (e.g. '03' -> 0.3).
df.index = [float(x.replace('0','.')) for x in index[0:half]]
df.sort_index(inplace=True)
#check on Pst_104E mapping on DK0911
out_fn = os.path.join(OUT_dir, 'Lowcov_missing_bases_df.tsv')
df.to_csv(out_fn, sep='\t')
df

### Now do the permutation tests at the 0.3 lowcov value for all gene groups in both isolates

In [None]:
Pst_random_dict_lowcov = {}
DK0911_random_dict_lowcov = {}

In [None]:
#loop over all gene files
#for each gene file do the whole permuation test and safe it as part of a dictionary
# Collect the shuffled low-coverage BED files of each isolate (sorted by name).
Pst_104E_random_fn_lowcov = sorted(
    os.path.join(SRM_cov_rand_outdir, x) for x in os.listdir(SRM_cov_rand_outdir) if x.startswith('Pst'))
DK0911_random_fn_lowcov = sorted(
    os.path.join(SRM_cov_rand_outdir, x) for x in os.listdir(SRM_cov_rand_outdir) if x.startswith('DK'))
Pst_random_dict_lowcov = {}
DK0911_random_dict_lowcov = {}

# For every gene category, count the overlapping genes in each shuffled file.
min_overlap = 1.0
for key in keys:
    Pst_random_dict_lowcov[key] = [
        count_overlap_features(Pst_104E_dict[key], rand_fn, min_overlap)
        for rand_fn in Pst_104E_random_fn_lowcov
    ]
    DK0911_random_dict_lowcov[key] = [
        count_overlap_features(DK0911_dict[key], rand_fn, min_overlap)
        for rand_fn in DK0911_random_fn_lowcov
    ]

In [None]:
# First two colours of seaborn's colour-blind palette: index 0 is used for the
# Pst_104E plots, index 1 for the DK0911 plots.
pallete = list(sns.color_palette('colorblind')[:2])
reference_bed_fn = pairings[-8][2]
columns = len(keys)
f1, ax = plt.subplots(1, columns, figsize=(20, 4))

# One panel per gene category: shuffled distribution plus the observed count.
for n, key in enumerate(keys):
    panel = ax[n]
    overlap = count_overlap_features(Pst_104E_dict[key],reference_bed_fn, min_overlap)
    random_values = Pst_random_dict_lowcov[key]
    # Share of shuffled replicates with strictly more overlapping genes than observed.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, and folded back as 2 - 2p when that exceeds 1.
    two_side_p = 2*p
    if two_side_p > 1:
        two_side_p = round(2 - two_side_p, 3)
    sns.violinplot(y=random_values, color=pallete[0], ax=panel)
    panel.axhline(y=overlap, color='r', linestyle='-')
    panel.set_title(key, fontsize=16)
    panel.set_xlabel('p=' + str(two_side_p), fontsize=16)
    ax[0].set_ylabel('Number of overlapping genes', fontsize=16)
    # The whole facet figure is re-saved on every iteration under a per-key name.
    out_fn = os.path.join(OUT_dir, 'Pst104E_noncovered_%s_lowcov01_mo%s.facett.png' % (key,min_overlap))
    plt.savefig(out_fn, dpi =300)

In [None]:

# One stand-alone figure per gene category for Pst_104E (low-coverage regions).
for key in keys:
    overlap = count_overlap_features(Pst_104E_dict[key],reference_bed_fn, min_overlap)
    random_values = Pst_random_dict_lowcov[key]

    # Share of shuffled replicates with strictly more overlapping genes than observed.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, and folded back as 2 - 2p when that exceeds 1.
    two_side_p = 2*p
    if two_side_p > 1:
        two_side_p = round(2 - two_side_p, 3)

    f, ax = plt.subplots(figsize=(4, 7))
    plt.style.use('fast')
    sns.violinplot(y=random_values, color=pallete[0])
    plt.axhline(y=overlap, color='r', linestyle='-')
    plt.title(key, fontsize=16)
    plt.ylabel('Number of overlapping genes', fontsize=16)
    plt.xlabel('p=' + str(two_side_p), fontsize=16)
    plt.rc('xtick', labelsize=16)
    plt.rc('ytick', labelsize=16)
    out_fn = os.path.join(OUT_dir, 'Pst104E_noncovered_%s_lowcov01_mo%s.png' % (key,min_overlap))
    plt.savefig(out_fn, dpi =300)

In [None]:
reference_bed_fn = pairings[0][2]
columns = len(keys)
f1, ax = plt.subplots(1, columns, figsize=(20, 4))

# One panel per gene category: shuffled distribution plus the observed count.
for n, key in enumerate(keys):
    panel = ax[n]
    overlap = count_overlap_features(DK0911_dict[key],reference_bed_fn, min_overlap)
    random_values = DK0911_random_dict_lowcov[key]
    # Share of shuffled replicates with strictly more overlapping genes than observed.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, and folded back as 2 - 2p when that exceeds 1.
    two_side_p = 2*p
    if two_side_p > 1:
        two_side_p = round(2 - two_side_p, 3)
    sns.violinplot(y=random_values, color=pallete[1], ax=panel)
    panel.axhline(y=overlap, color='r', linestyle='-')
    panel.set_title(key, fontsize=16)
    panel.set_xlabel('p=' + str(two_side_p), fontsize=16)
    ax[0].set_ylabel('Number of overlapping genes', fontsize=16)
    # The whole facet figure is re-saved on every iteration under a per-key name.
    out_fn = os.path.join(OUT_dir, 'DK0911_noncovered_%s_lowcov01_mo%s.facett.png' % (key,min_overlap))
    plt.savefig(out_fn, dpi =300)

In [None]:
reference_bed_fn = pairings[0][2]
# One stand-alone figure per gene category for DK0911 (low-coverage regions).
for key in keys:
    overlap = count_overlap_features(DK0911_dict[key],reference_bed_fn, min_overlap)
    random_values = DK0911_random_dict_lowcov[key]

    # Share of shuffled replicates with strictly more overlapping genes than observed.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, and folded back as 2 - 2p when that exceeds 1.
    two_side_p = 2*p
    if two_side_p > 1:
        two_side_p = round(2 - two_side_p, 3)

    f, ax = plt.subplots(figsize=(4, 7))
    plt.style.use('fast')
    sns.violinplot(y=random_values, color=pallete[1])
    plt.axhline(y=overlap, color='r', linestyle='-')
    plt.title(key, fontsize=16)
    plt.ylabel('Number of overlapping genes', fontsize=16)
    plt.xlabel('p=' + str(two_side_p), fontsize=16)
    plt.rc('xtick', labelsize=16)
    plt.rc('ytick', labelsize=16)
    out_fn = os.path.join(OUT_dir, 'DK0911_noncovered_%s_lowcov01_mo%s.png' % (key,min_overlap))
    plt.savefig(out_fn, dpi =300)

There doesn't seem anything specific about any gene family when looking for coverage of the mapping and what is missing. All seem to be significantly enriched.

### To-do

* look at the coverage files and the mapping files in igv and see if they line up.
    * done that and they looked just fine in case of DK0911 mapping onto Pst104E
* look at the coverage plots that are part of the het cov plotting
* genes that are not covered by Pst_104E reads by DK0911 could be genes related to sexual reproduction or infection of the host
* check into the overlap of genes that are not covered in both methods (lowcov01 and WGA95) and orthofinder. See if there is anything special in terms of length, expression, or function.
* check if there is non-random overlap between TE superfamilies and lowcov or WGA95 do this on the base overlap basis. If anything sticks out maybe look into family level and such.
* consider doing the absence presence polymorphism on the gene level. For all genes count how often they are totally absent <- this doesn't work as the size of real observations is 1.
* check if from the absent genes of of DK0911 in Pst104E are often both pairs of the allele or singletons


### Orthology analysis

* look into the singleton levels in other fungal species. Similar?

## Now look at mummer whole genome alignment in the same way

In [None]:
mummer_files = [os.path.join(WGA_cov_dir, x) for x in os.listdir(WGA_cov_dir) if x.endswith('.bed')]
mummer_files.sort()

In [None]:
# For every whole-genome-alignment BED file, compare each gene category against
# all genes with Fisher's exact test and record the number of affected genes.
Fishers_summary_dict = {'DK0911': [], 'Pst_104E': []}
Absence_summary_dict = {'DK0911': [], 'Pst_104E': []}
outer_index = [] #being the cut off
inner_index = [] #being the type of gene to look at
min_overlap = 1.0
# (file-name prefix, summary key, annotation dictionary) for each isolate
wga_isolates = (('DK_0911', 'DK0911', DK0911_dict),
                ('Pst_104E_v13_ph_ctg', 'Pst_104E', Pst_104E_dict))
for m_fn in mummer_files:
    m_name = os.path.basename(m_fn)
    for prefix, isolate, annotation_dict in wga_isolates:
        if not m_name.startswith(prefix):
            continue
        cut_off = m_name.split('.')[-3].replace('deltafilter_i', '').replace('mu0g_','')
        # header=None: BED files carry no header row, so the default would
        # swallow the first record and under-count every total by one.
        expect_list = [count_overlap_features(annotation_dict['All_genes'], m_fn, min_overlap),
                       pd.read_csv(annotation_dict['All_genes'], sep='\t', header=None, comment='#').shape[0]]
        for key in keys:
            inner_index.append(key)
            outer_index.append(cut_off)
            # Computed once and reused for both the test and the absence table.
            n_overlapping = count_overlap_features(annotation_dict[key], m_fn, min_overlap)
            test_list = [n_overlapping,
                         pd.read_csv(annotation_dict[key], sep='\t', header=None, comment='#').shape[0]]
            # NOTE(review): the table rows are [overlapping, total] rather than
            # [overlapping, non-overlapping]; confirm this is the intended test.
            Fishers_summary_dict[isolate].append(fisher_exact([test_list, expect_list])[1])
            Absence_summary_dict[isolate].append(n_overlapping)

In [None]:
Fishers_df = summary_dict_to_df(Fishers_summary_dict, outer_index, inner_index)
Absence_df = summary_dict_to_df(Absence_summary_dict, outer_index, inner_index)

In [None]:
out_fn = os.path.join(OUT_dir, 'WGA_fisher_df.tsv')
Fishers_df.to_csv(out_fn, sep='\t')
Fishers_df

In [None]:
out_fn = os.path.join(OUT_dir, 'WGA_absence_df.tsv')
Absence_df.to_csv(out_fn, sep='\t')
Absence_df

In [None]:
#loop over the different lowcoverage file, filter them and count the total bases of lowcoverage
# Loop over the whole-genome-alignment BED files and sum the bases they span.
summary_dict = {'DK0911': [], 'Pst_104E': []}
index = []
for m_fn in mummer_files:
    non_cov_bases = non_covered_bases(pd.read_csv(m_fn, sep='\t', header=None, names=['chrom', 'start', 'end']))
    # Parse the file name only, so dots in directory names cannot shift the fields.
    name_fields = os.path.basename(m_fn).split('.')
    cut_off = name_fields[-3].replace('deltafilter_i', '').replace('mu0g_','')
    index.append(cut_off)
    # The second name field decides the summary column. Files whose second
    # field starts with 'Pst_' are stored under 'DK0911' and vice versa,
    # exactly as before.
    if name_fields[1].startswith('Pst_'):
        summary_dict['DK0911'].append(non_cov_bases)
    elif name_fields[1].startswith('DK_0911_'):
        summary_dict['Pst_104E'].append(non_cov_bases)

length = len(index)
half = int(length/2)
# Both isolates must have been processed at the same cut-offs in the same order,
# otherwise the shared index below would mislabel rows.
if index[0:half] != index[half:]:
    raise ValueError('Cut-offs differ between the two isolates: %s vs %s' % (index[0:half], index[half:]))
df = pd.DataFrame.from_dict(summary_dict)
df.index = [float(x) for x in index[0:half]]
df.sort_index(inplace=True)
#check on Pst_104E mapping on DK0911
out_fn = os.path.join(OUT_dir, 'WGA_missing_bases_df.tsv')
df.to_csv(out_fn, sep='\t')
df

In [None]:
print("This is the file that got randomized: %s\n\n" % mummer_files[-5])
print("This is the file that got randomized: %s\n\n" % mummer_files[5])

In [None]:
#randomize the bedfiles of the WGA targted. Yet first look at the results of fishers exact tests and
#pick the appropriate coveraged cut off
randomize_bed(mummer_files[-5], Pst_104E_genome_file_fn, WGA_cov_rand_outdir, n=5000)
randomize_bed(mummer_files[5], DK0911_genome_file_fn, WGA_cov_rand_outdir, n=5000)

In [None]:
Pst_random_dict_WGA = {}
DK0911_random_dict_WGA = {}

In [None]:
#loop over all gene files
#for each gene file do the whole permuation test and safe it as part of a dictionary
# Collect the shuffled whole-genome-alignment BED files of each isolate (sorted by name).
Pst_104E_random_fn_WGA = [os.path.join(WGA_cov_rand_outdir, x) for x in os.listdir(WGA_cov_rand_outdir) if x.startswith('Pst')]
Pst_104E_random_fn_WGA.sort()
DK0911_random_fn_WGA = [os.path.join(WGA_cov_rand_outdir, x) for x in os.listdir(WGA_cov_rand_outdir) if x.startswith('DK')]
DK0911_random_fn_WGA.sort()
Pst_random_dict_WGA = {}
DK0911_random_dict_WGA = {}
# Gene tracks only. 'TE_superfamily' is a GFF file that is analysed per base
# with count_base_overlap_TE in later cells; feeding it to
# count_overlap_features (which counts unique values of the 'name' column)
# does not fit that track.
keys = [key for key in Pst_104E_dict.keys() if key != 'TE_superfamily']
min_overlap = 1.0
# For every gene category, count the overlapping genes in each shuffled file.
for key in keys:
    Pst_random_dict_WGA[key] = []
    for rand_fn in Pst_104E_random_fn_WGA:
        Pst_random_dict_WGA[key].append(count_overlap_features(Pst_104E_dict[key], rand_fn, min_overlap))
    DK0911_random_dict_WGA[key] = []
    for rand_fn in DK0911_random_fn_WGA:
        DK0911_random_dict_WGA[key].append(count_overlap_features(DK0911_dict[key], rand_fn, min_overlap))

In [None]:
reference_bed_fn = mummer_files[-5]
columns = len(keys)
f1, ax = plt.subplots(1, columns, figsize=(20, 4))

# One panel per gene category: shuffled distribution plus the observed count.
for n, key in enumerate(keys):
    panel = ax[n]
    overlap = count_overlap_features(Pst_104E_dict[key],reference_bed_fn, min_overlap)
    random_values = Pst_random_dict_WGA[key]
    # Share of shuffled replicates with strictly more overlapping genes than observed.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, and folded back as 2 - 2p when that exceeds 1.
    two_side_p = 2*p
    if two_side_p > 1:
        two_side_p = round(2 - two_side_p, 3)
    sns.violinplot(y=random_values, color=pallete[0], ax=panel)
    panel.axhline(y=overlap, color='r', linestyle='-')
    panel.set_title(key, fontsize=16)
    panel.set_xlabel('p=' + str(two_side_p), fontsize=16)
    ax[0].set_ylabel('Number of overlapping genes', fontsize=16)
    # The whole facet figure is re-saved on every iteration under a per-key name.
    out_fn = os.path.join(OUT_dir, 'Pst104E_noncovered_%s_WGAmi95_mo%s.facett.png' % (key,min_overlap))
    plt.savefig(out_fn, dpi =300)

In [None]:
# One stand-alone figure per gene category for Pst_104E (whole-genome alignment).
for key in keys:
    overlap = count_overlap_features(Pst_104E_dict[key],reference_bed_fn, min_overlap)
    random_values = Pst_random_dict_WGA[key]

    # Share of shuffled replicates with strictly more overlapping genes than observed.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, and folded back as 2 - 2p when that exceeds 1.
    two_side_p = 2*p
    if two_side_p > 1:
        two_side_p = round(2 - two_side_p, 3)

    f, ax = plt.subplots(figsize=(4, 7))
    plt.style.use('fast')
    sns.violinplot(y=random_values, color=pallete[0])
    plt.axhline(y=overlap, color='r', linestyle='-')
    plt.title(key, fontsize=16)
    plt.ylabel('Number of overlapping genes', fontsize=16)
    # Show the p-value like the other per-key figures do; it was computed here
    # but never displayed.
    plt.xlabel('p=' + str(two_side_p), fontsize=16)
    plt.rc('xtick', labelsize=16)
    plt.rc('ytick', labelsize=16)
    out_fn = os.path.join(OUT_dir, 'Pst104E_noncovered_%s_WGAmi95_mo%s.png' % (key,min_overlap))
    plt.savefig(out_fn, dpi =300)

In [None]:
reference_bed_fn = mummer_files[5]
columns = len(keys)
f1, ax = plt.subplots(1, columns, figsize=(20, 4))

# One panel per gene category: shuffled distribution plus the observed count.
for n, key in enumerate(keys):
    panel = ax[n]
    overlap = count_overlap_features(DK0911_dict[key],reference_bed_fn, min_overlap)
    random_values = DK0911_random_dict_WGA[key]
    # Share of shuffled replicates with strictly more overlapping genes than observed.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, and folded back as 2 - 2p when that exceeds 1.
    two_side_p = 2*p
    if two_side_p > 1:
        two_side_p = round(2 - two_side_p, 3)
    sns.violinplot(y=random_values, color=pallete[1], ax=panel)
    panel.axhline(y=overlap, color='r', linestyle='-')
    panel.set_title(key, fontsize=16)
    panel.set_xlabel('p=' + str(two_side_p), fontsize=16)
    ax[0].set_ylabel('Number of overlapping genes', fontsize=16)
    # The whole facet figure is re-saved on every iteration under a per-key name.
    out_fn = os.path.join(OUT_dir, 'DK0911_noncovered_%s_WGAmi95_mo%s.facett.png' % (key,min_overlap))
    plt.savefig(out_fn, dpi =300)



        

In [None]:

# One stand-alone figure per gene category for DK0911 (whole-genome alignment).
for key in keys:
    overlap = count_overlap_features(DK0911_dict[key],reference_bed_fn, min_overlap)
    random_values = DK0911_random_dict_WGA[key]

    # Share of shuffled replicates with strictly more overlapping genes than observed.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, and folded back as 2 - 2p when that exceeds 1.
    two_side_p = 2*p
    if two_side_p > 1:
        two_side_p = round(2 - two_side_p, 3)

    f, ax = plt.subplots(figsize=(4, 7))
    plt.style.use('fast')
    sns.violinplot(y=random_values, color=pallete[1])
    plt.axhline(y=overlap, color='r', linestyle='-')
    plt.title(key, fontsize=16)
    plt.ylabel('Number of overlapping genes', fontsize=16)
    plt.xlabel('p=' + str(two_side_p), fontsize=16)
    plt.rc('xtick', labelsize=16)
    plt.rc('ytick', labelsize=16)
    out_fn = os.path.join(OUT_dir, 'DK0911_noncovered_%s_WGAmi95_mo%s.png' % (key,min_overlap))
    plt.savefig(out_fn, dpi =300)

### To-Do

* look at transposable elements. Think about a slightly different analysis including superfamily and family level

In [None]:
#loop over all gene files
#for each gene file do the whole permuation test and safe it as part of a dictionary
# Collect the shuffled low-coverage BED files of each isolate (sorted by name).
Pst_104E_random_fn_lowcov = sorted(
    os.path.join(SRM_cov_rand_outdir, x) for x in os.listdir(SRM_cov_rand_outdir) if x.startswith('Pst'))
DK0911_random_fn_lowcov = sorted(
    os.path.join(SRM_cov_rand_outdir, x) for x in os.listdir(SRM_cov_rand_outdir) if x.startswith('DK'))
# The per-gene results already stored in the two dictionaries are kept;
# only the 'TE_superfamily' entry is (re)computed here.
# Per-superfamily base overlap for the first 2500 shuffled files of each isolate.
DK0911_random_dict_lowcov['TE_superfamily'] = [
    count_base_overlap_TE(DK0911_dict['TE_superfamily'], rand_fn)
    for rand_fn in DK0911_random_fn_lowcov[:2500]
]
Pst_random_dict_lowcov['TE_superfamily'] = [
    count_base_overlap_TE(Pst_104E_dict['TE_superfamily'], rand_fn)
    for rand_fn in Pst_104E_random_fn_lowcov[:2500]
]
pybedtools.cleanup()

In [None]:
#loop over all gene files
#for each gene file do the whole permuation test and safe it as part of a dictionary
# Collect the shuffled whole-genome-alignment BED files of each isolate (sorted by name).
Pst_104E_random_fn_WGA = sorted(
    os.path.join(WGA_cov_rand_outdir, x) for x in os.listdir(WGA_cov_rand_outdir) if x.startswith('Pst'))
DK0911_random_fn_WGA = sorted(
    os.path.join(WGA_cov_rand_outdir, x) for x in os.listdir(WGA_cov_rand_outdir) if x.startswith('DK'))
# The per-gene results already stored in the two dictionaries are kept;
# only the 'TE_superfamily' entry is (re)computed here.
# Per-superfamily base overlap for the first 2500 shuffled files of each isolate.
DK0911_random_dict_WGA['TE_superfamily'] = [
    count_base_overlap_TE(DK0911_dict['TE_superfamily'], rand_fn)
    for rand_fn in DK0911_random_fn_WGA[:2500]
]
Pst_random_dict_WGA['TE_superfamily'] = [
    count_base_overlap_TE(Pst_104E_dict['TE_superfamily'], rand_fn)
    for rand_fn in Pst_104E_random_fn_WGA[:2500]
]
pybedtools.cleanup(remove_all=True)

In [None]:
# TE superfamily figures: shuffled base-overlap distributions for both region
# sets (low coverage and whole-genome alignment) in both isolates.
#DK0911
out_fn = os.path.join(OUT_dir, 'DK0911_noncovered_TE_superfamily_lowcov01_mo.facett.png' )
plot_TE_cov_expect('DK0911', DK0911_random_dict_lowcov['TE_superfamily'],pallete[1], out_fn)

out_fn = os.path.join(OUT_dir, 'DK0911_noncovered_TE_superfamily_WGAmi95.facett.png' )
plot_TE_cov_expect('DK0911', DK0911_random_dict_WGA['TE_superfamily'],pallete[1], out_fn)

#Pst104E
# The shuffled results live in the Pst_random_dict_* dictionaries. The
# Pst_104E_random_fn_* names are plain lists of file paths and cannot be
# indexed with 'TE_superfamily'.
out_fn = os.path.join(OUT_dir, 'Pst104E_noncovered_TE_superfamily_lowcov01.facett.png')
plot_TE_cov_expect('Pst_104E', Pst_random_dict_lowcov['TE_superfamily'],pallete[0], out_fn)

out_fn = os.path.join(OUT_dir, 'Pst104E_noncovered_TE_superfamily_WGAmi95.facett.png')
plot_TE_cov_expect('Pst_104E', Pst_random_dict_WGA['TE_superfamily'],pallete[0], out_fn)

In [None]:
print("hello")

In [None]:
# Scratch version of the TE superfamily facet plot for DK0911 (low coverage).
# out_fn is prepared but the figure is not written to disk in this cell.
out_fn = os.path.join(OUT_dir, 'DK0911_noncovered_%s_WGAmi95_mo%s.facett.png' % (key,min_overlap))

# One column per shuffled replicate; superfamilies missing from a replicate count as 0.
TE_random_df = pd.concat(DK0911_random_dict_lowcov['TE_superfamily'], axis=1, sort=True)
TE_random_df.fillna(0, inplace=True)
genome = 'DK0911'

# Five panels per row; superfamilies beyond rows * 5 are not plotted.
rows = len(TE_random_df.index)//5
f1, ax = plt.subplots(rows, 5, figsize=(20, 30))

grid_positions = product(range(0, rows), range(0,5))
for key, (row, col) in zip(TE_random_df.index, grid_positions):
    panel = ax[row, col]
    overlap = TE_sf_cov_dict[genome][key]
    random_values = TE_random_df.loc[key,:]
    # Share of replicates strictly above the reference value.
    count = sum(1 for value in random_values if value > overlap)
    p = count/len(random_values)
    # Doubled, folded back as 2 - 2p when above 1, and rounded.
    two_side_p = 2*p
    two_side_p = round(2 - two_side_p, 3) if two_side_p > 1 else round(two_side_p, 3)
    sns.violinplot(y=random_values, color=pallete[1], ax=panel)
    panel.axhline(y=overlap, color='r', linestyle='-')
    panel.set_title(key, fontsize=16)
    panel.set_yticklabels([])
    panel.set_ylabel('')
    panel.set_xlabel('p=' + str(two_side_p), fontsize=16)
    # Only the first panel of each row keeps a y-axis label.
    ax[row ,0].set_ylabel('Number of overlapping genes', fontsize=16)
    plt.tight_layout()
# Disabled: plt.savefig(out_fn, dpi =300)

In [None]:
# Show TE_random_df (the NaN-filled concatenation of the random samples) as the cell output.
TE_random_df

In [None]:
# Number of entries stored under the 'TE_superfamily' key of DK0911_random_dict_lowcov.
len(DK0911_random_dict_lowcov['TE_superfamily'])

In [None]:
# Scratch cell: prints a fixed marker string; it uses no analysis state.
print('hello')

In [None]:
# Element [2] of the first `pairings` entry; a later cell reads it as a
# tab-separated file and passes it to count_base_overlap_TE.
filter_fn = pairings[0][2]

In [None]:
# Preview the first rows of tmp_df.
# NOTE(review): the only assignment of tmp_df visible nearby is in a later
# cell (the read of filter_fn), so this presumably relies on prior kernel
# state / out-of-order execution -- confirm.
tmp_df.head()

In [None]:
# Summed overlap length per 'attributes' group between the DK0911
# TE_superfamily annotation and the regions in filter_fn, plus the fraction
# of the genome that those regions span.
overlap_df = count_base_overlap_TE(DK0911_dict['TE_superfamily'], filter_fn)
# The former `overlap_df.sort_values([...]).head()` line was dropped: its
# result was neither assigned nor displayed, and row order does not affect
# the group sums below.

# Interval length with start and end both counted (hence the +1).
# NOTE(review): the region length below is end - start without +1; confirm
# that the two inputs really use different coordinate conventions.
overlap_df['interval'] = overlap_df.end - overlap_df.start + 1
overlap_sf_cov_df = overlap_df.groupby('attributes')['interval'].sum()

# Genome size = sum of the second column of the headerless genome file.
genome_size = pd.read_csv(DK0911_genome_file_fn, sep='\t', header=None)[1].sum()

# Total length of the regions in filter_fn (third minus second column).
tmp_df = pd.read_csv(filter_fn, sep='\t', header=None)
missing_space = (tmp_df[2] - tmp_df[1]).sum()
fraction_missing = round(missing_space/genome_size, 4)

In [None]:
# Inspect the index labels of TE_sf_cov_df.
TE_sf_cov_df.index

In [None]:
# Fisher's exact test per TE superfamily (DK0911, first pairing only):
# overlap with the regions in pair[2] versus the genome-wide value.
Fishers_summary_dict = {'DK0911': [], 'Pst_104E': []}
Absence_summary_dict = {'DK0911': [], 'Pst_104E': []}
outer_index = [] #being the cut off
inner_index = [] #being the type of gene to look at
min_overlap = 1
genome = 'DK0911'
for pair in pairings[:1]:
    overlap_df = count_base_overlap_TE(DK0911_dict['TE_superfamily'], pair[2])
    # Interval length with start and end both counted (hence the +1).
    overlap_df['interval'] = overlap_df.end - overlap_df.start + 1
    overlap_sf_cov_df = overlap_df.groupby('attributes')['interval'].sum()
    index = overlap_sf_cov_df.index

    # Total length of the regions in pair[2], computed here from the same
    # file so the test does not depend on a `missing_space` left over from
    # another cell.
    region_df = pd.read_csv(pair[2], sep='\t', header=None)
    missing_space = (region_df[2] - region_df[1]).sum()

    # Third-from-last dot-separated token of the file name, minus 'lowcov'.
    cut_off = os.path.basename(pair[2]).split('.')[-3].replace('lowcov', '')
    for ind in index:
        print(ind)
        observed = overlap_sf_cov_df[ind]
        expect_list = [TE_sf_cov_dict[genome][ind], genome_size_dict[genome]]
        inner_index.append(ind)
        outer_index.append(cut_off)
        test_list = [observed, missing_space]
        # NOTE(review): the table rows are (hits, total) rather than
        # (hits, total - hits) -- confirm this is the intended 2x2 layout.
        Fishers_summary_dict[genome].append(fisher_exact([test_list, expect_list])[1])
        # Record the observed value too; previously this line was a bare
        # expression and nothing was ever stored in Absence_summary_dict.
        Absence_summary_dict[genome].append(observed)

In [None]:
# Show the Fisher p-values collected so far for the current `genome`.
Fishers_summary_dict[genome]

In [None]:
# Show the first entry of `pairings` (as a one-element slice).
pairings[:1]

In [None]:
# Fisher's exact test per gene category and cut-off, for both genomes:
# features of a category counted by count_overlap_features against pair[2]
# versus the same count for 'All_genes'.
Fishers_summary_dict = {'DK0911': [], 'Pst_104E': []}
Absence_summary_dict = {'DK0911': [], 'Pst_104E': []}
outer_index = [] #being the cut off
inner_index = [] #being the type of gene to look at
min_overlap = 1
# Not used below; kept because it was assigned here before.
index = TE_sf_cov_df.index

# (file-name prefix of pair[2], key in the summary dicts, annotation files)
genome_sources = (
    ('DK_0911', 'DK0911', DK0911_dict),
    ('Pst_104E_v13_ph_ctg', 'Pst_104E', Pst_104E_dict),
)

# Row count per annotation file, read once instead of once per pair and key.
# NOTE(review): read_csv uses the default header=0, so the first line of each
# file is not counted -- confirm these files really start with a header line.
feature_totals = {}


def _feature_total(feature_fn):
    """Return the cached number of data rows pandas reads from feature_fn."""
    if feature_fn not in feature_totals:
        feature_totals[feature_fn] = pd.read_csv(feature_fn, sep='\t').shape[0]
    return feature_totals[feature_fn]


#pair[2] is the filtered lowcov file as reference
for pair in pairings:
    region_name = os.path.basename(pair[2])
    for prefix, genome_label, feature_dict in genome_sources:
        if not region_name.startswith(prefix):
            continue
        expect_list = [count_overlap_features(feature_dict['All_genes'], pair[2], min_overlap),
                       _feature_total(feature_dict['All_genes'])]
        # Third-from-last dot-separated token of the file name, minus 'lowcov'.
        cut_off = region_name.split('.')[-3].replace('lowcov', '')
        for key in keys:
            inner_index.append(key)
            outer_index.append(cut_off)
            # Computed once and reused for both the test and the summary
            # (it was previously called twice with identical arguments).
            overlap_count = count_overlap_features(feature_dict[key], pair[2], min_overlap)
            test_list = [overlap_count, _feature_total(feature_dict[key])]
            # NOTE(review): the table rows are (hits, total) rather than
            # (hits, total - hits) -- confirm this is the intended 2x2 layout.
            Fishers_summary_dict[genome_label].append(fisher_exact([test_list, expect_list])[1])
            Absence_summary_dict[genome_label].append(overlap_count)


In [None]:
# Preview the first rows of TE_df ordered by the 'Chrom' and 'start' columns.
TE_df.sort_values(['Chrom','start']).head()

In [None]:
# Place TE_sf_cov_df and overlap_sf_cov_df next to each other as columns (axis=1).
pd.concat([TE_sf_cov_df, overlap_sf_cov_df], axis=1)

In [None]:
# NOTE(review): this combines TE_sf_cov_df with itself, which cannot fill any
# gaps; presumably overlap_sf_cov_df was meant as the argument -- confirm.
TE_sf_cov_df.combine_first(TE_sf_cov_df)

In [None]:
# NOTE(review): this merges TE_sf_cov_df with itself; presumably a scratch
# attempt at joining it with overlap_sf_cov_df -- confirm or remove.
pd.merge(TE_sf_cov_df, TE_sf_cov_df)