In [1]:
%matplotlib inline

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn
import matplotlib
from Bio import SeqIO, SeqUtils
import os

In [5]:
from sklearn.externals.joblib import Parallel, delayed

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
#Define the PATH
BASE_AA_PATH = '/home/benjamin/genome_assembly/Warrior'
BASE_A_PATH = '/home/benjamin/genome_assembly/Warrior/genome_v04'
#for now use the previous mapping that still included high coverage regions
COV_IN_PATH = '/home/benjamin/genome_assembly/Warrior/SRM'
BAM_IN_PATH = '/home/benjamin/genome_assembly/Warrior/SRM'
#apply analysis restricted to final assembly Pst_104E_v12
COV_OUT_PATH = os.path.join(BASE_AA_PATH, 'COV')
if not os.path.isdir(COV_OUT_PATH):
    os.mkdir(COV_OUT_PATH)
pwh_list_path = os.path.join(BASE_A_PATH, 'DK_0911_v04_pwh.list')
pwoh_list_path = os.path.join(BASE_A_PATH, 'DK_0911_v04_pwoh.list')

In [52]:
input_genome = 'DK_0911_v03'
coverage_file_suffix = 'bwamem.PRI_NTKN_DK0911.sam.sorted.bam.aa.cov'
output_genome = 'DK0911_v04'
ph_cov_fn = os.path.join(COV_IN_PATH, '%s_ph_ctg.%s' % (input_genome, coverage_file_suffix))
assert(os.path.exists(ph_cov_fn))
p_cov_fn = os.path.join(COV_IN_PATH, '%s_p_ctg.%s' % (input_genome, coverage_file_suffix))
assert(os.path.exists(p_cov_fn))

In [8]:
mean_file_name = os.path.join(COV_OUT_PATH, '%s_mean_cov.txt' % output_genome)

In [100]:
#cannot pass in the iterator itself but only the file name variable
#the interator has to be generate each and everytime
def homo_het_contig_ana(contig, mean_s50_ph_p, std_s50_ph_p, ph_cov_fn, p_cov_fn):
    """
    This function generates bed files for regions of the genome where the coverage falls
    below a certain threshold or above certain threshold.
    The homozygous bed are the regions that are unphased of a diploid assembly. These
    are defined by similar coverage when mapping against primary contigs (p) and
    against primary contigs and haplotigs (ph). This compares the coverage of ph mapping to the
    coverage of h mapping.
    
    Threshold: > (2*mean_s50_ph_p - 2*std_s50_ph_p)
    
    
    The unique bed are regions that are specific to primary contigs and are absent from haplotigs.
    These are defined as low coverage regions when mapping against p only.
    
    Threshold: < (mean_s50_ph_p + 2*std_s50_ph_p)
    
    """
    
    #now read in the dataframe for each contig using the iterator function of pandas.read_csv
    cov_header = ["contig", "position", 'coverage']
    ph_cov_it = pd.read_csv(ph_cov_fn, sep='\t', header=None, names=cov_header, chunksize=500000, iterator=True)
    p_cov_it = pd.read_csv(p_cov_fn, sep='\t', header=None, names=cov_header, chunksize=500000, iterator=True)
    tmp_p_df = pd.concat([chunk[chunk['contig']== contig] for chunk in p_cov_it])
    tmp_p_df_ph = pd.concat([chunk[chunk['contig']== contig] for chunk in ph_cov_it])
    #generarte the rolling windows
    tmp_p_df['Rolling_w1000_p'] = tmp_p_df.rolling(window=1000, min_periods=1, center=True, win_type='blackmanharris')['coverage'].mean()
    tmp_p_df_ph['Rolling_w1000_ph_p'] = tmp_p_df_ph.rolling(window=1000, min_periods=1, center=True, win_type='blackmanharris')['coverage'].mean()
    tmp_p_df['Rolling_w1000_ph_p'] = tmp_p_df_ph['Rolling_w1000_ph_p']
    #potentially p_unique DNA streatches are defined as p contig cov streatches, while doing p mapping, that are heterozygous coverage
    # coverage -> mean_s2000_ph_p
    # [Rolling_w1000_p < mean_s2000_ph_p + 2*std_s2000_ph_p]
    tmp_p_df_p_unique = tmp_p_df[tmp_p_df['Rolling_w1000_p'] < (mean_s50_ph_p + 2*std_s50_ph_p)]
    if len(tmp_p_df_p_unique) > 0:
        tmp_p_df_p_unique.reset_index(drop=True, inplace=True)
        #add a position +1 column by copying the position datafram 1: and adding making position+1 for the last element
        # in the dataframe equal to its own value
        tmp_p_df_p_unique['position+1']= tmp_p_df_p_unique.loc[1:, 'position'].\
        append(pd.Series(tmp_p_df_p_unique.loc[len(tmp_p_df_p_unique)-1, 'position'], index=[tmp_p_df_p_unique.index[-1]])).reset_index(drop=True)

        tmp_p_df_p_unique['position_diff+1'] = tmp_p_df_p_unique['position+1'] - tmp_p_df_p_unique['position']

        #add a position -1 column by copying the position datafram :len-2 and adding/making position-1 for the first element
        # in the dataframe equal to its own value
        position_1 = list(tmp_p_df_p_unique.loc[:len(tmp_p_df_p_unique)-2, 'position'])
        position_1.insert(0, tmp_p_df_p_unique.loc[0, 'position'])

        tmp_p_df_p_unique['position-1']= position_1

        tmp_p_df_p_unique['position_diff-1'] =  tmp_p_df_p_unique['position'] - tmp_p_df_p_unique['position-1']
        #start points of feature streatch => where previous position is unequal 1 away
        #tmp_p_df_p_unique[tmp_p_df_p_unique['position_diff-1'] != 1 ].head()

        start_pos_index = ''
        stop_pos_index = ''
        contig_name_list = ''
        p_unique_bed = ''
        #this should be good  now as it flows double check and loop around to finish this off
        start_pos_index = tmp_p_df_p_unique[tmp_p_df_p_unique['position_diff-1'] != 1 ].index
        stop_pos_index = tmp_p_df_p_unique[tmp_p_df_p_unique['position_diff+1'] != 1 ].index

        contig_name_list = [contig]*len(start_pos_index)

        start_pos = [tmp_p_df_p_unique.loc[pos, 'position'] -1 for pos in start_pos_index]
        stop_pos = [tmp_p_df_p_unique.loc[pos, 'position']  for pos in stop_pos_index]

        p_unique_bed = pd.DataFrame([contig_name_list, start_pos, stop_pos]).T
        tmp_out_fn = os.path.join(COV_OUT_PATH, '%s_p_uniqe_bed.tmp' % contig)
        p_unique_bed.to_csv(tmp_out_fn, sep='\t', header=None, index=None)
    
    #potentially p_homo DNA streatches are defined as p contig cov streatches, while doing ph mapping, that are homozygous coverage
    # coverage -> 2*mean_s2000_ph_p
    # [Rolling_w1000_p > 2*mean_s2000_ph_p - 2*std_s2000_ph_p]
    #here might be a consideration to ask for a difference in profile (covariance != 1)
    tmp_p_df_p_homo = tmp_p_df[(tmp_p_df['Rolling_w1000_ph_p'] > (2*mean_s50_ph_p - 2*std_s50_ph_p))]
    if len(tmp_p_df_p_homo) > 0:
        tmp_p_df_p_homo.reset_index(drop=True, inplace=True)
        #add a position +1 column by copying the position datafram 1: and adding making position+1 for the last element
        # in the dataframe equal to its own value
        tmp_p_df_p_homo['position+1']= tmp_p_df_p_homo.loc[1:, 'position'].\
        append(pd.Series(tmp_p_df_p_homo.loc[len(tmp_p_df_p_homo)-1, 'position'], index=[tmp_p_df_p_homo.index[-1]])).reset_index(drop=True)

        tmp_p_df_p_homo['position_diff+1'] = tmp_p_df_p_homo['position+1'] - tmp_p_df_p_homo['position']

        #add a position -1 column by copying the position datafram :len-2 and adding/making position-1 for the first element
        # in the dataframe equal to its own value
        position_1 = list(tmp_p_df_p_homo.loc[:len(tmp_p_df_p_homo)-2, 'position'])
        position_1.insert(0, tmp_p_df_p_homo.loc[0, 'position'])

        tmp_p_df_p_homo['position-1']= position_1

        tmp_p_df_p_homo['position_diff-1'] =  tmp_p_df_p_homo['position'] - tmp_p_df_p_homo['position-1']
        #start points of feature streatch => where previous position is unequal 1 away
        #tmp_p_df_p_homo[tmp_p_df_p_homo['position_diff-1'] != 1 ].head()

        start_pos_index = ''
        stop_pos_index = ''
        contig_name_list = ''
        p_homo_bed = ''
        #this should be good  now as it flows double check and loop around to finish this off
        start_pos_index = tmp_p_df_p_homo[tmp_p_df_p_homo['position_diff-1'] != 1 ].index
        stop_pos_index = tmp_p_df_p_homo[tmp_p_df_p_homo['position_diff+1'] != 1 ].index

        contig_name_list = [contig]*len(start_pos_index)

        start_pos = [tmp_p_df_p_homo.loc[pos, 'position'] -1 for pos in start_pos_index]
        stop_pos = [tmp_p_df_p_homo.loc[pos, 'position']  for pos in stop_pos_index]

        p_homo_bed = pd.DataFrame([contig_name_list, start_pos, stop_pos]).T
        tmp_out_fn = os.path.join(COV_OUT_PATH, '%s_p_homo_bed.tmp'% contig)
        p_homo_bed.to_csv(tmp_out_fn, sep='\t', header=None, index=None)

    print('Contig %s done.' % contig)

In [15]:
#read in the mean files generated by the main notebook
with open(mean_file_name, 'r') as infile:
    for line in infile:
        if line.rstrip().startswith('mean_s50_ph_p'):
            mean_s50_ph_p = float(line.rstrip().split(': ')[1])
        elif line.rstrip().startswith('std_s50_ph_p'):
            std_s50_ph_p = float(line.rstrip().split(': ')[1])

In [18]:
#get all primary contigs
primary_contig_list = []
genome_file_name = os.path.join(BASE_A_PATH, 'DK_0911_v04_p_ctg.fa')
with open(genome_file_name, 'r') as gfh:
    for line in gfh:
        if line.startswith('>'):
            primary_contig_list.append(line.rstrip().replace('>',''))
        else:
            continue

In [94]:
Parallel(n_jobs=8)(delayed(homo_het_contig_ana)(contig, mean_s50_ph_p, std_s50_ph_p, ph_cov_fn, p_cov_fn)\
                  for contig in iter(primary_contig_list))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Contig pcontig_006 done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Contig pcontig_005 done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Contig pcontig_008 done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Contig pcontig_003 done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Contig pcontig_004 done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Contig pcontig_002 done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Contig pcontig_000 done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Contig pcontig_001 done.
Contig pcontig_009 done.
Contig pcontig_010 done.
Contig pcontig_011 done.
Contig pcontig_012 done.
Contig pcontig_014 done.
Contig pcontig_013 done.
Contig pcontig_015 done.
Contig pcontig_016 done.
Contig pcontig_017 done.
Contig pcontig_018 done.
Contig pcontig_019 done.
Contig pcontig_020 done.
Contig pcontig_022 done.
Contig pcontig_021 done.
Contig pcontig_023 done.
Contig pcontig_025 done.
Contig pcontig_024 done.
Contig pcontig_026 done.
Contig pcontig_027 done.
Contig pcontig_028 done.
Contig pcontig_029 done.
Contig pcontig_030 done.
Contig pcontig_031 done.
Contig pcontig_032 done.
Contig pcontig_034 done.
Contig pcontig_035 done.
Contig pcontig_033 done.
Contig pcontig_036 done.
Contig pcontig_037 done.
Contig pcontig_039 done.
Contig pcontig_038 done.
Contig pcontig_040 done.
Contig pcontig_041 done.
Contig pcontig_043 done.
Contig pcontig_042 done.
Contig pcontig_045 done.
Contig pcontig_047 done.
Contig pcontig_046 done.
Contig pcontig_048 done.


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [95]:
#now get all the dataframes in and generate a combiend dataframe
bed_p_homo_list = [pd.read_csv(os.path.join(COV_OUT_PATH, x), header=None,sep='\t') for x in os.listdir(COV_OUT_PATH) if x.endswith('_p_homo_bed.tmp')]
bed_p_uniqe_list = [pd.read_csv(os.path.join(COV_OUT_PATH, x), header=None,sep='\t') for x in os.listdir(COV_OUT_PATH) if x.endswith('_p_uniqe_bed.tmp')]

In [96]:
p_homo_bed_df = pd.concat(bed_p_homo_list).sort_values(by=[0,1])
p_unique_bed_df =  pd.concat(bed_p_uniqe_list).sort_values(by=[0,1])
p_homo_bed_df.to_csv(os.path.join(COV_OUT_PATH, output_genome + '_ph_ctg.ph_p_homo_cov.bed'), header=None, index = None, sep ='\t')
p_unique_bed_df.to_csv(os.path.join(COV_OUT_PATH, output_genome + '_ph_ctg.p_p_het_cov.bed'), header=None, index = None, sep ='\t')

In [99]:
#add a cleanup step
[os.remove(os.path.join(COV_OUT_PATH, x)) for x in os.listdir(COV_OUT_PATH) if x.endswith('.tmp')]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,