In [2]:
import pandas as pd 
import os
import scipy.stats as stats

In [12]:
# Input: contig table, read below as tab-separated text; must provide a 'contig' column.
contigs_file = '/media/supertramp/ssd_ratul/thesis/renal/fresh_run/final_results_no_ref_filter/contigs/abyss/B_contigs.tsv'
# Input: per-k-mer statistics table; tab-separated despite the .csv extension,
# with the k-mer sequence in the first column (used as the index).
kmers_file = '/media/supertramp/ssd_ratul/thesis/renal/fresh_run/final_results_no_ref_filter/contigs/bipartite_0_with_counts.csv'
# Output: where the filtered, p-value-sorted contig table is written (tab-separated).
output_file = '/media/supertramp/ssd_ratul/thesis/renal/fresh_run/final_results_no_ref_filter/contigs/abyss/B_contigs_pvals.tsv'
# K-mer length used when splitting each contig into k-mers.
k = 31

In [4]:
def get_all_kmers(contig, k=31):
    """Return every length-``k`` window of ``contig``, in left-to-right order.

    Consecutive windows are offset by one position. A contig shorter than
    ``k`` produces an empty list.
    """
    n_windows = len(contig) - k + 1
    return [contig[start:start + k] for start in range(n_windows)]

In [5]:
def get_kmer_pvalues(kmers, kmers_df):
    """Combine the adjusted p-values of a contig's k-mers into one p-value.

    Parameters
    ----------
    kmers : list of str
        K-mers to look up. A k-mer that occurs several times in the list
        contributes once per occurrence.
    kmers_df : pandas.DataFrame
        Table indexed by k-mer sequence that has a ``padj`` column.

    Returns
    -------
    float
        Stouffer-combined p-value of the ``padj`` values of those k-mers that
        are present in ``kmers_df``; ``1.0`` when none of them is present.

    Raises
    ------
    KeyError
        If ``kmers_df`` has no ``padj`` column.
    """
    # Pull the column out once: ``kmers_df.loc[kmer].padj`` builds a complete
    # row for every hit only to read a single field from it.
    padj_by_kmer = kmers_df['padj']
    known_kmers = padj_by_kmer.index
    pvalues = [padj_by_kmer.loc[kmer] for kmer in kmers if kmer in known_kmers]

    if not pvalues:
        # Nothing to combine; return a float so both branches share one type.
        return 1.0

    # NOTE(review): Stouffer's method assumes independent p-values, while
    # overlapping k-mers from one contig share most of their bases - confirm
    # that this is acceptable for the analysis.
    return float(stats.combine_pvalues(pvalues, method='stouffer')[1])

In [8]:
# Load the assembled contigs and record the length of every sequence.
contigs_df = pd.read_csv(contigs_file, sep='\t').assign(
    len=lambda frame: frame['contig'].map(len)
)
contigs_df.head()

Unnamed: 0,name,contig,len
0,2,GGGGTGGCTCACGCCTGTAATCCC,24
1,3,GCCTTTATTGCAGCCCACCGGCGGCACTTCGAGGCAATCCGTGGC,45
2,8,GTTAAATTTACAAGGGGATTTAG,23
3,9,ACTGCAACCTCCACCTCCTGGGCTCAAGCAATTCT,35
4,17,TAATTTTTGTATTTTTAGTAGATACGGGGTTTCACCATGTTGGCC,45


In [9]:
# Keep only contigs whose sequence is longer than 45. The explicit copy makes
# the result an independent frame, so the column assignments in later cells do
# not write to a slice of the unfiltered table (pandas' SettingWithCopyWarning).
contigs_df = contigs_df[contigs_df['len'] > 45].copy()

In [10]:
# Split every contig into its overlapping k-mers of length k.
contigs_df['kmers'] = contigs_df['contig'].apply(get_all_kmers, k=k)

In [16]:
# Load the k-mer table (tab-separated), using its first column as the index.
kmers_df = pd.read_table(kmers_file, index_col=0)

In [17]:
# Restrict the k-mer table to these six statistics columns.
stat_columns = ['baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj']
kmers_df = kmers_df[stat_columns]

In [18]:
# Preview the first rows of the column-restricted k-mer table.
kmers_df.head()

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC,338.211078,-1.171705,0.0,76.250886,2.04884e-14,6.051529e-13
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT,440.239842,-0.79992,0.0,39.74934,5.284981e-09,4.542567e-08
AAAAAAAAAAAAAAAAAAAAAAAAAAAAACA,293.571574,-1.143619,0.0,70.389939,1.272013e-13,2.827653e-12
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAGA,776.437647,-0.871704,0.0,40.39648,4.129388e-09,3.626049e-08
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAGT,37754.512462,-0.857907,0.0,47.945561,2.510876e-10,2.784244e-09


In [20]:
# Label the index 'kmer' (the input file leaves the first column unnamed).
kmers_df.index.name = 'kmer'

In [21]:
# Preview again to confirm the index is now labelled 'kmer'.
kmers_df.head()

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
kmer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC,338.211078,-1.171705,0.0,76.250886,2.04884e-14,6.051529e-13
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT,440.239842,-0.79992,0.0,39.74934,5.284981e-09,4.542567e-08
AAAAAAAAAAAAAAAAAAAAAAAAAAAAACA,293.571574,-1.143619,0.0,70.389939,1.272013e-13,2.827653e-12
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAGA,776.437647,-0.871704,0.0,40.39648,4.129388e-09,3.626049e-08
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAGT,37754.512462,-0.857907,0.0,47.945561,2.510876e-10,2.784244e-09


In [28]:
# Keep k-mers whose adjusted p-value is below 0.05.
kmers_df = kmers_df.loc[kmers_df['padj'].lt(0.05)]

In [29]:
# Keep k-mers with an absolute log2 fold change of at least 1.
kmers_df = kmers_df.loc[kmers_df['log2FoldChange'].abs().ge(1)]

In [30]:
# Score every contig by combining the adjusted p-values of those of its
# k-mers that are present in kmers_df.
contigs_df['pvalue'] = contigs_df['kmers'].apply(get_kmer_pvalues, kmers_df=kmers_df)

In [31]:
# Keep contigs whose combined p-value is below 0.05.
contigs_df = contigs_df.loc[lambda frame: frame['pvalue'] < 0.05]

In [35]:
# Order contigs from the smallest to the largest combined p-value.
contigs_df = contigs_df.sort_values(by='pvalue', ascending=True)

In [36]:
# Write the filtered contig table as tab-separated text, without the index.
# NOTE(review): every column is written, including the per-contig 'kmers'
# lists - confirm that column is wanted in the output file.
contigs_df.to_csv(path_or_buf=output_file, sep='\t', index=False)

In [34]:
# Show the last rows of the filtered contig table.
contigs_df.tail()

Unnamed: 0,name,contig,len,kmers,pvalue
4475,7213,TTTGTACCTAAGTTTTTTAATGAGTGAAATTTGCATTATAAACTTT...,83,"[TTTGTACCTAAGTTTTTTAATGAGTGAAATT, TTGTACCTAAGT...",0.011263
3803,6099,CTAATTTTTGTATTTTTAGTAGGGACGAGATTTCTCCATGTTGGTC...,52,"[CTAATTTTTGTATTTTTAGTAGGGACGAGAT, TAATTTTTGTAT...",0.01318
1407,2282,GCCCCGCATACACCGGCGGCGATGGCGCTGTTCCGACCCACCATCT...,53,"[GCCCCGCATACACCGGCGGCGATGGCGCTGT, CCCCGCATACAC...",0.013826
3711,5945,CCACTGCACTCCAGCCTAGGCAACAGAATGAGATCCTGTCTCACATT,47,"[CCACTGCACTCCAGCCTAGGCAACAGAATGA, CACTGCACTCCA...",0.020828
693,1171,GCCTGGTATTTTTTAATCAAACAAAATATTTATGAAATGGGTTTTCT,47,"[GCCTGGTATTTTTTAATCAAACAAAATATTT, CCTGGTATTTTT...",0.042602
