In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the tab-separated k-mer table; column 0 becomes the row index.
all_kmers_df = pd.read_csv(
    'filtered_kmers.tsv',
    sep='\t',
    index_col=0,
)

In [None]:
# Preview the first five rows of the loaded table.
all_kmers_df.head(n=5)

In [None]:
# Keep only rows whose log2FoldChange exceeds 1 and whose padj is below 0.05.
is_upregulated = all_kmers_df['log2FoldChange'] > 1
is_significant = all_kmers_df['padj'] < 0.05
kmers = all_kmers_df[is_upregulated & is_significant]

In [None]:
# Preview the first five rows of the filtered table.
kmers.head(n=5)

In [None]:
# Load the tab-separated contig table (its 'contig' column is used below).
A_contigs = pd.read_csv(
    'A_contigs.tsv',
    sep='\t',
)

In [None]:
# For each contig, list every overlapping k-mer of length 31 and store the
# list in a new 'kmers' column.
# A sequence of length n contains n - 31 + 1 windows of length 31, so the
# range must include the final start position n - 31 (the previous bound
# dropped the last k-mer of every contig). Contigs shorter than 31 still
# yield an empty list.
KMER_LENGTH = 31
A_contigs['kmers'] = A_contigs['contig'].apply(
    lambda seq: [seq[i:i + KMER_LENGTH] for i in range(len(seq) - KMER_LENGTH + 1)]
)


In [None]:
# Record how many k-mers each contig's list holds.
A_contigs['kmer_count'] = A_contigs['kmers'].map(len)

In [None]:
# Drop contigs that produced no k-mers. The explicit .copy() makes the result
# an independent frame, so assigning new columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
A_contigs = A_contigs[A_contigs['kmer_count'] > 0].copy()

In [None]:
# Per contig, collect the padj of each of its k-mers that is present in the
# index of the filtered `kmers` table; k-mers not in that index are skipped.
def _padj_for(contig_kmers):
    """Return the padj values of the entries of `contig_kmers` found in `kmers.index`."""
    return [kmers.loc[k, 'padj'] for k in contig_kmers if k in kmers.index]

A_contigs['padj'] = A_contigs['kmers'].apply(_padj_for)


In [None]:
# Load the tab-separated contig annotation table.
A_annotations = pd.read_csv(
    'A_contigs_annotation.tsv',
    sep='\t',
)


In [None]:
# Load the tab-separated intron annotation table.
A_intron_annotation = pd.read_csv(
    'A_contigs_annotation_intron.tsv',
    sep='\t',
)


In [None]:
# Stack the intron annotations underneath the general annotations.
annotation_frames = [A_annotations, A_intron_annotation]
A_annotations = pd.concat(annotation_frames)

In [None]:
# Remove rows that repeat the same contig / location / feature-type combination.
dedup_keys = ['contig', 'chromosome', 'start', 'end', 'feature_type']
A_annotations = A_annotations.drop_duplicates(subset=dedup_keys)

In [None]:
# Rows whose feature_type is the literal '*'.
A_unmapped = A_annotations.loc[A_annotations['feature_type'].eq('*')]

In [None]:
# Rows annotated with feature_type 'intron'.
A_intron = A_annotations.loc[A_annotations['feature_type'].eq('intron')]

In [None]:
# Rows annotated with feature_type 'CDS'.
A_CDS = A_annotations.loc[A_annotations['feature_type'].eq('CDS')]

In [None]:
# Rows annotated with feature_type 'exon'.
# NOTE(review): this group is named "Junction" but is selected by 'exon' — confirm the naming is intended.
A_Junction = A_annotations.loc[A_annotations['feature_type'].eq('exon')]

In [None]:
# Load the tab-separated homology annotation table.
A_homology = pd.read_csv(
    'A_contigs_homology_annotation.tsv',
    sep='\t',
)

In [None]:
# Rows of the homology table whose annotation is 'rRNA'.
A_rRNAs = A_homology.loc[A_homology['annotation'].eq('rRNA')]

In [None]:
# Reduce every category table to a single row per contig.
# Note that the rRNA result is stored as A_rRNA (built from A_rRNAs).
A_unmapped, A_intron, A_CDS, A_Junction, A_rRNA = (
    table.drop_duplicates(subset=['contig'])
    for table in (A_unmapped, A_intron, A_CDS, A_Junction, A_rRNAs)
)

In [None]:
# Give each category its own, independent, empty accumulator list.
A_unmapped_kmers, A_intron_kmers, A_CDS_kmers, A_Junction_kmers, A_rRNA_kmers = (
    [] for _ in range(5)
)

In [None]:
def _contigs_listed_in(annotation_table):
    """Return the rows of A_contigs whose 'contig' value occurs in `annotation_table['contig']`."""
    wanted = annotation_table['contig'].to_list()
    return A_contigs[A_contigs['contig'].isin(wanted)]

# Split the contig table by annotation category.
A_unmapped_contigs = _contigs_listed_in(A_unmapped)
A_intron_contigs = _contigs_listed_in(A_intron)
A_CDS_contigs = _contigs_listed_in(A_CDS)
A_Junction_contigs = _contigs_listed_in(A_Junction)
A_rRNA_contigs = _contigs_listed_in(A_rRNA)

In [None]:
# Append every contig's k-mer list to the accumulator of its category.
for category_contigs, accumulator in (
    (A_unmapped_contigs, A_unmapped_kmers),
    (A_intron_contigs, A_intron_kmers),
    (A_CDS_contigs, A_CDS_kmers),
    (A_Junction_contigs, A_Junction_kmers),
    (A_rRNA_contigs, A_rRNA_kmers),
):
    for contig_kmers in category_contigs['kmers']:
        accumulator.extend(contig_kmers)

In [None]:
# Collapse each accumulator to its distinct k-mers (set order, so not sorted).
A_unmapped_kmers = [*set(A_unmapped_kmers)]
A_intron_kmers = [*set(A_intron_kmers)]
A_CDS_kmers = [*set(A_CDS_kmers)]
A_Junction_kmers = [*set(A_Junction_kmers)]
A_rRNA_kmers = [*set(A_rRNA_kmers)]

In [None]:
def _significant_padj(kmer_list):
    """Return the padj of each entry of `kmer_list` that is present in `kmers.index`."""
    return [kmers.loc[k, 'padj'] for k in kmer_list if k in kmers.index]

# Look up padj for the distinct k-mers of every category.
A_unmapped_kmers_padj = _significant_padj(A_unmapped_kmers)
A_intron_kmers_padj = _significant_padj(A_intron_kmers)
A_CDS_kmers_padj = _significant_padj(A_CDS_kmers)
A_Junction_kmers_padj = _significant_padj(A_Junction_kmers)
A_rRNA_kmers_padj = _significant_padj(A_rRNA_kmers)

In [None]:
# Put the padj values on a -log10 scale, so a smaller padj gives a larger value.
# NOTE(review): a padj of exactly 0 would map to +inf here — confirm none occur.
import numpy as np

def _neg_log10(values):
    """Return -log10(v) for every v in `values`, as a list."""
    return [-np.log10(v) for v in values]

A_unmapped_kmers_padj_norm = _neg_log10(A_unmapped_kmers_padj)
A_intron_kmers_padj_norm = _neg_log10(A_intron_kmers_padj)
A_CDS_kmers_padj_norm = _neg_log10(A_CDS_kmers_padj)
A_Junction_kmers_padj_norm = _neg_log10(A_Junction_kmers_padj)
A_rRNA_kmers_padj_norm = _neg_log10(A_rRNA_kmers_padj)

In [None]:
import plotly.graph_objects as go
import plotly.io as pio

# One list of -log10(padj) values per category, in display order, followed by
# the category names in the same order.
data = [
    A_Junction_kmers_padj_norm,
    A_CDS_kmers_padj_norm,
    A_intron_kmers_padj_norm,
    A_unmapped_kmers_padj_norm,
    A_rRNA_kmers_padj_norm,
]
labels = ['Junction', 'CDS', 'Intron', 'Unmapped', 'rRNA']

# Build the figure with one box trace per category.
fig = go.Figure(
    data=[go.Box(y=values, name=category) for values, category in zip(data, labels)]
)

# Restrict the displayed y-axis to [0, 15].
fig.update_yaxes(range=[0, 15])

# White background, titled axes, and black axis lines (width 2) with the grid on.
axis_style = dict(showgrid=True, zeroline=False, showline=True, linecolor='black', linewidth=2)
fig.update_layout(
    yaxis_title='-log10(padj)',
    xaxis_title='Category',
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(size=14),
    xaxis=dict(axis_style),
    yaxis=dict(axis_style),
)

fig.show()
