In [None]:
import pandas as pd

In [None]:
# Load the k-mer table. Later cells read its 'annotation' column and select
# further columns by the sample names listed in category_df['sample'].
df = pd.read_csv('kmer_dist_prev_div.csv')
# Preview the first rows (presumably rendered as notebook cell output).
df.head()

In [None]:
# If the annotation mentions rRNA anywhere, replace the whole annotation with
# the single label 'rRNA'.
# na=False makes rows with a missing annotation evaluate to False instead of
# NaN; a mask containing NaN cannot be used for boolean indexing with .loc.
df.loc[df['annotation'].str.contains('rRNA', na=False), 'annotation'] = 'rRNA'

In [None]:
# Load the sample metadata. The plotting code below uses its 'sample' column
# (names of columns in df) and its 'time' column (group label per sample).
category_df = pd.read_csv('all_subgroups_div_and_ss.csv')
# Preview the first rows (presumably rendered as notebook cell output).
category_df.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go

def plot_expression_boxplot_by_attribute(df, category_df, attribute_type):
    """Show a box plot of normalized k-mer counts per time group.

    Rows of ``df`` are selected when ``attribute_type`` appears as an exact
    token in the comma-separated ``annotation`` value. For each time group,
    the selected rows are averaged per sample column, that average is divided
    by the number of samples in the group and multiplied by 100, and only
    strictly positive values are kept for plotting.

    Args:
        df: Table with an ``annotation`` column and one column per sample
            name listed in ``category_df['sample']``. It is not modified.
        category_df: Table with a ``sample`` column (column names of ``df``)
            and a ``time`` column holding one of the group labels
            ``"ss"``, ``"1_2_wpi"``, ``"4_6_wpi"``, ``"8_12_wpi"``.
        attribute_type: Annotation token to select, e.g. ``'rRNA'``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Fixed group order so the x-axis is consistent between figures; each
    # group is paired with the colour at the same position.
    group_order = ["ss", "1_2_wpi", "4_6_wpi", "8_12_wpi"]
    colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA']

    # Build the row mask locally instead of writing an 'is_attribute' column
    # onto the caller's frame. Missing (non-string) annotations never match,
    # rather than raising on ``.split``.
    is_attribute = df['annotation'].apply(
        lambda x: isinstance(x, str) and attribute_type in x.split(',')
    ).astype(bool)

    # The row selection does not depend on the group, so do it once.
    attribute_rows = df[is_attribute]

    # Normalized values per group, keyed by group label.
    normalized_group_expressions = {}

    for group in group_order:
        # Sample names (columns of df) that belong to this time group.
        in_group = category_df['time'] == group
        group_barcodes = category_df.loc[in_group, 'sample'].values

        group_counts = attribute_rows[group_barcodes]
        size_factor = group_counts.shape[1]  # number of samples in the group

        # Per-sample mean over the selected rows, scaled by group size and
        # then by 100 (same operation order as before: divide, then multiply).
        normalized_expression = group_counts.mean(axis=0)
        normalized_expression = normalized_expression / size_factor
        normalized_expression = normalized_expression * 1e2

        # Keep only strictly positive values (NaN is dropped as well).
        values = normalized_expression.values
        normalized_group_expressions[group] = values[values > 0]

    # Create a Plotly figure
    fig = go.Figure()

    # Add one box per group, in the fixed order, each with its own colour.
    for group, color in zip(group_order, colors):
        group_values = normalized_group_expressions[group]
        # Echo the plotted values so they can be inspected alongside the plot.
        print(group, group_values)
        fig.add_trace(go.Box(
            y=group_values,
            name=group,
            marker_color=color,
        ))

    # Centre the title and set axis labels and fonts.
    fig.update_layout(
        title=f"Count of K-mers mapped to {attribute_type}",
        # title position center
        title_x=0.5,
        title_font=dict(size=16, family='Arial', color='black'),
        xaxis_title="Time",
        yaxis_title="Normalized Count Levels",
        xaxis=dict(tickfont=dict(size=12), title_font=dict(size=14)),
        yaxis=dict(tickfont=dict(size=12), title_font=dict(size=14)),
        # boxmode='group',  # Group boxes together
        template="simple_white",
        # showlegend=False
    )

    # Customize the box plot style
    fig.update_traces(line=dict(width=2),  # Box line width
                      marker=dict(size=5))  # Outlier marker size

    # Show the plot
    fig.show()
    # save figure as svg
    # fig.write_image(f"{attribute_type}_boxplot.svg")

# Example usage: draw one box-plot figure for each annotation class of
# interest (rRNA, mtRNA and miRNA), in that order.
attributes_to_plot = ['rRNA', 'mtRNA', 'miRNA']
for attribute in attributes_to_plot:
    plot_expression_boxplot_by_attribute(df, category_df, attribute)
