# 1. Imports

In [1]:
from pathlib import Path
from dataclasses import dataclass

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.patches import Circle
import seaborn as sns
from skimage import io

plt.rcParams.update({
    'axes.unicode_minus': True,  # Correctly display minus sign
    'figure.dpi': 300,            # Image resolution
    'axes.titlesize': 20,         # Title font size
    'axes.titleweight': 'bold', # Title font weight
    'axes.labelsize': 18,         # Axis label font size
    'axes.labelweight': 'bold',  # Axis label font weight
    'font.family': 'Arial',       # Set global font to Arial
    'font.size': 16,              # Base font size
    'legend.fontsize': 16,        # Legend font size
    'axes.spines.top': False,      # Remove top spine
    'axes.spines.right': False,     # Remove right spine
    'axes.linewidth': 3,   # Axis line width
    'lines.linewidth': 3,  # Line width
    'lines.solid_joinstyle': 'round',  # Line join style
    'lines.solid_capstyle': 'round',    # Line cap style
    'image.interpolation': 'nearest',  # Image interpolation
    'pdf.compression': 9  # PDF compression level (0-9)
})

# 2. Configuration

In [2]:
@dataclass
class config:
    verification_result_file: Path = Path("../results/all_combined_all_rounds_crop_summary_manual_annotated_with_genotyping_20260125.xlsx")
    colony_size_result_file: Path = Path("../results/nonE_strain_or_commented_E_genotyping_results.xlsx")

    def __post_init__(self):
        self.verification_result: pd.DataFrame = pd.read_excel(self.verification_result_file).query("Kept == 'YES'").copy()
        self.genotyping_summary = self.verification_result.query("Genotyping == 'YES' ").set_index(["gene_num", "gene_name", "round", "colony_id","date"])
        self.colony_size_result: pd.DataFrame = pd.read_excel(self.colony_size_result_file, index_col=[1,2,0,3,4])

# 3. Functions

In [3]:
# %% ------------------------------------ Functions ------------------------------------ %%
def tetrad_plate_plot(genes, cfg, relative_fraction_per_day, filtered_genotyping_results, return_fig=False):
    # gene_index = cfg.genotyping_summary[cfg.genotyping_summary.index.get_level_values("gene_name").isin(genes)].index
    index_names = ['gene_num', 'gene_name', 'round', 'colony_id', 'date']
    verification_result = cfg.verification_result.set_index(index_names)
    gene_index = cfg.verification_result.query("gene_name in @genes")[index_names]
    # Sort by gene order in the list
    # selected_colony = gene_index.to_frame().assign(
    selected_colony = gene_index.assign(
        gene_order=lambda df: df['gene_name'].map({gene: i for i, gene in enumerate(genes)})
    ).sort_values('gene_order').drop('gene_order', axis=1).set_index(index_names).index
    sorted_colony_size_result = cfg.colony_size_result.sort_index()
    filtered_genotyping_results = filtered_genotyping_results.sort_index()
    fig, axes = plt.subplots(len(selected_colony), 6, figsize=(30, 3 * len(selected_colony)))
    for row, colony in enumerate(selected_colony):
        for col, col_image in enumerate(["3d", "4d", "5d", "6d", "HYG"]):
            image_path = verification_result.loc[colony, f"{col_image}_image_path"]
            if col_image != "HYG":
                try:
                    image = io.imread(image_path)
                    axes[row, col].imshow(image, cmap="gray")
                except:
                    axes[row, col].axis("off")
                    continue
                try:
                    area_fraction = relative_fraction_per_day.loc[colony, f"area_day{col+3}"]
                    axes[row, col].set_title(f"area fraction (deletion/WT): {area_fraction}")
                    day=col + 3
                    colony_coordinates = sorted_colony_size_result.loc[colony]
                    for idx, region in colony_coordinates.iterrows():
                        cx, cy = region[f"grid_point_x_day{day}"], region[f"grid_point_y_day{day}"]
                        genotype = region["genotype"]
                        color = 'green' if genotype == "WT" else 'red'
                        area = region[f"area_day{day}"]
                        if area != 0:
                            circle = Circle((cx, cy), 30, edgecolor=color, facecolor='none', linewidth=2, alpha=0.4)
                        else:
                            circle = Circle((cx, cy), 30, edgecolor=color, facecolor='none', linewidth=2, alpha=0.4, linestyle="--")
                        axes[row, col].add_patch(circle)
                except KeyError:
                    pass
            else:
                try:
                    image = io.imread(image_path)
                    axes[row, col].imshow(image)
                except:
                    pass
                axes[row, col].set_title("_".join(map(str, colony)) + "\nHYG Plate")
            
            axes[row, col].axis("off")
        try:
            ax_area = axes[row, -1]
            colony_regions = filtered_genotyping_results.loc[colony]
            area_stat = colony_regions.set_index("genotype", append=True).filter(like="area_day")
            area_stat["area_day0"] = 0
            area_stat = area_stat.rename_axis("day", axis=1).stack().reset_index().rename(columns={0: "area"})
            area_stat["day_num"] = area_stat["day"].str.extract(r'day(\d+)').astype(int)
            last_day = area_stat["day_num"].max()
            # area_stat = area_stat.groupby(["col", "day"]).filter(lambda x: x.query("genotype == 'WT'").shape[0]/x.shape[0] == 0.5)
            last_day_WT_colonies_area_mean = area_stat.query("genotype == 'WT' and day_num == @last_day")["area"].median()
            area_stat["area[normalized]"] = area_stat["area"] / last_day_WT_colonies_area_mean
            area_stat = area_stat.query("genotype in ['WT', 'Deletion']")
            sns.lineplot(x="day_num", y="area[normalized]", hue="genotype", data=area_stat, ax=ax_area, palette={"WT": "green", "Deletion": "red"}, errorbar=("pi", 50), estimator="median")
            WT_count = colony_regions.query("genotype == 'WT'")["genotype"].count()
            deletion_count = colony_regions.query("genotype == 'Deletion'")["genotype"].count()
            ax_area.set_title(f"Colony Area Over Time:\n WT ({WT_count}) vs Deletion ({deletion_count})")
            ax_area.axhline(1, color='gray', linestyle='--')
        except KeyError:
            axes[row, -1].axis("off")
    plt.tight_layout()
    if return_fig:
        return fig
    else:
        plt.show()
        plt.close()            


In [4]:
def calculate_area_fraction_per_day(sub_df: pd.DataFrame):
    WT_median = sub_df.query("genotype == 'WT'")["area"].median()
    deletion_median = sub_df.query("genotype == 'Deletion'")["area"].median()
    return deletion_median / WT_median if WT_median != 0 else 0

def calculate_area_fraction(sub_df: pd.DataFrame):
    sub_df = sub_df.rename_axis("day", axis=1).stack().reset_index().rename(columns={0: "area"})
    return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T

# 4. Load data

In [5]:
# %% ------------------------------------ Main Function ------------------------------------ %%
cfg = config()
index = cfg.genotyping_summary.index
filtered_genotyping_results = cfg.colony_size_result.loc[index].groupby(["gene_num", "gene_name", "round", "colony_id","date", "col"]).filter(lambda x: x.query("genotype == 'WT'").shape[0]/x.shape[0] == 0.5)
area_table = filtered_genotyping_results.set_index("genotype", append=True).filter(like="area_day")
area_table.fillna(0, inplace=True)

area_table_unstack = area_table.groupby(by=area_table.index.names).median().unstack(level="genotype").reorder_levels([1,0], axis=1)
relative_fraction_per_day = (area_table_unstack["Deletion"] / area_table_unstack["WT"]).round(3).sort_index()
area_fraction = area_table.groupby("gene_name").apply(calculate_area_fraction).droplevel(-1).reset_index()

  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  return sub_df.groupby("day").apply(calculate_area_fraction_per_day).to_frame().T
  re

# 5. Summary

In [6]:
summary = cfg.verification_result.query("Kept =='YES'").drop_duplicates(subset=["systematic_id", "gene_name", "verification_phenotype", "verification_essentiality"])[["systematic_id", "gene_name", "verification_phenotype", "verification_essentiality", "Comments"]]
summary["gene_name"].value_counts()
summary.to_csv("../results/verification_summary_20260125.csv", index=False)

# 6. Analysis

In [7]:
summary_with_area_fraction = summary.merge(area_fraction, on="gene_name", how="left").copy()
E_genes_with_comments = summary_with_area_fraction.query("verification_phenotype == 'E' and Comments.notna()").sort_values("area_day6")["gene_name"].tolist()

In [8]:
fig = tetrad_plate_plot(E_genes_with_comments, cfg, relative_fraction_per_day, filtered_genotyping_results, return_fig=True)
plt.savefig("../results/tetrad_plate_plots_E_genes_with_comments_20260125.pdf", bbox_inches='tight', dpi=300)
plt.close()

In [9]:
phenotype_order = ["E","E,tiny colonies","E,small colonies","E,WT","tiny colonies","small colonies","WT"]
summary_with_area_fraction["verification_phenotype"].value_counts().sort_index(
    key = lambda x: x.map({v: i for i, v in enumerate(phenotype_order)})
)

verification_phenotype
E                   170
E,tiny colonies       4
E,small colonies     10
E,WT                  1
tiny colonies         6
small colonies      169
WT                   50
Leu-condition         1
Name: count, dtype: int64

In [10]:
E_and_tiny_colonies_genes = summary_with_area_fraction.query("verification_phenotype == 'E,tiny colonies'").sort_values("area_day6")["gene_name"].tolist()
fig = tetrad_plate_plot(E_and_tiny_colonies_genes, cfg, relative_fraction_per_day, filtered_genotyping_results, return_fig=True)
plt.savefig("../results/tetrad_plate_plots_E_and_tiny_colonies_genes_20260125.pdf", bbox_inches='tight', dpi=300)
plt.close()

In [11]:
E_and_small_colonies_genes = summary_with_area_fraction.query("verification_phenotype == 'E,small colonies'").sort_values("area_day6")["gene_name"].tolist()
fig = tetrad_plate_plot(E_and_small_colonies_genes, cfg, relative_fraction_per_day, filtered_genotyping_results, return_fig=True)
plt.savefig("../results/tetrad_plate_plots_E_and_small_colonies_genes_20260125.pdf", bbox_inches='tight', dpi=300)
plt.close()

In [12]:
tiny_colonies_genes = summary_with_area_fraction.query("verification_phenotype == 'tiny colonies'").sort_values("area_day6")["gene_name"].tolist()
fig = tetrad_plate_plot(tiny_colonies_genes, cfg, relative_fraction_per_day, filtered_genotyping_results, return_fig=True)
plt.savefig("../results/tetrad_plate_plots_tiny_colonies_genes_20260125.pdf", bbox_inches='tight', dpi=300)
plt.close()

In [13]:
top_small_colonies_genes = summary_with_area_fraction.query("verification_phenotype == 'small colonies'").sort_values("area_day6")["gene_name"].tolist()[:20]
fig = tetrad_plate_plot(top_small_colonies_genes, cfg, relative_fraction_per_day, filtered_genotyping_results, return_fig=True)
plt.savefig("../results/tetrad_plate_plots_top_small_colonies_genes_20260125.pdf", bbox_inches='tight', dpi=300)
plt.close()