In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd

# 1. Load data

In [2]:
# Folder holding the raw input spreadsheets.
inputFolder = Path("../data/0_raw")

# GDR annotation table; its "SysID" column is the merge key used below.
GDRs = pd.read_excel(inputFolder / "V12_DR/V12_GDR_annotated_updated.xlsx")

# Deletion-mutant phenotype table; only the "0.4-0.55" sheet is read.
verified_genes_by_XH = pd.read_excel(
    inputFolder / "20230512_verifiedGenesFromXiaohui_Phenotypes.xlsx",
    sheet_name="0.4-0.55",
)

# Normalise both phenotype columns: trim surrounding whitespace and label
# missing entries explicitly as "No result".
for phenotype_column in ("deletion mutant (3d)", "deletion mutant (6d)"):
    verified_genes_by_XH[phenotype_column] = (
        verified_genes_by_XH[phenotype_column].str.strip().fillna("No result")
    )

# Keep only genes that have a 6-day phenotype; .copy() avoids
# SettingWithCopyWarning on the assignments below.
verified_genes_by_XH = verified_genes_by_XH[
    verified_genes_by_XH["deletion mutant (6d)"] != "No result"
].copy()

# Collapse compound 6-day labels onto their leading category, e.g. any label
# starting with "invisible" becomes plain "invisible". The order matters:
# "invisible" is handled before "microcolony".
for prefix in ("invisible", "microcolony"):
    has_prefix = verified_genes_by_XH["deletion mutant (6d)"].str.startswith(prefix)
    verified_genes_by_XH.loc[has_prefix, "deletion mutant (6d)"] = prefix

# Attach the GDR columns (everything up to and including "GDL_50") to every
# phenotyped gene. how="right" keeps all phenotype rows, even those without a
# matching SysID in the GDR table.
GDRs_of_verified_genes = pd.merge(
    GDRs.loc[:, :"GDL_50"],
    verified_genes_by_XH,
    left_on="SysID",
    right_on="Interval",
    how="right",
).drop(columns="Interval")


# Results folder for this notebook. parents=True also creates "../results"
# when it does not exist yet, so the cell works on a fresh checkout.
outputFolder = Path("../results/updated_GDR_of_verified_genes")
outputFolder.mkdir(parents=True, exist_ok=True)

# 2. GDR vs. phenotypes of the deletion mutants

In [4]:
# Order of the 6-day phenotype categories for the size encoding.
# Labels starting with "invisible" (e.g. "invisible and microcolony") are not
# listed because the loading cell collapses them to plain "invisible".
phenotype_order = [
    "No result",
    "invisible",
    "microcolony and invisible",
    "microcolony",
    "small colony and invisible",
    "small colony",
    "small colony and slightly small colony",
    "slightly small colony",
]

# Clicking a legend entry highlights the points of that 6-day phenotype.
# selection_point is the Altair 5 replacement for the deprecated selection_multi.
selection = alt.selection_point(fields=["deletion mutant (6d)"], bind="legend")

# One point per verified gene: V12 GDR (x) against the previous GDR (y).
phenotype_points = (
    alt.Chart(GDRs_of_verified_genes)
    .mark_point()
    .encode(
        # same scale for x and y
        x=alt.X("GDR_50:Q", scale=alt.Scale(domain=[0, 1.1]), title="GDR V12"),
        y=alt.Y("GDR:Q", scale=alt.Scale(domain=[0, 1.1]), title="Previous GDR"),
        # Gene dispensability as color
        color=alt.Color("Gene dispensability:N"),
        # 6-day phenotype as point size
        size=alt.Size(
            "deletion mutant (6d):N",
            scale=alt.Scale(range=[50, 500]),
            sort=phenotype_order,
        ),
        tooltip=[
            "SysID",
            "Gene name",
            "deletion mutant (3d)",
            "deletion mutant (6d)",
            "variation between deletion mutant (6d)",
            "time of the appearance of phenotype",
            "Product description",
            "GDR_50",
            "GDR",
            "GDR_40",
        ],
        # Fade the points that are not part of the current legend selection.
        opacity=alt.condition(selection, alt.value(1), alt.value(0.1)),
    )
)

# Diagonal y = x reference line: points on it have the same GDR in both versions.
identity_line = (
    alt.Chart(pd.DataFrame({"GDR_50": [0, 1.1], "GDR": [0, 1.1]}))
    .mark_line(color="black")
    .encode(x="GDR_50:Q", y="GDR:Q")
)

# Size, pan/zoom and the legend selection are set on the layered chart as a
# whole rather than on a single layer.
(
    (phenotype_points + identity_line)
    .properties(width=600, height=600)
    .interactive()
    .add_params(selection)
)



In [7]:
# Order the merged table by GDR_50, ascending (rebinding instead of inplace=True).
GDRs_of_verified_genes = GDRs_of_verified_genes.sort_values(by="GDR_50")

In [54]:
# Count how many genes fall into each 6-day variation category.
(
    GDRs_of_verified_genes
    .loc[:, "variation between deletion mutant (6d)"]
    .value_counts()
)

homogeneous      51
heterogeneous    12
Name: variation between deletion mutant (6d), dtype: int64

In [55]:
# Count how many genes show each 6-day deletion-mutant phenotype.
(
    GDRs_of_verified_genes
    .loc[:, "deletion mutant (6d)"]
    .value_counts()
)

small colony                              25
invisible                                 13
microcolony                                7
slightly small colony                      6
small colony and invisible                 5
small colony and slightly small colony     3
microcolony and invisible                  2
invisible and small colony                 1
invisible and microcolony                  1
Name: deletion mutant (6d), dtype: int64

In [8]:
# List the column names of the merged table.
GDRs_of_verified_genes.columns

Index(['SysID', 'Sample_count_40', 'Samples_40', 'Insertion_count_40',
       'Confidence_score_40', 'GDG_40', 'GDR_40', 'GDL_40', 'Sample_count_50',
       'Samples_50', 'Insertion_count_50', 'Confidence_score_50', 'GDG_50',
       'GDR_50', 'GDL_50', 'Gene name', 'deletion mutant (3d)',
       'deletion mutant (6d)', 'variation between deletion mutant (6d)',
       'time of the appearance of phenotype', 'Product description',
       'Gene dispensability', 'Insertion_density', 'ipkm', 'uipkm',
       'Sample_count', 'Samples', 'Insertion_count', 'Confidence_score', 'GDG',
       'GDR', 'GDL', 'GDR_Interval', 'GDR_Sub_Interval', 'Pheno_disc',
       'Viability', 'tmp_idx'],
      dtype='object')

In [12]:
# Export the identifying columns and both GDR values for the verified genes;
# floats are written with three decimals and the index is omitted.
export_path = Path("../tmp/verified_genes_GDR_50.xlsx")
# to_excel does not create missing folders, so make sure the target exists.
export_path.parent.mkdir(parents=True, exist_ok=True)
GDRs_of_verified_genes[
    ["SysID", "Gene name", "Gene dispensability", "GDR_50", "GDR"]
].to_excel(export_path, index=False, float_format="%.3f")