In [2]:
import pandas as pd

# Read the table of loss-of-function (LOF) mutations.
damage_df = pd.read_csv("LOF mutes.csv")

# Distinct gene symbols that appear in the LOF table.
uniq_genes = pd.unique(damage_df["Hugo_Symbol"])

In [11]:
# Only these columns are read from the per-cohort mutation files;
# passing them as `usecols` keeps everything else out of memory.
columns_to_load = [
    # mutation / gene identifiers
    "Hugo_Symbol",
    "mut_id",
    "Tumor_Sample_Barcode",
    # genomic coordinates
    "Chromosome",
    "Start_Position",
    "End_Position",
    # alleles
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    # variant annotation
    "Variant_Classification",
    "Variant_Type",
    "HGVSp_Short",
    "case_id",
    "PolyPhen",
]

# One frame per cohort, both restricted to the same column subset.
luad_df = pd.read_csv(
    "LUAD_mutations.csv",
    usecols=columns_to_load,
)
lusc_df = pd.read_csv(
    "LUSC_mutations.csv",
    usecols=columns_to_load,
)

In [12]:
# Tag every row with the cohort it was loaded from, so the origin
# survives once the two frames are stacked into one table.
luad_df["Cancer_Type"], lusc_df["Cancer_Type"] = "LUAD", "LUSC"

In [10]:
# Stack the LUAD and LUSC rows into a single table with a fresh 0..n-1 index.
# NOTE(review): this cell's recorded execution count is lower than those of the
# cells that load and label the cohort frames; re-run it after them so that
# tcga_df is built from the current frames (including "Cancer_Type").
tcga_df = pd.concat(
    (luad_df, lusc_df),
    axis=0,
    ignore_index=True,
)

In [13]:
# Cohort sizes: number of distinct patients (case_id) per cancer type.
# nunique() ignores missing case_id values.
n_luad = luad_df["case_id"].nunique()
n_lusc = lusc_df["case_id"].nunique()

# Distinct patients across both cohorts. Counted with nunique() as well so that
# missing case_id values are skipped exactly as for n_luad / n_lusc
# (building a Python set would count NaN as an extra patient).
n_total = pd.concat(
    [luad_df["case_id"], lusc_df["case_id"]],
    ignore_index=True,
).nunique()

In [14]:
# Lookup table: mut_id -> confidence label.
# The label column is spelled "considered_dameged" here; it has to match the
# header of the LOF file exactly, so the spelling is kept as-is.
# If a mut_id occurs more than once, the last row wins.
confidence_map = {
    mut_id: level
    for mut_id, level in zip(damage_df["mut_id"], damage_df["considered_dameged"])
}

# Every mutation id that has a confidence label.
lof_mut_ids = set(confidence_map)

In [30]:
def compute_lof_stats_by_gene(conf_levels, mutations=None, confidence_by_mut=None, cohort_sizes=None):
    """Count patients carrying LOF mutations per gene and cancer type.

    Parameters
    ----------
    conf_levels : list-like
        Confidence labels to keep; a mutation is counted only if its label
        in the mut_id -> confidence lookup is one of these.
    mutations : pandas.DataFrame, optional
        Mutation table with "Hugo_Symbol", "mut_id", "case_id" and
        "Cancer_Type" columns. Defaults to the notebook-level ``tcga_df``.
    confidence_by_mut : dict, optional
        Lookup mut_id -> confidence label. Defaults to the notebook-level
        ``confidence_map``.
    cohort_sizes : dict, optional
        Patient totals under the keys "LUAD", "LUSC" and "total". Defaults to
        the notebook-level ``n_luad``, ``n_lusc`` and ``n_total``.

    Returns
    -------
    pandas.DataFrame
        One row per gene with: the gene symbol (in a column named "index"),
        one patient-count column per cancer type ("LUAD" and "LUSC" are always
        present), "Total_LOF_Patients", the three cohort-size columns and the
        three frequency columns.
    """
    # Fall back to the notebook globals when nothing is injected.
    if mutations is None:
        mutations = tcga_df
    if confidence_by_mut is None:
        confidence_by_mut = confidence_map
    if cohort_sizes is None:
        cohort_sizes = {"LUAD": n_luad, "LUSC": n_lusc, "total": n_total}
    luad_size = cohort_sizes["LUAD"]
    lusc_size = cohort_sizes["LUSC"]
    overall_size = cohort_sizes["total"]

    # Keep only mutations that have a confidence label, then only the
    # requested confidence levels.
    lof_rows = mutations[mutations["mut_id"].isin(set(confidence_by_mut))]
    confidence = lof_rows["mut_id"].map(confidence_by_mut)
    selected = lof_rows[confidence.isin(conf_levels)]

    # A patient counts once per gene and cancer type, however many
    # qualifying mutations they carry in that gene.
    patient_gene = selected[["Hugo_Symbol", "case_id", "Cancer_Type"]].drop_duplicates()

    # Rows: genes; columns: cancer types; values: number of distinct patients.
    gene_counts = (
        patient_gene.groupby(["Hugo_Symbol", "Cancer_Type"])["case_id"]
        .nunique()
        .unstack(fill_value=0)
    )

    # Guarantee both cohort columns exist even when no patient of a cohort
    # survives the filter, so every result table has the same schema.
    cohort_columns = sorted(set(gene_counts.columns) | {"LUAD", "LUSC"})
    gene_counts = gene_counts.reindex(columns=cohort_columns, fill_value=0)

    # Distinct patients per gene across all cancer types. Counting unique
    # case_ids (rather than summing the cohort columns) keeps the numerator
    # consistent with the "total" cohort size, which is a count of distinct
    # patients: a case_id present in both cohorts is counted once.
    patients_per_gene = patient_gene.groupby("Hugo_Symbol")["case_id"].nunique()
    gene_counts["Total_LOF_Patients"] = patients_per_gene.reindex(gene_counts.index, fill_value=0)

    # Cohort sizes, repeated on every row.
    # NOTE(review): "LAUD"/"LASC" look like typos for LUAD/LUSC; the names are
    # kept unchanged so existing consumers of the exported tables keep working.
    gene_counts["Total_Patients"] = overall_size
    gene_counts["Total_Patients_LAUD"] = luad_size
    gene_counts["Total_Patients_LASC"] = lusc_size

    # Fraction of each cohort (and of all patients) with a LOF hit in the gene.
    gene_counts["LUAD_LOF_Frequency"] = gene_counts["LUAD"] / luad_size
    gene_counts["LUSC_LOF_Frequency"] = gene_counts["LUSC"] / lusc_size
    gene_counts["Total_LOF_Frequency"] = gene_counts["Total_LOF_Patients"] / overall_size

    # NOTE(review): clearing the index name makes reset_index() emit the gene
    # symbols in a column called "index" (not "Hugo_Symbol"); kept as-is to
    # preserve the existing output schema.
    gene_counts.index.name = None
    return gene_counts.reset_index()


In [31]:
# Three nested summaries, each adding one lower confidence tier to the last.
table_high = compute_lof_stats_by_gene(
    conf_levels=["yes high confidence"],
)
table_mid_high = compute_lof_stats_by_gene(
    conf_levels=["yes high confidence", "yes mid confidence"],
)
table_all = compute_lof_stats_by_gene(
    conf_levels=[
        "yes high confidence",
        "yes mid confidence",
        "yes low confidence",
    ],
)

In [33]:
# Write each summary table to its own CSV file, without the row index.
table_high.to_csv(path_or_buf="lof_stats_high_conf.csv", index=False)
table_mid_high.to_csv(path_or_buf="lof_stats_mid_high_conf.csv", index=False)
table_all.to_csv(path_or_buf="lof_stats_all_conf.csv", index=False)