In [None]:
import pandas as pd

# Input files configuration.
# Every annotation file lives in the same directory and follows the same
# naming pattern, so the list is derived from a (gene, chromosome) table.
_group_files_dir = (
    "/home/agaro/verma_shared/projects/HCC/"
    "PMBB_6Gene_Burden_Analysis_Plan_100924/group_files"
)
_gene_chromosomes = [
    ("msh6", 2),
    ("pms2", 7),
    ("brca2", 13),
    ("fanca", 16),
    ("brip1", 17),
    ("chek2", 22),
]
input_files = [
    {
        "file_path": (
            f"{_group_files_dir}/vep_annot_{gene_name}_chr{chrom}"
            "_with_curated_list_cols.txt"
        ),
        "gene": gene_name,
        "chr": chrom,
    }
    for gene_name, chrom in _gene_chromosomes
]

# Per-gene combined result tables, keyed by gene name (filled in later).
combined_dataframes = dict()

# Sanity check counts, keyed first by annotation type and then by gene name.
# Each annotation type gets its own, independent dictionary.
sanity_check_results = {
    annotation_type: {}
    for annotation_type in ("pLoF", "damaging_missense")
}

def print_sanity_check_results(sanity_check_results):
    """Print the nested sanity check counts as an indented report.

    Args:
        sanity_check_results: Mapping of annotation type -> gene ->
            {count name: count value}.

    The report has one header line per annotation type, one line per gene
    beneath it, and one line per count beneath the gene. A blank line
    follows each gene's counts for readability.
    """
    report_lines = []
    for annotation_type in sanity_check_results:
        report_lines.append(f"Annotation Type: {annotation_type}")
        per_gene = sanity_check_results[annotation_type]
        for gene in per_gene:
            report_lines.append(f"  Gene: {gene}")
            report_lines.extend(
                f"    {count_type}: {count_value}"
                for count_type, count_value in per_gene[gene].items()
            )
            report_lines.append("")  # blank separator line after each gene

    # Emit one print per line so an empty input prints nothing at all.
    for line in report_lines:
        print(line)

# Consequence groups and score cut-offs shared by the variant filters and the
# sanity-check counts below, so the two cannot drift apart.
# NOTE(review): Consequence is matched exactly via isin(); if a row holds
# several comma-separated consequence terms it will not match - confirm the
# input files only ever contain a single term per row.
SPLICE_CONSEQUENCES = [
    "splice_donor_variant", "splice_acceptor_variant", "splice_region_variant"
]
DAMAGING_MISSENSE_CONSEQUENCES = [
    "missense_variant", "start_lost", "stop_lost",
    "inframe_insertion", "inframe_deletion",
    "splice_region_variant", "splice_donor_variant", "splice_acceptor_variant"
]
SPLICEAI_DS_MIN = 0.2
REVEL_MIN = 0.773
CADD_PHRED_MIN = 28.1

# Columns kept from each annotation file.
RELEVANT_COLUMNS = [
    "Uploaded_variation", "Location", "Allele", "Consequence", "SYMBOL", "CLIN_SIG",
    "LoF", "SpliceAI_pred", "REVEL_score", "MANE_SELECT",
    "CADD_RAW_RawScore_merge", "CADD_PHRED_merge", "clinvar_hcc_inc", "clinvar_hcc_excl"
]

# Field names of the '|'-separated SpliceAI_pred string, in order.
SPLICEAI_DELTA_SCORE_COLUMNS = ['DS_AG', 'DS_AL', 'DS_DG', 'DS_DL']
SPLICEAI_NUMERIC_COLUMNS = SPLICEAI_DELTA_SCORE_COLUMNS + ['DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']
SPLICEAI_COLUMNS = ['SYMBOL'] + SPLICEAI_NUMERIC_COLUMNS

# Process each input file: classify its variants as pLoF or damaging missense,
# record per-criterion counts, and store the combined table under the gene name.
for input_file in input_files:
    gene = input_file['gene']

    # Load the tab-separated annotation table
    df = pd.read_csv(input_file["file_path"], sep='\t')

    # Select relevant columns
    subset_df = df[RELEVANT_COLUMNS]

    # Keep only rows whose MANE_SELECT value is not '-'
    filtered_df = subset_df[subset_df['MANE_SELECT'] != '-']

    # Split the SpliceAI prediction string into its nine fields; the literal
    # text 'None' is masked to NaN so the score fields can be cast to float.
    splice = filtered_df['SpliceAI_pred'].str.split('|', expand=True)
    splice.columns = SPLICEAI_COLUMNS
    splice = splice.mask(splice == 'None')
    splice[SPLICEAI_NUMERIC_COLUMNS] = splice[SPLICEAI_NUMERIC_COLUMNS].astype(float)

    # The per-variant SpliceAI score is the maximum of the four delta scores
    splice['SpliceAI_DS'] = splice[SPLICEAI_DELTA_SCORE_COLUMNS].max(axis=1)

    # Both frames are re-indexed 0..n-1 so they align row by row.
    # NOTE(review): this leaves two columns named 'SYMBOL' in the result (one
    # from the annotation table, one from SpliceAI_pred); kept as in the
    # original output, but selecting 'SYMBOL' downstream yields a DataFrame.
    filtered_df = pd.concat([filtered_df.reset_index(drop=True), splice.reset_index(drop=True)], axis=1)

    # Identify pLoF variants: (ClinVar include flag OR LoF == "HC" OR a splice
    # consequence with a SpliceAI score at/above the cut-off) AND NOT flagged
    # by the ClinVar exclude column.
    splice_hit = (
        filtered_df['Consequence'].isin(SPLICE_CONSEQUENCES) &
        (filtered_df['SpliceAI_DS'] >= SPLICEAI_DS_MIN)
    )
    plof_mask = (
        ((filtered_df['clinvar_hcc_inc'] == 1) |
         (filtered_df['LoF'] == "HC") |
         splice_hit) &
        ~(filtered_df['clinvar_hcc_excl'] == 1)
    )
    plof_df = filtered_df[plof_mask].copy()

    # Coerce the score columns to numbers (non-numeric entries become NaN).
    # This happens after plof_df is taken, exactly as before, so the pLoF rows
    # keep these two columns as they were read from the file.
    filtered_df['CADD_PHRED_merge'] = pd.to_numeric(filtered_df['CADD_PHRED_merge'], errors='coerce')
    filtered_df['REVEL_score'] = pd.to_numeric(filtered_df['REVEL_score'], errors='coerce')

    # Identify damaging missense variants: not already pLoF, one of the listed
    # consequences, and at least one supporting score/flag.
    damaging_missense_mask = (
        ~plof_mask &
        filtered_df['Consequence'].isin(DAMAGING_MISSENSE_CONSEQUENCES) &
        ((filtered_df['REVEL_score'] >= REVEL_MIN) |
         (filtered_df['CADD_PHRED_merge'] >= CADD_PHRED_MIN) |
         (filtered_df['SpliceAI_DS'] >= SPLICEAI_DS_MIN) |
         (filtered_df['LoF'] == "LC"))
    )
    damaging_missense_df = filtered_df[damaging_missense_mask].copy()

    # Add 'anno' column to each DataFrame
    plof_df['anno'] = 'pLoF'
    damaging_missense_df['anno'] = 'damaging_missense'

    # Combine both DataFrames (pLoF rows first)
    combined_df = pd.concat([plof_df, damaging_missense_df], ignore_index=True)

    # Calculate pLoF counts: how many selected rows satisfy each criterion
    sanity_check_results["pLoF"][gene] = {
        "clinvar_hcc_inc": int((plof_df['clinvar_hcc_inc'] == 1).sum()),
        "LoF_HC": int((plof_df['LoF'] == "HC").sum()),
        "splice_variants": int((
            plof_df['Consequence'].isin(SPLICE_CONSEQUENCES) &
            (plof_df['SpliceAI_DS'] >= SPLICEAI_DS_MIN)
        ).sum()),
        "non_excluded": int((plof_df['clinvar_hcc_excl'] != 1).sum())
    }

    # Calculate damaging missense counts
    sanity_check_results["damaging_missense"][gene] = {
        "Not_plof": int((~damaging_missense_df.index.isin(plof_df.index)).sum()),
        "Consequence_missense_start_stop_indel_splicing": int(
            damaging_missense_df['Consequence'].isin(DAMAGING_MISSENSE_CONSEQUENCES).sum()
        ),
        "Revel_score": int((damaging_missense_df['REVEL_score'] >= REVEL_MIN).sum()),
        "CADD_PHRED_merge": int((damaging_missense_df['CADD_PHRED_merge'] >= CADD_PHRED_MIN).sum()),
        "Splice_variants": int((damaging_missense_df['SpliceAI_DS'] >= SPLICEAI_DS_MIN).sum()),
        # Fixed: this previously counted LoF == "HC", which does not match the
        # "LC" criterion used by the damaging-missense filter above.
        "LoF_LC": int((damaging_missense_df['LoF'] == "LC").sum())
    }

    # Remove duplicates based on the 'Uploaded_variation' column; the first
    # occurrence is kept, so a pLoF row wins over a damaging-missense row.
    combined_df = combined_df.drop_duplicates(subset='Uploaded_variation')

    # Store the combined DataFrame in the dictionary
    combined_dataframes[gene] = combined_df

# Print the sanity check results
print_sanity_check_results(sanity_check_results)