# Database construction

In [2]:
import pandas as pd
import itertools
from itertools import combinations
import os
import glob
from pandas.core.nanops import nanall
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import tree
from XGBoost_Model import *
import sns
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency

In [3]:
# Read the tab-separated study files from the pan_origimed_2020 directory.
data_clinical_patient = pd.read_csv('pan_origimed_2020/data_clinical_patient.txt', sep="\t")
data_clinical_sample = pd.read_csv('pan_origimed_2020/data_clinical_sample.txt', sep="\t")
data_cna_log2 = pd.read_csv('pan_origimed_2020/data_cna_log2.txt', sep="\t")
data_cna = pd.read_csv('pan_origimed_2020/data_cna.txt', sep="\t")
# header=2 uses the third line of the file as the column header (the lines before it are skipped).
# Exon_Number is forced to the pandas "string" dtype so its values are kept as text.
data_mutations = pd.read_csv('pan_origimed_2020/data_mutations.txt', sep="\t", header=2, dtype={"Exon_Number": "string"})
data_sv = pd.read_csv('pan_origimed_2020/data_sv.txt', sep="\t")

In [4]:
# Discard the first four rows of each clinical table (positional slice).
# Each table is reassigned from itself, so re-running this cell removes four more rows.
data_clinical_patient = data_clinical_patient.iloc[4:]
data_clinical_sample = data_clinical_sample.iloc[4:]

In [5]:
data_clinical_patient = data_clinical_patient.rename(columns={'#Patient Identifier': 'PATIENT_ID'})

In [6]:
data_clinical_patient.head()

Unnamed: 0,PATIENT_ID,Sex,Diagnosis Age,Smoke Status,Treatment
4,Patient0001,Female,67,Unknown,Other_Treatments
5,Patient0002,Male,75,Unknown,Treatment-naive
6,Patient0003,Female,45,Unknown,Treatment-naive
7,Patient0004,Male,70,Unknown,Treatment-naive
8,Patient0005,Male,53,Unknown,Treatment-naive


In [7]:
# Build a SAMPLE_ID for each patient so this table can be merged with the other tables.
# x[7:] drops the first 7 characters of PATIENT_ID (the length of "Patient"),
# e.g. "Patient0001" -> "P-0001".
data_clinical_patient["SAMPLE_ID"] = data_clinical_patient["PATIENT_ID"].apply(lambda x: "P-" + x[7:])

In [8]:
# make all sample id header name the same - "SAMPLE_ID"
data_clinical_sample.rename(columns={"Sample Identifier": 'SAMPLE_ID'}, inplace=True)
data_mutations.rename(columns={"Tumor_Sample_Barcode": 'SAMPLE_ID'}, inplace=True)
data_sv.rename(columns={"Sample_Id": 'SAMPLE_ID'}, inplace=True)

In [9]:
# Outer-merge the patient, sample, mutation and structural-variant tables on SAMPLE_ID.
# NOTE(review): data_mutations and data_sv presumably each hold several rows per sample; if so,
# the last merge pairs every mutation row with every SV row of the same sample — confirm this
# row multiplication is intended.
merged_clinical_data = data_clinical_patient.merge(data_clinical_sample, on="SAMPLE_ID", how='outer')
merged_mutations_data = merged_clinical_data.merge(data_mutations, on="SAMPLE_ID", how='outer')
merged_all_data = merged_mutations_data.merge(data_sv, on="SAMPLE_ID", how='outer')

In [10]:
merged_all_data["SNP_event"] = merged_all_data["Reference_Allele"].fillna("").astype(str) + ">" + merged_all_data["Tumor_Seq_Allele2"].fillna("").astype(str)


In [11]:
# Columns kept for modelling. `.copy()` makes data_for_model an independent frame, so the
# column assignments made to it in later cells never write through a slice of
# merged_all_data (this is what triggers pandas' SettingWithCopyWarning).
data_for_model = merged_all_data[["PATIENT_ID", "Cancer Type", 'Cancer Type Detailed', 'Tumor Stage',
                                'Sample Type', "Sex", "Diagnosis Age", "Smoke Status", "TMB (nonsynonymous)",
                                "Hugo_Symbol", "Chromosome", "Start_Position", "End_Position",
                                "Consequence", "Variant_Type", "SNP_event", "Protein_position", "Codons",
                                "Exon_Number","VAR_TYPE_SX", "Site1_Hugo_Symbol", "Site2_Hugo_Symbol","Event_Info"]].copy()

In [12]:
data_for_model["Exon_Number"].isnull().sum()

np.int64(15278)

In [13]:
data_for_model.head(20)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,5-Feb,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,3-Mar,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,16-Jul,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,QKI,...,missense_variant,SNP,G>A,47.0,Gaa/Aaa,8-Jan,Substitution/Indel,,,
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,FBXW7,...,missense_variant,SNP,G>A,505.0,Cgc/Tgc,12-Oct,Substitution/Indel,,,
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,PTPN11,...,intron_variant,DEL,TTTC>-,,,,Substitution/Indel,,,
8,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,GLI1,...,missense_variant,SNP,G>T,488.0,Gct/Tct,12-Nov,Substitution/Indel,,,
9,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,ATM,...,"splice_region_variant,intron_variant",SNP,G>A,1026.0,,,Substitution/Indel,,,


In [14]:
# Repair Exon_Number values that were turned into date-like text (e.g. '5-Feb', 'Sep-89').

# Month abbreviation -> (month number, largest valid day of that month).
# February allows 29 so that a leap-day value such as '29-Feb' is still recognised.
_EXON_MONTHS = {
    'jan': (1, 31), 'feb': (2, 29), 'mar': (3, 31), 'apr': (4, 30),
    'may': (5, 31), 'jun': (6, 30), 'jul': (7, 31), 'aug': (8, 31),
    'sep': (9, 30), 'oct': (10, 31), 'nov': (11, 30), 'dec': (12, 31),
}


def _is_ascii_digits(text, min_len, max_len):
    """Return True when `text` is made only of ASCII digits and has an allowed length."""
    return min_len <= len(text) <= max_len and text.isascii() and text.isdigit()


def convert_exon_number(val):
    """
    Convert a date-like exon string back to a zero-padded 'MM/NN' string.

    Two shapes are recognised (month abbreviations are matched case-insensitively):
    - 'Mon-YY'  (e.g. 'Sep-89') -> 'MM/YY' (e.g. '09/89')
    - 'D-Mon' or 'DD-Mon' (e.g. '14-Sep') -> 'MM/DD' (e.g. '09/14')

    The month table is built in, so the result does not depend on the process locale
    and '29-Feb' is handled (parsing it as a date with a default non-leap year fails).

    Parameters:
    - val: the raw cell value; may be a string, None, NaN or pd.NA.

    Returns:
    - the converted string, or `val` unchanged when it is not a string or does not
      match either shape (for example '57/59').
    """
    if not isinstance(val, str):
        # Missing values (None / NaN / pd.NA) and other non-text values pass through.
        return val

    left, dash, right = val.partition('-')
    if not dash:
        return val

    # 'Mon-YY': month abbreviation followed by exactly two digits.
    month = _EXON_MONTHS.get(left.lower())
    if month is not None and _is_ascii_digits(right, 2, 2):
        return f"{month[0]:02d}/{right}"

    # 'DD-Mon': one or two digits forming a valid day of that month.
    month = _EXON_MONTHS.get(right.lower())
    if month is not None and _is_ascii_digits(left, 1, 2) and 1 <= int(left) <= month[1]:
        return f"{month[0]:02d}/{int(left):02d}"

    return val

In [15]:
# Run convert_exon_number over every Exon_Number entry and store the result back in the column.
data_for_model.loc[:, 'Exon_Number'] = data_for_model['Exon_Number'].map(convert_exon_number)

In [16]:
data_for_model["Exon_Number"].isnull().sum()

np.int64(15278)

In [17]:
data_for_model["Cancer Type"].value_counts()

Cancer Type
Colorectal Carcinoma                     28396
Non Small Cell Lung Cancer               19526
Gastric Cancer                           10832
Liver Hepatocellular Carcinoma            9633
Esophageal Carcinoma                      6298
Intrahepatic Cholangiocarcinoma           3939
Small Cell Lung Cancer                    3025
Pancreatic Cancer                         2957
Extrahepatic Cholangiocarcinoma           2883
Breast Carcinoma                          2350
Soft Tissue Sarcoma                       2270
Gallbladder Carcinoma                     2192
Ovarian Carcinoma                         1851
Urothelial Carcinoma                      1631
Kidney Renal Cell Carcinoma               1500
Uterine Corpus Endometrial Carcinoma      1358
Cancer of Unknown Primary                 1068
Head and Neck Carcinoma                    997
Carcinoma of Uterine Cervix                787
Small Bowel Carcinoma                      755
Bone Sarcoma                               601
G

In [18]:
def create_age_range(x):
    """
    Map a numeric age to the label of its ten-year bucket.

    Upper bounds are inclusive: values <= 10 give "0-10", values <= 20 give "11-20",
    and so on up to "71-80". Any value that is not <= 80 gives "80+".
    """
    # (inclusive upper bound, label), checked in ascending order.
    buckets = (
        (10, "0-10"),
        (20, "11-20"),
        (30, "21-30"),
        (40, "31-40"),
        (50, "41-50"),
        (60, "51-60"),
        (70, "61-70"),
        (80, "71-80"),
    )
    for upper_bound, label in buckets:
        if x <= upper_bound:
            return label
    return "80+"

In [19]:
data_for_model.loc[:, 'Diagnosis Age'] = data_for_model['Diagnosis Age'].astype(int).apply(create_age_range).astype("category")

In [21]:
tt = dict(enumerate(data_for_model["Exon_Number"].astype('category').cat.categories))
tt.values()
# data[object_columns] = data[object_columns].astype('category')

dict_values(['01/01', '01/02', '01/03', '01/04', '01/05', '01/06', '01/07', '01/08', '01/09', '01/10', '01/11', '01/12', '01/13', '01/14', '01/15', '01/16', '01/17', '01/18', '01/19', '01/20', '01/21', '01/22', '01/23', '01/24', '01/25', '01/26', '01/27', '01/28', '01/29', '01/30', '01/31', '01/33', '01/34', '01/35', '01/36', '01/37', '01/38', '01/40', '01/43', '01/45', '01/47', '01/49', '01/51', '01/52', '01/54', '01/57', '01/59', '01/79', '01/87', '01/89', '01/91', '02/02', '02/03', '02/04', '02/05', '02/06', '02/07', '02/08', '02/09', '02/10', '02/11', '02/12', '02/13', '02/14', '02/15', '02/16', '02/17', '02/18', '02/19', '02/20', '02/21', '02/22', '02/23', '02/24', '02/25', '02/26', '02/27', '02/28', '02/29', '02/30', '02/31', '02/32', '02/33', '02/34', '02/35', '02/36', '02/37', '02/38', '02/39', '02/40', '02/42', '02/43', '02/44', '02/45', '02/46', '02/49', '02/51', '02/52', '02/54', '02/57', '02/58', '02/59', '02/63', '02/79', '02/87', '02/89', '02/91', '03/03', '03/04', '03/05

In [22]:
# Keep cancer types that appear in at least 2000 rows of data_for_model
# (value_counts counts rows, not distinct patients), and keep only rows with a known chromosome.
cancer_counts = data_for_model['Cancer Type'].value_counts()
valid_cancer_types = cancer_counts[cancer_counts >= 2000].index
data_for_model = data_for_model[
    data_for_model['Cancer Type'].isin(valid_cancer_types)
    & data_for_model['Chromosome'].notna()
]

In [None]:
data_for_lift = data_for_model.copy()

In [None]:
data_for_lift["Consequence"].unique()

In [23]:
# One indicator column per consequence term: split the comma-separated 'Consequence'
# strings, one-hot encode the individual terms, then sum them back to one row per
# original index label. The original text column is replaced by these indicator columns.
dummy_vars = (
    data_for_model['Consequence']
    .str.split(',')
    .explode()
    .str.get_dummies()
    .groupby(level=0)
    .sum()
)
data_for_model = data_for_model.join(dummy_vars).drop('Consequence', axis=1)

In [24]:
data_for_model[data_for_model["PATIENT_ID"] == "Patient8178"]

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,start_retained_variant,stop_gained,stop_lost,stop_retained_variant,synonymous_variant,upstream_gene_variant
87594,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,TGFBR2,...,0,0,0,0,0,0,0,0,0,0
87595,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,POLE,...,0,0,0,0,0,0,0,0,0,0
87596,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,CTNNA1,...,0,0,0,0,0,0,0,0,0,0
87597,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,PTPN11,...,0,0,0,0,0,0,0,0,0,0
87598,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,SETD2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87797,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,BRAF,...,0,0,0,0,0,0,0,0,0,0
87798,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,PIK3C2B,...,0,0,0,0,0,0,0,0,0,0
87799,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,RECQL,...,0,0,0,0,0,0,0,0,0,0
87800,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,NSD1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
data_for_model.to_csv("pan_cancer_data_for_model.csv", index=False)

In [None]:
hypo_data = pd.read_csv("hypotheses.csv")

In [None]:
hypo_data_updates = hypo_data[hypo_data["support"] > 2].sort_values(["cancer_type", 'support'], ascending=[True, False])
hypo_data_updates.head()

In [None]:
def cancer_type_correlations(df):
    """
    Build one summary string per row of a hypotheses table.

    Each string has the form
    "<cancer_type>: <feature>=<value>, ..., Support: <support>" and lists only the
    feature columns whose value in that row is not null.

    Parameters:
    - df: DataFrame with columns "cancer_type" and "support"; every other column
      is treated as a feature.

    Returns:
    - list of str, one entry per row, in row order.
    """
    reserved = {"cancer_type", "support"}
    feature_columns = [name for name in df.columns if name not in reserved]

    summaries = []
    for _, record in df.iterrows():
        # Only features that are actually set for this row are reported.
        present = [
            f"{name}={record[name]}"
            for name in feature_columns
            if pd.notnull(record[name])
        ]
        joined = ", ".join(present)
        summaries.append(f"{record['cancer_type']}: {joined}, Support: {record['support']}")
    return summaries


In [None]:
corr_list = cancer_type_correlations(hypo_data_updates)

In [None]:
corr_list

Lift Calculation

In [None]:
cancer_type_dummy = data_for_lift['Cancer Type'].str.get_dummies().groupby(level=0).sum()
data_for_lift = data_for_lift.join(cancer_type_dummy)

In [None]:
data_for_lift.head()

In [None]:
# Combine columns to create specific mutation identifiers
# data_for_lift['Mutation'] = data_for_lift['Chromosome'] + "_" + data_for_lift['Start_Position'].astype(str) + "_" + data_for_lift['Variant_Type']
data_for_lift['Position'] = data_for_lift['Start_Position'].astype(str) + "-" + data_for_lift['End_Position'].astype(str)

In [None]:
data_for_lift.to_csv("data_for_lift.csv", index=False)

In [None]:
# Select a subset of columns to analyze (e.g., most relevant ones)
columns_to_combine = ['Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event', "Consequence", 'Exon_Number',
                      "Diagnosis Age", "TMB (nonsynonymous)", "Position", "Protein_position", "Codons", "VAR_TYPE_SX"]

In [None]:
cancer_probabilities = {cancer_type: data_for_lift[cancer_type].mean() for cancer_type in list(data_for_lift["Cancer Type"].unique())}

In [None]:
cancer_probabilities

In [None]:
    # Iterate over feature combinations
# for num_features in range(2, 6):
feature_combinations = list(combinations(columns_to_combine, 5))
feature_combinations

In [None]:
# Lift of every frequent combined-feature value A for every cancer type B:
#
#     lift(A, B) = P(A and B) / (P(A) * P(B))
#
# P(A) and P(A and B) are estimated over the rows whose combined value occurs at least
# 100 times; P(B) is taken from cancer_probabilities. The joint term is a true joint
# probability (count of rows with value A and cancer type B over all filtered rows).
# Normalising it within the cancer type instead would yield P(A | B) and inflate
# every lift by a factor of 1 / P(B).

lifts = []

for feature in feature_combinations:
    # The combined feature does not depend on the cancer type, so it is built once per
    # combination. Results are therefore appended combination by combination.
    combined_feature = data_for_lift[list(feature)].astype(str).agg('_'.join, axis=1)

    # Keep only combined values that are frequent enough to give a stable estimate.
    combined_counts = combined_feature.value_counts()
    valid_features = combined_counts[combined_counts >= 100].index

    if valid_features.empty:
        continue  # Skip if no valid combined features

    is_valid = combined_feature.isin(valid_features)
    filtered_data = combined_feature[is_valid]
    P_A = filtered_data.value_counts(normalize=True)

    for cancer_type, P_B in cancer_probabilities.items():
        # Rows of the filtered data that belong to this cancer type.
        is_cancer = (data_for_lift[cancer_type] == 1)[is_valid]

        joint_prob = (
            filtered_data[is_cancer].value_counts().reindex(P_A.index, fill_value=0)
            / len(filtered_data)
        )

        lift = (joint_prob / (P_A * P_B)).round(2)

        lifts.append((cancer_type, feature, lift))


In [None]:
# Lift of every frequent combined-feature value A for every cancer type B, using
# combinations of 2 to 5 columns from columns_to_combine:
#
#     lift(A, B) = P(A and B) / (P(A) * P(B))
#
# The joint term is the count of rows with value A and cancer type B over all filtered
# rows. Normalising within the cancer type would give P(A | B) and over-divide by P(B).
lifts = []
min_count = 100  # a combined value must occur at least this often to be scored

# P(B): share of rows belonging to each cancer type; independent of the feature choice.
cancer_type_priors = {
    cancer_type: data_for_lift[cancer_type].mean()
    for cancer_type in cancer_type_dummy.columns
}

for num_features in range(2, 6):
    feature_combinations = list(combinations(columns_to_combine, num_features))
    for feature in feature_combinations:
        # Join the selected columns into one "_"-separated string per row. This does
        # not depend on the cancer type, so it is built once per combination.
        combined_feature = data_for_lift[feature[0]].astype(str)
        for f in feature[1:]:
            combined_feature = combined_feature + "_" + data_for_lift[f].astype(str)

        P_A_counts = combined_feature.value_counts()

        # Filter combined features based on minimum count
        valid_features = P_A_counts[P_A_counts >= min_count].index
        is_valid = combined_feature.isin(valid_features)
        filtered_data = combined_feature[is_valid]

        # Probability of each combined value among the filtered rows
        P_A = filtered_data.value_counts(normalize=True)

        for cancer_type, P_B in cancer_type_priors.items():
            is_cancer = (data_for_lift[cancer_type] == 1)[is_valid]

            # Joint probability of the combined value and the cancer type
            joint = (
                filtered_data[is_cancer].value_counts().reindex(P_A.index, fill_value=0)
                / len(filtered_data)
            )

            lift = (joint / (P_A * P_B)).round(2)  # rounded for readability

            # One entry per (cancer type, feature combination): a Series of lifts per value
            lifts.append((cancer_type, feature, lift))

In [None]:
# Flatten the (cancer type, feature combination, lift Series) triples into one record
# per feature value, then order the table from highest to lowest lift.
lift_results = [
    {
        'Cancer Type': cancer_type,
        'Feature Pair': feature_pair,
        'Feature Value': feature_value,
        'Lift': lift_value,
    }
    for cancer_type, feature_pair, lift in lifts
    for feature_value, lift_value in lift.items()
]

lift_df = pd.DataFrame(lift_results).sort_values(by='Lift', ascending=False)

In [None]:
lift_df.sort_values(by='Lift', ascending=False)

In [None]:
# Rows for non-smokers with a TP53 G>A event.
# NOTE(review): `data` is not assigned in any cell above this one; it may come from the
# star import or from a cell further down — confirm which frame is meant.
filter_triple_data = data[
    (data["Smoke Status"] == "Nonsmoker")
    & (data["Hugo_Symbol"] == "TP53")
    & (data["SNP_event"] == "G>A")
]

In [None]:
def combine_features(data, feature_combination):
    """
    Build one string per row by joining the chosen columns' values with "_".

    Parameters:
    - data: DataFrame holding the columns to combine.
    - feature_combination: iterable of column names, joined in the given order.

    Returns:
    - Series of strings with the same index as `data`.
    """
    selected = data[list(feature_combination)]
    as_text = selected.astype(str)
    return as_text.agg('_'.join, axis=1)

In [None]:
def filter_and_compute_probabilities(combined_feature, data_for_lift, cancer_type, min_count):
    """
    Estimate P(A) and the joint probability P(A and B) for one cancer type.

    A ranges over the values of `combined_feature` that occur at least `min_count`
    times; B is the event that the indicator column `cancer_type` equals 1. Both
    probabilities are taken over the same set of rows (those with a frequent value),
    so joint_prob / P_A is the share of cancer type B among rows with value A.

    Parameters:
    - combined_feature: Series of combined feature strings, indexed like data_for_lift.
    - data_for_lift: DataFrame containing the 0/1 column named `cancer_type`.
    - cancer_type: name of the indicator column.
    - min_count: minimum number of occurrences for a value to be kept.

    Returns:
    - (P_A, joint_prob): two Series sharing the same index (the kept values), or
      (None, None) when no value reaches `min_count`.
    """
    # Step 1: Compute value counts and keep the frequent values
    combined_counts = combined_feature.value_counts()
    valid_features = combined_counts[combined_counts >= min_count].index

    # Step 2: Skip if no valid combined features
    if valid_features.empty:
        return None, None

    # Step 3: Filter the combined feature
    filtered_data = combined_feature[combined_feature.isin(valid_features)]

    # Step 4: Compute probabilities over the filtered rows
    P_A = filtered_data.value_counts(normalize=True)

    # Align the cancer indicator with the filtered rows explicitly.
    is_cancer = data_for_lift[cancer_type].reindex(filtered_data.index) == 1

    # Count rows with value A AND cancer type B, divided by ALL filtered rows.
    # Normalising inside the cancer subset would give P(A | B), not the joint probability.
    joint_prob = (
        filtered_data[is_cancer].value_counts().reindex(P_A.index, fill_value=0)
        / len(filtered_data)
    )

    return P_A, joint_prob


In [None]:
def calculate_lifts_for_cancer_type(data_for_lift, cancer_type, P_B, feature_combinations, min_count):
    """
    Compute lift Series for one cancer type, one per usable feature combination.

    For each combination the columns are merged with combine_features, the
    probabilities come from filter_and_compute_probabilities, and the lift is
    joint_prob / (P_A * P_B) rounded to two decimals. Combinations for which no
    probabilities are returned are skipped.

    Returns:
    - list of (cancer_type, feature_combination, lift Series) tuples.
    """
    results = []

    for combo in feature_combinations:
        merged_values = combine_features(data_for_lift, combo)
        P_A, joint_prob = filter_and_compute_probabilities(
            merged_values, data_for_lift, cancer_type, min_count
        )

        if P_A is not None and joint_prob is not None:
            lift_values = (joint_prob / (P_A * P_B)).round(2)
            results.append((cancer_type, combo, lift_values))

    return results

In [None]:
def compute_all_lifts(data_for_lift, cancer_probabilities, feature_combinations, min_count=100):
    """
    Collect lifts for every cancer type in `cancer_probabilities`.

    Parameters:
    - data_for_lift: DataFrame passed through to calculate_lifts_for_cancer_type.
    - cancer_probabilities: mapping of cancer type -> P_B.
    - feature_combinations: iterable of column-name tuples to evaluate.
    - min_count: minimum occurrences for a combined value to be scored.

    Returns:
    - one flat list with the results of all cancer types, in mapping order.
    """
    collected = []
    for cancer_type, P_B in cancer_probabilities.items():
        collected += calculate_lifts_for_cancer_type(
            data_for_lift, cancer_type, P_B, feature_combinations, min_count
        )
    return collected

In [None]:
lifts = compute_all_lifts(data_for_lift, cancer_probabilities, feature_combinations, min_count=100)

In [None]:
data_dr = pd.read_csv('data_for_rules.csv')

In [None]:
data_dr.columns

In [25]:
narrowed = pd.read_csv("narrowed_cancers_data.csv")

In [None]:
narrowed[narrowed.index == 529]

In [None]:
from sklearn.impute import KNNImputer
narrowed[['Current_Exon', 'Total_Exons']] = narrowed['Exon_Number'].str.split('/', expand=True)
narrowed[['Current_Exon', 'Total_Exons']] = narrowed[['Current_Exon', 'Total_Exons']].astype(float)

imputer = KNNImputer(n_neighbors=5)
narrowed[["Current_Exon", "Total_Exons"]] = imputer.fit_transform(narrowed[["Current_Exon", "Total_Exons"]])
# ransform(narrowed[["Exon_Number"]])

In [None]:
narrowed[["Exon_Number", "Current_Exon", "Total_Exons"]]

In [None]:
df = pd.read_csv("data_for_rules.csv")

In [None]:
df.head()

In [None]:
df = df[df["Diagnosis Age"] < 71.5]
df = df[df["Diagnosis Age"] > 47.5]
df = df[df["TMB (nonsynonymous)"] < 0.28]
df = df[df["Sex"] == "Female"]
df

In [None]:
df["Cancer Type"].drop_duplicates()

In [None]:
df[["PATIENT_ID", "Cancer Type"]].drop_duplicates()["Cancer Type"].value_counts()

In [None]:
df

In [None]:
df_rules = pd.read_csv("data_for_rules.csv")

In [None]:
df_rules['Exon_Number'].value_counts()

In [None]:
# Smoke Status - Convert to dummies
dummy_smoking = df_rules['Smoke Status'].str.get_dummies().groupby(level=0).sum()

# Hugo Symbol - Convert to dummies
dummy_hugo_symbol = df_rules['Hugo_Symbol'].str.get_dummies().groupby(level=0).sum()

# Variant Type - Convert to dummies
dummy_Variant_Type = df_rules['Variant_Type'].str.get_dummies().groupby(level=0).sum()

# SNP_event - Keep only top 100 most frequent values
top_100 = df_rules['SNP_event'].value_counts().nlargest(100).index
df_rules['SNP_event'] = df_rules['SNP_event'].where(df_rules['SNP_event'].isin(top_100), other=None)
dummy_snp_event = df_rules['SNP_event'].str.get_dummies().groupby(level=0).sum()

# Combine all dummy variables
dummy_vars = pd.concat([dummy_smoking, dummy_hugo_symbol, dummy_snp_event, dummy_Variant_Type], axis=1)

# Join with original DataFrame
df_rules = df_rules.join(dummy_vars)

# Drop original categorical columns
df_rules.drop(['Smoke Status', 'SNP_event', 'Hugo_Symbol', 'Variant_Type'], axis=1, inplace=True)

In [None]:
df_rules.head()

In [None]:
len(list(data_for_model["Event_Info"].unique()))

In [None]:
df_rules["Cancer Type"].unique()

In [None]:
df_rules['Exon_Number'] = df_rules['Exon_Number'].str.split('/').str[0].astype(int)

In [None]:
df_rules.drop('Codons', axis=1, inplace=True)

In [None]:
df_rules.head(10)

In [None]:
df_rules.to_csv("data_for_decision.csv", index=False)

In [None]:
df = data_for_model.copy()
df.drop(['Site1_Hugo_Symbol', 'Site2_Hugo_Symbol', 'Event_Info'], axis=1, inplace=True)

In [None]:
# Drop every row with a missing value in any column, then keep only the part of
# Exon_Number before the '/' as an integer.
# NOTE(review): data_for_model still contains Site1_Hugo_Symbol, Site2_Hugo_Symbol and
# Event_Info at this point (the previous cell dropped them only from the `df` copy), so rows
# without those values are removed too — confirm whether `df` was the intended target.
data_for_model.dropna(inplace=True)
data_for_model['Exon_Number'] = data_for_model['Exon_Number'].str.split('/').str[0].astype(int)

In [None]:
# Step 1: Prepare your categorical features
categorical_features = ['Sex', 'VAR_TYPE_SX', 'Smoke Status', 'Hugo_Symbol', 'Variant_Type', 'SNP_event', 'Consequence', 'Chromosome']

# Step 2: Collapse rare values of high-cardinality columns (e.g. Codons, SNP_event)
def prepare_data(df, columns, top_n=100):
    """
    Add a '<column>_grouped' column for each requested column.

    The `top_n` most frequent values of a column are kept as they are; every other
    value (including missing ones, which value_counts does not count) becomes 'Other'.

    Parameters:
    - df: DataFrame to extend. It is modified in place.
    - columns: iterable of column names to group.
    - top_n: number of most frequent values to keep per column (default 100).

    Returns:
    - the same DataFrame object, with the new columns added.
    """
    for column in columns:
        # A set gives constant-time membership tests instead of scanning a list per row.
        frequent_values = set(df[column].value_counts().nlargest(top_n).index)
        df[f'{column}_grouped'] = df[column].apply(
            lambda x: x if x in frequent_values else 'Other'
        )

    return df

# Step 3: Label-encode categorical columns
def encode_features(df, categorical_cols):
    """
    Label-encode the given columns, plus 'SNP_event_grouped' when it is present.

    For every listed column that exists in `df`, a '<column>_encoded' column is added
    (the frame is modified in place). Columns that are not in `df` are ignored.

    Returns:
    - df: the same DataFrame with the encoded columns added.
    - label_encoders: {column: fitted LabelEncoder}.
    - feature_values: {column: {encoded code: original value}} for interpretation.
    """
    label_encoders = {}
    feature_values = {}

    candidates = categorical_cols + ['SNP_event_grouped']
    for col in candidates:
        if col not in df.columns:
            continue

        encoder = LabelEncoder()
        df[col + '_encoded'] = encoder.fit_transform(df[col])
        label_encoders[col] = encoder

        # Map each code back to the value it stands for.
        codes = encoder.transform(encoder.classes_)
        feature_values[col] = dict(zip(codes, encoder.classes_))

    return df, label_encoders, feature_values

def extract_rules(clf, feature_names, class_names, feature_values):
    """
    Convert every root-to-leaf path of a fitted decision tree into a readable rule.

    Parameters:
    - clf: fitted tree classifier exposing ``tree_`` (feature, threshold,
      children_left, children_right and value arrays).
    - feature_names: feature names, indexed by the feature indices in ``tree_.feature``.
    - class_names: class labels, indexed by the argmax position of a leaf's
      ``tree_.value`` row. They must be in the same order as the columns of
      ``tree_.value`` (``clf.classes_`` for scikit-learn classifiers).
    - feature_values: {original column name: {encoded code: original value}} for
      label-encoded features, i.e. features named "<original>_encoded".

    Returns:
    - list of str, one "If <cond> AND <cond>, THEN cancer type is <class>" sentence per
      leaf that has at least one split above it. Conditions on the same feature are
      consolidated (numeric ranges intersected, category sets intersected).
    """
    tree_ = clf.tree_
    undefined = tree._tree.TREE_UNDEFINED

    feature_name = [
        feature_names[i] if i != undefined else "undefined!"
        for i in tree_.feature
    ]

    paths = []

    def describe_categories(original_name, values):
        """Readable condition for one side of a categorical split."""
        # The grouped SNP column gets a dedicated wording when the side holds 'Other'.
        if original_name == 'SNP_event_grouped' and 'Other' in values:
            return "SNP event is among the less common types"
        if len(values) <= 3:
            return f"{original_name} is {' or '.join(map(str, values))}"
        return f"{original_name} is in a group of {len(values)} values"

    def recurse(node, path, paths):
        if tree_.feature[node] == undefined:
            # Leaf: record the conditions collected so far and the majority class.
            class_idx = np.argmax(tree_.value[node][0])
            paths.append((path.copy(), class_names[class_idx]))
            return

        name = feature_name[node]
        threshold = tree_.threshold[node]

        # Every condition is a 4-tuple: (feature, kind, value_info, readable text).
        if name.startswith('chr_') and threshold <= 0.5:
            # Chromosome dummy variable: <= 0.5 means "not on this chromosome",
            # > 0.5 means "on this chromosome". Splits on a chr_ column with any other
            # threshold are handled by the generic branches below instead of being dropped.
            chr_num = name.split('_')[1]
            left = (name, "chromosome", {"excluded": chr_num}, f"Chromosome is not {chr_num}")
            right = (name, "chromosome", {"included": chr_num}, f"Chromosome is {chr_num}")
        elif name.endswith('_encoded'):
            original_name = name.replace('_encoded', '')

            if original_name in feature_values:
                # Label-encoded feature with a known code -> value mapping.
                mapping = feature_values[original_name]
                left_values = [mapping[i] for i in range(len(mapping)) if i <= threshold]
                right_values = [mapping[i] for i in range(len(mapping)) if i > threshold]

                left = (original_name, "categorical", set(left_values),
                        describe_categories(original_name, left_values))
                right = (original_name, "categorical", set(right_values),
                         describe_categories(original_name, right_values))
            else:
                # Encoded feature without a mapping: fall back to the raw code threshold.
                left = (original_name, "categorical", {f"≤ category {threshold:.0f}"},
                        f"{original_name} ≤ category {threshold:.0f}")
                right = (original_name, "categorical", {f"> category {threshold:.0f}"},
                         f"{original_name} > category {threshold:.0f}")
        else:
            # Numerical feature.
            left = (name, "numerical", {"min": float("-inf"), "max": threshold},
                    f"{name} ≤ {threshold:.2f}")
            right = (name, "numerical", {"min": threshold, "max": float("inf")},
                     f"{name} > {threshold:.2f}")

        # Always descend into both children so that no leaf is lost.
        for child, condition in ((tree_.children_left[node], left),
                                 (tree_.children_right[node], right)):
            path.append(condition)
            recurse(child, path, paths)
            path.pop()

    recurse(0, [], paths)

    # Generate human-readable sentences with consolidated features
    rules = []
    for path, outcome in paths:
        if path:
            # Group the conditions of this path by feature name
            feature_groups = {}
            for condition in path:
                feature, cond_type, value_info, readable = condition
                if feature not in feature_groups:
                    feature_groups[feature] = []
                feature_groups[feature].append((cond_type, value_info, readable))

            # Separate chromosome conditions from all other features
            chromosomes_included = []
            chromosomes_excluded = []
            other_feature_groups = {}

            for feature, conditions in feature_groups.items():
                if any(c[0] == "chromosome" for c in conditions):
                    for cond_type, value_info, _ in conditions:
                        if "included" in value_info:
                            chromosomes_included.append(value_info["included"])
                        if "excluded" in value_info:
                            chromosomes_excluded.append(value_info["excluded"])
                else:
                    other_feature_groups[feature] = conditions

            # Create consolidated conditions
            consolidated_conditions = []

            # Add chromosome conditions
            if chromosomes_included:
                if len(chromosomes_included) == 1:
                    consolidated_conditions.append(f"Chromosome is {chromosomes_included[0]}")
                else:
                    consolidated_conditions.append(f"Chromosome is one of {', '.join(chromosomes_included)}")

            if chromosomes_excluded:
                # Longer exclusion lists are left out of the sentence.
                if len(chromosomes_excluded) <= 3:
                    consolidated_conditions.append(f"Chromosome is not {', '.join(chromosomes_excluded)}")

            # Process other features
            for feature, conditions in other_feature_groups.items():
                if all(c[0] == "numerical" for c in conditions):
                    # Intersect all intervals seen for this feature along the path
                    min_val = float("-inf")
                    max_val = float("inf")

                    for _, value_info, _ in conditions:
                        min_val = max(min_val, value_info.get("min", float("-inf")))
                        max_val = min(max_val, value_info.get("max", float("inf")))

                    if min_val > float("-inf") and max_val < float("inf"):
                        consolidated_conditions.append(f"{feature} is between {min_val:.2f} and {max_val:.2f}")
                    elif min_val > float("-inf"):
                        consolidated_conditions.append(f"{feature} > {min_val:.2f}")
                    elif max_val < float("inf"):
                        consolidated_conditions.append(f"{feature} ≤ {max_val:.2f}")

                elif all(c[0] == "categorical" for c in conditions):
                    # Intersect the allowed value sets; the intersection is the stricter condition
                    value_sets = [c[1] for c in conditions]
                    common_values = set.intersection(*value_sets) if value_sets else set()

                    if common_values:
                        if len(common_values) <= 3:
                            consolidated_conditions.append(f"{feature} is {' or '.join(map(str, common_values))}")
                        else:
                            consolidated_conditions.append(f"{feature} is in a group of {len(common_values)} values")
                    else:
                        # No common value (e.g. unmapped code thresholds): list each condition
                        for _, _, readable in conditions:
                            consolidated_conditions.append(readable)

                else:
                    # Mixed condition kinds under one feature name: keep each condition
                    # verbatim rather than dropping the feature from the rule.
                    for _, _, readable in conditions:
                        consolidated_conditions.append(readable)

            # Create the final rule
            rule = "If " + " AND ".join(consolidated_conditions) + f", THEN cancer type is {outcome}"
            rules.append(rule)

    return rules

In [None]:
df = df_rules.copy()
df.head()

In [None]:
# d = prepare_data(df)
df, label_encoders, feature_values = encode_features(df, ['Sex', 'VAR_TYPE_SX'])
df.head()

In [None]:
# Train a decision tree on the encoded frame and turn its paths into readable rules.
features_to_drop = ['Cancer Type', 'Cancer Type Detailed', 'Tumor Stage', 'Sample Type', 'Sex', 'VAR_TYPE_SX']
y = df['Cancer Type']
X = df.drop(features_to_drop, axis=1)
X_train, X_test, y_train, y_test, X_test_with_id = stratified_split_by_patient(X, y)
feature_names = list(X_train.columns)
clf = tree.DecisionTreeClassifier(random_state=39)
clf.fit(X_train, y_train)
# extract_rules picks a leaf's class by argmax position in tree_.value, whose columns follow
# clf.classes_ (sorted labels). Using df['Cancer Type'].unique() here would list the labels in
# order of first appearance and could attach the wrong cancer type to a rule.
class_names = list(clf.classes_)
sentences = extract_rules(clf, feature_names, class_names, feature_values)

sentences

In [None]:
feature_names

In [None]:
data_for_model.head()

In [None]:
df = prepare_data(data_for_model, ['SNP_event', 'Codons',])

In [None]:
df = pd.read_csv('models_hypotheses/combined_hypotheses.csv')
df['plausibility'] = None
df['novelty'] = None
df['comments'] = None

In [None]:
df[df['rank'] <= 10].to_excel('models_hypotheses/hypotheses_for_professional_evaluation.xlsx', index=False)

In [None]:
df['Tumor Stage'].value_counts()

In [None]:
# Keep rows whose tumor stage is one of I-IV. `Series in list` does not test membership
# element-wise (it raises "truth value of a Series is ambiguous"); `.isin` does.
data = df[df['Tumor Stage'].isin(['III', 'IV', 'II', 'I'])]

In [None]:
def get_latest_csv(directory):
    """Return the path of the most recently modified ``*.csv`` file in ``directory``.

    The chosen path is also printed. Only files directly inside ``directory``
    are considered, and "latest" is decided by modification time.

    Raises
    ------
    FileNotFoundError
        If the directory contains no ``*.csv`` file.
    """
    csv_pattern = os.path.join(directory, '*.csv')
    csv_paths = glob.glob(csv_pattern)

    if len(csv_paths) == 0:
        raise FileNotFoundError("No CSV files found in the directory.")

    newest_path = max(csv_paths, key=os.path.getmtime)
    print(f"Using file: {newest_path}")
    return newest_path

In [None]:
# Load the most recently modified CSV found in the "llm_results" directory into df.
latest_csv_path = get_latest_csv("llm_results")
df = pd.read_csv(latest_csv_path)

In [None]:
# df = pd.read_csv('llm_results/evaluations_20250409_151350.csv')

In [None]:
# Load the combined hypotheses and expose their id column as 'hypothesis_id',
# the key name used when joining against df.
combine_hyp = pd.read_csv('models_hypotheses/combined_hypotheses.csv').rename(
    columns={'hypo_id': 'hypothesis_id'}
)

In [None]:
# Cast the join key to str in both frames so the keys compare equal even if the
# two CSV files were parsed into different dtypes (e.g. int in one, str in the other).
df['hypothesis_id'] = df['hypothesis_id'].astype(str)
combine_hyp['hypothesis_id'] = combine_hyp['hypothesis_id'].astype(str)

# Left join (DataFrame.join's default): every row of df is kept and gains the
# combine_hyp columns of the row with the matching 'hypothesis_id'.
merged_df = df.join(combine_hyp.set_index('hypothesis_id'), on='hypothesis_id')

In [None]:
# Sort in place by novelty, then plausibility, both descending (highest scores first).
merged_df.sort_values(by=['novelty', 'plausibility'], ascending=[False, False], inplace=True)

In [None]:
# Keep hypotheses scoring at least 6 on both novelty and plausibility, with at
# least one of the two scores strictly above 6; the 'timestamp' column is removed.
both_at_least_six = (merged_df['novelty'] >= 6) & (merged_df['plausibility'] >= 6)
one_above_six = (merged_df['novelty'] > 6) | (merged_df['plausibility'] > 6)
data = merged_df[both_at_least_six & one_above_six].drop(columns=['timestamp'])

In [None]:
# Show the rows where novelty and plausibility received the same score.
merged_df[(merged_df['novelty'] == merged_df['plausibility'])]
# Alternative view (disabled): merged_df[merged_df["model"] == "openai:o3-mini"]

In [None]:
# Write the filtered frame `data` to an Excel file, without the index column.
data.to_excel('models_hypotheses/hypotheses_for_professional2.xlsx', index=False)

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import shap
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_data(filepath):
    """Read the modelling table from a CSV file and split it into features and labels.

    Parameters
    ----------
    filepath : str or path-like
        CSV file readable by ``pandas.read_csv``. It must contain the
        'Cancer Type' column and every column listed in ``features_to_drop``.

    Returns
    -------
    X : pandas.DataFrame
        The remaining columns without the label, restricted to rows that have
        no missing value once the ``features_to_drop`` columns are removed.
    y : numpy.ndarray
        Integer codes of 'Cancer Type' as produced by ``pandas.factorize``
        (codes are assigned in order of first appearance among the kept rows).

    Raises
    ------
    KeyError
        If one of the columns to drop, or the label column, is missing.
    """
    features_to_drop = ['Cancer Type Detailed', 'Tumor Stage', 'Sample Type', "Site2_Hugo_Symbol", "Site1_Hugo_Symbol", "Event_Info"]
    label = 'Cancer Type'

    # Drop the unused columns first so that missing values in them do not
    # cause rows to be discarded by dropna().
    data = (
        pd.read_csv(filepath)
        .drop(columns=features_to_drop)
        .dropna()
    )

    # Separate features and labels
    X = data.drop(columns=label)
    y, _ = pd.factorize(data[label])
    return X, y


In [None]:
def stratified_split_by_patient(X, y, train_ratio=0.7, test_ratio=0.3):
    """
    Split rows into train and test sets so that no patient ends up in both.

    The unique ``PATIENT_ID`` values (not the rows) are divided with a
    stratified ``train_test_split`` using ``random_state=42``; every row then
    follows its patient.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing a 'PATIENT_ID' column.
    y : pandas.Series or numpy.ndarray
        Labels aligned row-by-row (by position) with ``X``.
    train_ratio : float
        Share of patients for training. Only used to validate that the two
        ratios sum to 1; the split size itself is driven by ``test_ratio``.
    test_ratio : float
        Share of patients for testing (passed as ``test_size``).

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature rows without the 'PATIENT_ID' column.
    y_train, y_test :
        The matching entries of ``y``.
    X_test_with_id : pandas.DataFrame
        The test rows with 'PATIENT_ID' kept, for patient-level analysis.

    Raises
    ------
    AssertionError
        If ``train_ratio + test_ratio`` is not 1 (within floating-point tolerance).
    """
    # Compare with a tolerance: exact float equality rejects ratio pairs whose
    # sum differs from 1 only by rounding error.
    assert abs(train_ratio + test_ratio - 1) < 1e-9, "Ratios must sum to 1."

    patient_ids = X['PATIENT_ID']
    unique_ids = patient_ids.unique()

    # One label per patient for stratification. dict() keeps the LAST label seen
    # for a patient; this only matters if a patient has rows with different labels.
    patient_labels = dict(zip(patient_ids, y))
    unique_patient_labels = [patient_labels[pid] for pid in unique_ids]

    # Split the patients, not the rows
    train_ids, test_ids = train_test_split(
        unique_ids,
        test_size=test_ratio,
        stratify=unique_patient_labels,
        random_state=42
    )

    # Positional boolean masks, computed once. Using plain arrays keeps the row
    # selection of y positional, matching how X and y were paired above.
    train_mask = patient_ids.isin(train_ids).to_numpy()
    test_mask = patient_ids.isin(test_ids).to_numpy()

    X_test_with_id = X[test_mask]  # test rows, PATIENT_ID kept
    X_train = X[train_mask].drop(columns=['PATIENT_ID'])
    X_test = X_test_with_id.drop(columns=['PATIENT_ID'])

    y_train = y[train_mask]
    y_test = y[test_mask]

    return X_train, X_test, y_train, y_test, X_test_with_id

In [None]:
def analyze_feature_combinations_for_cancer(model, X, y, cancer_type, cancer_names,
                                           top_n=5, cat_features=None,
                                           interaction_depth=2):
    """
    Analyze features and categorical feature pairs that set one cancer type apart.

    The function only prints and plots; it returns nothing. It produces:

    1. A feature-importance ranking for ``cancer_type`` based on SHAP values,
       falling back to the model's global CatBoost importance if SHAP fails.
    2. For the first 10 analysis features (most important first), the
       distribution inside this cancer type versus all other samples.
    3. For pairs of categorical features, the value combinations with the
       highest lift (share within this cancer type / share within the others).

    Parameters:
    -----------
    model : CatBoostClassifier
        Trained model
    X : DataFrame
        Feature data. Importance scores are matched to ``X.columns`` by
        position, so the columns must be the model's features in training order.
    y : array-like
        Encoded target labels, aligned row-by-row with ``X``
    cancer_type : int
        The specific (encoded) cancer type to analyze; also used as the index
        into ``cancer_names`` and as the class index of the SHAP values
    cancer_names : list
        List of cancer type names corresponding to encoded values
    top_n : int
        Number of top combinations to return
    cat_features : list
        List of categorical feature names
    interaction_depth : int
        Currently unused: only pairwise combinations are analyzed. Kept so
        existing callers keep working.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import shap

    # Normalize once so the rest of the function can treat it as a plain list
    cat_features = [] if cat_features is None else list(cat_features)
    categorical = set(cat_features)

    # Get actual cancer name
    cancer_name = cancer_names[cancer_type]

    # Samples of this cancer type vs. samples of every other type
    X_cancer = X.iloc[np.where(y == cancer_type)[0]]
    X_other = X.iloc[np.where(y != cancer_type)[0]]

    print(f"\n===== ANALYZING CANCER TYPE: {cancer_name} =====")
    print(f"Number of samples: {len(X_cancer)}")

    feature_names = X.columns

    # Prefer class-specific SHAP importance; any failure falls back to the
    # model's global importance below (deliberate best-effort).
    try:
        print("\nCalculating SHAP values for feature importance...")
        explainer = shap.TreeExplainer(model)

        # Sample for efficiency if dataset is large
        sample_size = min(len(X), 1000)
        X_sample = X.sample(sample_size, random_state=42)
        shap_values = explainer.shap_values(X_sample)

        # Depending on the installed SHAP version, a multi-class explanation is
        # either a list with one (samples, features) array per class or a single
        # 3-D array. Select the slice for this class based on the actual layout.
        if isinstance(shap_values, list):
            cancer_shap = np.asarray(shap_values[cancer_type])
        else:
            shap_array = np.asarray(shap_values)
            if shap_array.ndim == 3 and shap_array.shape[:2] == X_sample.shape:
                cancer_shap = shap_array[:, :, cancer_type]  # (samples, features, classes)
            elif shap_array.ndim == 3 and shap_array.shape[1:] == X_sample.shape:
                cancer_shap = shap_array[cancer_type]  # (classes, samples, features)
            else:
                raise ValueError(
                    f"Unexpected SHAP output shape {shap_array.shape} for data of shape {X_sample.shape}"
                )

        # Mean absolute SHAP value per feature, most important first
        feature_importance = np.abs(cancer_shap).mean(0)
        sorted_idx = np.argsort(-feature_importance)

        # Plot SHAP values
        plt.figure(figsize=(12, 10))
        shap.summary_plot(cancer_shap, X_sample, plot_type="bar", show=False)
        plt.title(f'SHAP Feature Importance for {cancer_name}')
        plt.tight_layout()
        plt.show()

        # List top important features
        print(f"\nTop features for {cancer_name}:")
        for idx in sorted_idx[:10]:
            print(f"{feature_names[idx]}: {feature_importance[idx]:.4f}")

    except Exception as e:
        print(f"SHAP analysis error: {e}")
        print("Falling back to CatBoost feature importance")

        # Global (not class-specific) importance from the model itself
        feature_importance = model.get_feature_importance()
        sorted_idx = np.argsort(-feature_importance)

        # Plot feature importance
        n_bars = min(15, len(sorted_idx))
        plt.figure(figsize=(12, 10))
        plt.barh(range(n_bars), feature_importance[sorted_idx[:15]])
        plt.yticks(range(n_bars), [feature_names[i] for i in sorted_idx[:15]])
        plt.title(f'Feature Importance for {cancer_name}')
        plt.tight_layout()
        plt.show()

    # Top 15 features feed the distribution and interaction analysis
    top_features = [feature_names[i] for i in sorted_idx[:15]]

    # Analyze feature distributions for this cancer type vs others
    print("\nAnalyzing feature distributions...")

    # Top features first, then any categorical feature not already included.
    # dict.fromkeys removes duplicates while KEEPING this order, so the [:10]
    # slice below really is the most important features and is reproducible
    # between runs (a set would shuffle them).
    analysis_features = list(dict.fromkeys(top_features + cat_features))

    # Analyze individual feature distributions
    for feature in analysis_features[:10]:  # Limit to top 10 for clarity
        if feature in categorical:
            # Categorical feature analysis
            cancer_dist = X_cancer[feature].value_counts(normalize=True)
            other_dist = X_other[feature].value_counts(normalize=True)

            # Compute lift ratio (how much more likely in this cancer type);
            # a zero share among the others is replaced by 0.001 to avoid dividing by zero
            lift = pd.DataFrame({
                'Cancer': cancer_dist,
                'Other': other_dist
            }).fillna(0)
            lift['Lift'] = lift['Cancer'] / lift['Other'].replace(0, 0.001)
            lift = lift.sort_values('Lift', ascending=False)

            print(f"\nFeature: {feature}")
            print("Top values by lift ratio:")
            print(lift[['Cancer', 'Other', 'Lift']].head(3))

            # Plot distribution comparison
            plt.figure(figsize=(12, 6))

            # Union of the 5 most frequent categories on each side, in a stable
            # order (sorted by their text form so mixed value types still sort)
            top_cats = sorted(
                set(cancer_dist.nlargest(5).index) | set(other_dist.nlargest(5).index),
                key=str,
            )

            # Filter both distributions to these categories
            plot_data = pd.DataFrame({
                f'{cancer_name}': cancer_dist.reindex(top_cats).fillna(0),
                'Other Cancer Types': other_dist.reindex(top_cats).fillna(0)
            })

            plot_data.plot(kind='bar', ax=plt.gca())
            plt.title(f'Distribution of {feature} ({cancer_name} vs Others)')
            plt.ylabel('Frequency')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()
        else:
            is_numeric = pd.api.types.is_numeric_dtype(X[feature])
            plt.figure(figsize=(12, 6))

            if is_numeric:
                # KDE plot for numeric data
                sns.kdeplot(X_cancer[feature].astype(float), label=f'{cancer_name}')
                sns.kdeplot(X_other[feature].astype(float), label='Other Cancer Types')
                plt.title(f'Distribution of {feature} ({cancer_name} vs Others)')
                plt.xlabel(feature)
                plt.ylabel('Density')
                plt.legend()
            else:
                # For non-numeric data, use countplot instead of kdeplot
                plot_data = pd.DataFrame({
                    'value': pd.concat([X_cancer[feature], X_other[feature]]),
                    'group': (['This Cancer'] * len(X_cancer)) + (['Other Cancers'] * len(X_other))
                })
                sns.countplot(x='value', hue='group', data=plot_data)
                plt.title(f'Distribution of {feature} ({cancer_name} vs Others)')
                plt.xlabel(feature)
                plt.ylabel('Count')
                plt.xticks(rotation=45)

            plt.tight_layout()
            plt.show()

            # Calculate statistics for numeric features
            if is_numeric:
                cancer_mean = X_cancer[feature].mean()
                other_mean = X_other[feature].mean()
                mean_diff_pct = ((cancer_mean - other_mean) / other_mean * 100
                                if other_mean != 0 else float('inf'))

                print(f"\nFeature: {feature}")
                print(f"Mean for {cancer_name}: {cancer_mean:.4f}")
                print(f"Mean for Other Cancer Types: {other_mean:.4f}")
                print(f"Difference: {mean_diff_pct:.2f}%")

    # Feature interaction analysis (pairs of categorical features)
    if len(cat_features) >= 2:
        print("\nAnalyzing categorical feature interactions...")

        # At most 5 categorical features, in analysis order
        cat_features_for_analysis = [f for f in analysis_features if f in categorical][:5]

        if len(cat_features_for_analysis) >= 2:
            feature_combinations = []

            # Analyze pairwise combinations
            for i, feat1 in enumerate(cat_features_for_analysis):
                for feat2 in cat_features_for_analysis[i + 1:]:
                    # Counts and percentage of each value pair within this cancer type
                    combo_cancer = X_cancer.groupby([feat1, feat2]).size().reset_index()
                    combo_cancer.columns = [feat1, feat2, 'cancer_count']
                    combo_cancer['cancer_pct'] = combo_cancer['cancer_count'] / len(X_cancer) * 100

                    # Same for all other cancer types
                    combo_other = X_other.groupby([feat1, feat2]).size().reset_index()
                    combo_other.columns = [feat1, feat2, 'other_count']
                    combo_other['other_pct'] = combo_other['other_count'] / len(X_other) * 100

                    # Merge and calculate lift; pairs never seen among the others
                    # get a 0.001% share so the ratio stays finite
                    combo_merged = pd.merge(combo_cancer, combo_other, on=[feat1, feat2], how='left')
                    combo_merged = combo_merged.fillna({'other_count': 0, 'other_pct': 0.001})
                    combo_merged['lift'] = combo_merged['cancer_pct'] / combo_merged['other_pct']

                    # Keep only pairs with enough samples (at least 5), highest lift first
                    combo_merged = combo_merged[combo_merged['cancer_count'] >= 5]
                    combo_merged = combo_merged.sort_values('lift', ascending=False)

                    # Add the top 3 pairs of this feature pair to the results
                    for _, row in combo_merged.head(3).iterrows():
                        feature_combinations.append({
                            'Feature1': feat1,
                            'Value1': row[feat1],
                            'Feature2': feat2,
                            'Value2': row[feat2],
                            'Cancer_Count': row['cancer_count'],
                            'Cancer_Pct': row['cancer_pct'],
                            'Other_Pct': row['other_pct'],
                            'Lift': row['lift']
                        })

            # Sort all combinations by lift and display top results
            if feature_combinations:
                combinations_df = pd.DataFrame(feature_combinations)
                combinations_df = combinations_df.sort_values('Lift', ascending=False)
                top_combinations = combinations_df.head(top_n)

                print(f"\nTop feature combinations for {cancer_name}:")
                pd.set_option('display.max_colwidth', 30)
                print(top_combinations)

                # Plot top combinations
                plt.figure(figsize=(14, 8))
                bars = plt.barh(range(len(top_combinations)),
                                top_combinations['Lift'],
                                color='skyblue')
                plt.yticks(range(len(top_combinations)),
                          [f"{row['Feature1']}={row['Value1']}, {row['Feature2']}={row['Value2']}"
                           for _, row in top_combinations.iterrows()])
                plt.xlabel('Lift (How much more common in this cancer type)')
                plt.title(f'Top Feature Combinations for {cancer_name}')

                # Add value labels
                for i, bar in enumerate(bars):
                    plt.text(bar.get_width() + 0.2,
                            bar.get_y() + bar.get_height()/2,
                            f"{top_combinations.iloc[i]['Lift']:.2f}x",
                            va='center')

                plt.tight_layout()
                plt.show()
            else:
                print("No significant feature combinations found.")
        else:
            print("Not enough categorical features for combination analysis.")

In [None]:
def main():
    """Train a CatBoost multi-class model and run the per-cancer feature analysis.

    Reads "narrowed_cancers_data.csv", drops unused columns and incomplete rows,
    splits by patient, trains the classifier, prints a classification report for
    the test set and then calls analyze_feature_combinations_for_cancer once per
    cancer type. Returns nothing.
    """
    # Import necessary libraries
    import pandas as pd
    import numpy as np
    from catboost import CatBoostClassifier, Pool
    from sklearn.metrics import classification_report

    # Load your data here
    features_to_drop = ['Cancer Type Detailed', 'Tumor Stage', 'Sample Type', "Site2_Hugo_Symbol", "Site1_Hugo_Symbol", "Event_Info"]
    label = 'Cancer Type'
    data = pd.read_csv("narrowed_cancers_data.csv")
    data = data.drop(columns=features_to_drop).dropna()
    X = data.drop(columns=label)

    # Get cancer names and encoded values
    y, cancer_names = pd.factorize(data[label])
    cancer_names = cancer_names.tolist()  # Convert to list for indexing

    # Split data (PATIENT_ID is removed from X_train / X_test by the helper)
    X_train, X_test, y_train, y_test, X_test_with_id = stratified_split_by_patient(X, y)

    # One list of categorical columns, shared by training, prediction and analysis
    cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    # Create CatBoost Pool with categorical features identified
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    test_pool = Pool(X_test, y_test, cat_features=cat_features)

    # Initialize and train CatBoost model
    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        loss_function='MultiClass',
        random_seed=42,
        verbose=100
    )

    model.fit(train_pool)

    # Evaluate model performance
    print("\nModel evaluation:")
    predictions = np.ravel(model.predict(test_pool))  # flatten to a 1-D label vector

    print("\nClassification Report:")
    # Passing `labels` keeps target_names aligned with the encoded classes even
    # if some cancer type has no sample in the test split.
    print(classification_report(
        y_test,
        predictions,
        labels=list(range(len(cancer_names))),
        target_names=cancer_names,
    ))

    # ====== FEATURE COMBINATION ANALYSIS FOR EACH CANCER TYPE ======
    print("Analyzing feature combinations for each cancer type...")

    # The model was trained without PATIENT_ID, so the analysis must see the same
    # columns; otherwise importance scores are attached to the wrong feature names.
    X_features = X.drop(columns=['PATIENT_ID'])

    for cancer_type in np.unique(y):
        analyze_feature_combinations_for_cancer(
            model, X_features, y, cancer_type, cancer_names,
            top_n=5, cat_features=cat_features
        )

    print("Analysis complete!")

In [None]:
# Run the whole pipeline when this cell/script is executed directly
# (the guard keeps main() from running if the code is imported as a module).
if __name__ == "__main__":
    main()

In [None]:
# Load features and encoded labels, split them by patient, and collect the
# object-dtype columns of the training set (used as categorical features below).
X, y = load_data("narrowed_cancers_data.csv")
X_train, X_test, y_train, y_test, X_test_with_id = stratified_split_by_patient(X, y)
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
print(f"\nCategorical columns: {categorical_columns}")

In [None]:

# Create CatBoost Pool with categorical features identified
# NOTE(review): test_pool is built here but not used within this cell.
train_pool = Pool(X_train, y_train, cat_features=categorical_columns)
test_pool = Pool(X_test, y_test, cat_features=categorical_columns)

# Initialize and train CatBoost model
# (multi-class objective, fixed seed for reproducibility)
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    random_seed=42,
    verbose=100
)

# Fit on the training pool only (no evaluation set is passed)
model.fit(train_pool)

In [None]:

# Evaluate model performance on the held-out test rows
print("\nModel evaluation:")
predictions = model.predict(X_test)
# Class probabilities; not used further within the visible part of this cell
pred_proba = model.predict_proba(X_test)

# Per-class precision / recall / F1; classes are shown by their encoded label
# because no target_names are passed.
print("\nClassification Report:")
print(classification_report(y_test, predictions))