#  DataBase contruction

In [53]:
import pandas as pd
import itertools
from itertools import combinations
import sns
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency

In [54]:
# reading all the data files
data_clinical_patient = pd.read_csv('pan_origimed_2020/data_clinical_patient.txt', sep="\t")
data_clinical_sample = pd.read_csv('pan_origimed_2020/data_clinical_sample.txt', sep="\t")
data_cna_log2 = pd.read_csv('pan_origimed_2020/data_cna_log2.txt', sep="\t")
data_cna = pd.read_csv('pan_origimed_2020/data_cna.txt', sep="\t")
data_mutations = pd.read_csv('pan_origimed_2020/data_mutations.txt', sep="\t", header=2, dtype={"Exon_Number": "string"})
data_sv = pd.read_csv('pan_origimed_2020/data_sv.txt', sep="\t")

In [55]:
# removing bad rows
data_clinical_sample = data_clinical_sample[4:]
data_clinical_patient = data_clinical_patient[4:]

In [56]:
data_clinical_patient = data_clinical_patient.rename(columns={'#Patient Identifier': 'PATIENT_ID'})

In [57]:
data_clinical_patient.head()

Unnamed: 0,PATIENT_ID,Sex,Diagnosis Age,Smoke Status,Treatment
4,Patient0001,Female,67,Unknown,Other_Treatments
5,Patient0002,Male,75,Unknown,Treatment-naive
6,Patient0003,Female,45,Unknown,Treatment-naive
7,Patient0004,Male,70,Unknown,Treatment-naive
8,Patient0005,Male,53,Unknown,Treatment-naive


In [58]:
# matching the sample id to match other tables
data_clinical_patient["SAMPLE_ID"] = data_clinical_patient["PATIENT_ID"].apply(lambda x: "P-" + x[7:])

In [59]:
# make all sample id header name the same - "SAMPLE_ID"
data_clinical_sample.rename(columns={"Sample Identifier": 'SAMPLE_ID'}, inplace=True)
data_mutations.rename(columns={"Tumor_Sample_Barcode": 'SAMPLE_ID'}, inplace=True)
data_sv.rename(columns={"Sample_Id": 'SAMPLE_ID'}, inplace=True)

In [60]:
# merge everything
merged_clinical_data = data_clinical_patient.merge(data_clinical_sample, on="SAMPLE_ID", how='outer')
merged_mutations_data = merged_clinical_data.merge(data_mutations, on="SAMPLE_ID", how='outer')
merged_all_data = merged_mutations_data.merge(data_sv, on="SAMPLE_ID", how='outer')

In [61]:
merged_all_data["SNP_event"] = merged_all_data["Reference_Allele"].fillna("").astype(str) + ">" + merged_all_data["Tumor_Seq_Allele2"].fillna("").astype(str)


In [62]:
data_for_model = merged_all_data[["PATIENT_ID", "Cancer Type", 'Cancer Type Detailed', 'Tumor Stage',
                                'Sample Type', "Sex", "Diagnosis Age", "Smoke Status", "TMB (nonsynonymous)",
                                "Hugo_Symbol", "Chromosome", "Start_Position", "End_Position",
                                "Consequence", "Variant_Type", "SNP_event", "Protein_position", "Codons",
                                "Exon_Number","VAR_TYPE_SX", "Site1_Hugo_Symbol", "Site2_Hugo_Symbol","Event_Info"]]

In [63]:
data_for_model.shape

(105906, 23)

In [64]:
data_for_model.head(20)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,5-Feb,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,3-Mar,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,16-Jul,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,QKI,...,missense_variant,SNP,G>A,47.0,Gaa/Aaa,8-Jan,Substitution/Indel,,,
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,FBXW7,...,missense_variant,SNP,G>A,505.0,Cgc/Tgc,12-Oct,Substitution/Indel,,,
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,PTPN11,...,intron_variant,DEL,TTTC>-,,,,Substitution/Indel,,,
8,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,GLI1,...,missense_variant,SNP,G>T,488.0,Gct/Tct,12-Nov,Substitution/Indel,,,
9,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,ATM,...,"splice_region_variant,intron_variant",SNP,G>A,1026.0,,,Substitution/Indel,,,


In [65]:
# Function to handle the conversion
def convert_exon_number(val):
    try:
        # First, try to convert to 'Month-Year' format (e.g., 'Sep-89' -> '09/89')
        return pd.to_datetime(val, format='%b-%y').strftime('%m/%y')
    except ValueError:
        pass

    try:
        # Then, try to convert to 'DD-Mon' format (e.g., '14-Sep' -> '09/14')
        date_obj = pd.to_datetime(val, format='%d-%b', errors='raise')
        return date_obj.strftime('%m/%d')
    except ValueError:
        # If neither format matches, return the value as is (non-date-like string)
        return val

In [66]:
# Apply the function to the column
data_for_model.loc[:, 'Exon_Number'] = data_for_model['Exon_Number'].apply(convert_exon_number)

In [67]:
data_for_model.head(10)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,02/05,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,03/03,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,07/16,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,QKI,...,missense_variant,SNP,G>A,47.0,Gaa/Aaa,01/08,Substitution/Indel,,,
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,FBXW7,...,missense_variant,SNP,G>A,505.0,Cgc/Tgc,10/12,Substitution/Indel,,,
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,PTPN11,...,intron_variant,DEL,TTTC>-,,,,Substitution/Indel,,,
8,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,GLI1,...,missense_variant,SNP,G>T,488.0,Gct/Tct,11/12,Substitution/Indel,,,
9,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,ATM,...,"splice_region_variant,intron_variant",SNP,G>A,1026.0,,,Substitution/Indel,,,


In [68]:
data_for_model["Cancer Type"].value_counts()

Cancer Type
Colorectal Carcinoma                     28396
Non Small Cell Lung Cancer               19526
Gastric Cancer                           10832
Liver Hepatocellular Carcinoma            9633
Esophageal Carcinoma                      6298
Intrahepatic Cholangiocarcinoma           3939
Small Cell Lung Cancer                    3025
Pancreatic Cancer                         2957
Extrahepatic Cholangiocarcinoma           2883
Breast Carcinoma                          2350
Soft Tissue Sarcoma                       2270
Gallbladder Carcinoma                     2192
Ovarian Carcinoma                         1851
Urothelial Carcinoma                      1631
Kidney Renal Cell Carcinoma               1500
Uterine Corpus Endometrial Carcinoma      1358
Cancer of Unknown Primary                 1068
Head and Neck Carcinoma                    997
Carcinoma of Uterine Cervix                787
Small Bowel Carcinoma                      755
Bone Sarcoma                               601
G

In [69]:
def create_age_range(x):
    if x <= 10:
        return "0-10"
    elif x <= 20:
        return "11-20"
    elif x <= 30:
        return "21-30"
    elif x <= 40:
        return "31-40"
    elif x <= 50:
        return "41-50"
    elif x <= 60:
        return "51-60"
    elif x <= 70:
        return "61-70"
    elif x <= 80:
        return "71-80"
    else:
        return "80+"

In [70]:
data_for_model.loc[:, 'Diagnosis Age'] = data_for_model['Diagnosis Age'].astype(int).apply(create_age_range).astype("category")

In [71]:
# Filter cancer types with at least 2000 samples
cancer_counts = data_for_model['Cancer Type'].value_counts()
valid_cancer_types = cancer_counts[cancer_counts >= 2000].index
data_for_model = data_for_model[data_for_model['Cancer Type'].isin(valid_cancer_types)]
data_for_model = data_for_model[data_for_model['Chromosome'].notnull()]

In [72]:
data_for_lift = data_for_model.copy()

In [73]:
data_for_model['Consequence'].str.split(',')
dummy_vars = data_for_model['Consequence'].str.split(',').explode().str.get_dummies().groupby(level=0).sum()
data_for_model = data_for_model.join(dummy_vars)
data_for_model.drop('Consequence', axis=1, inplace=True)

In [74]:
data_for_model.to_csv("pan_cancer_data_for_model.csv", index=False)

In [75]:
hypo_data = pd.read_csv("hypotheses.csv")

In [76]:
hypo_data_updates = hypo_data[hypo_data["support"] > 2].sort_values(["cancer_type", 'support'], ascending=[True, False])
hypo_data_updates.head()

Unnamed: 0,cancer_type,Smoke Status,Hugo_Symbol,TMB (nonsynonymous),Start_Position,Protein_position,Sex,SNP_event,Site1_Hugo_Symbol,Diagnosis Age,...,upstream_gene_variant,Site2_Hugo_Symbol,Event_Info,Exon_Number,splice_region_variant,End_Position,5_prime_UTR_variant,missense_variant,splice_donor_variant,support
137,Bone Sarcoma,,,0.033333,,,,,,11-20,...,,,,,,,,,,8
174,Bone Sarcoma,,,0.0,,,,,,11-20,...,,,,,,,,,,6
190,Bone Sarcoma,,,0.2,,,,,,11-20,...,,,,,,,,,,6
274,Bone Sarcoma,,,0.1,,,,,,11-20,...,,,,,,,,,,3
112,Breast Carcinoma,,,,,,Female,,,41-50,...,,NCOR1,NCOR1-Intragenic,,,,,,,11


In [77]:
def cancer_type_correlations(df):
    """
    Print cancer type and non-null feature-value pairs for each row in the DataFrame.

    Parameters:
    - df: DataFrame with columns "cancer_type", feature columns, and "support".
    """
    corr_list = []
    # Iterate through each row
    for index, row in df.iterrows():
        # Extract cancer type and support
        cancer_type = row["cancer_type"]
        support = row["support"]

        # Get feature-value pairs where the feature value is not null
        features = [
            f"{feature}={row[feature]}"
            for feature in df.columns
            if feature not in {"cancer_type", "support"} and not pd.isnull(row[feature])
        ]

        # Format and print the result
        features_str = ", ".join(features)
        corr_list.append(f"{cancer_type}: {features_str}, Support: {support}")
    return corr_list


In [78]:
corr_list = cancer_type_correlations(hypo_data_updates)

In [79]:
corr_list

['Bone Sarcoma: TMB (nonsynonymous)=0.033333333, Diagnosis Age=11-20, Support: 8',
 'Bone Sarcoma: TMB (nonsynonymous)=0.0, Diagnosis Age=11-20, Support: 6',
 'Bone Sarcoma: TMB (nonsynonymous)=0.2, Diagnosis Age=11-20, Support: 6',
 'Bone Sarcoma: TMB (nonsynonymous)=0.1, Diagnosis Age=11-20, Support: 3',
 'Breast Carcinoma: Sex=Female, Diagnosis Age=41-50, Site2_Hugo_Symbol=NCOR1, Event_Info=NCOR1-Intragenic, Support: 11',
 'Breast Carcinoma: TMB (nonsynonymous)=0.333333333, Sex=Female, Site1_Hugo_Symbol=BRCA2, Site2_Hugo_Symbol=BRCA2, Event_Info=BRCA2-BRCA2, Support: 7',
 'Breast Carcinoma: Hugo_Symbol=PIK3CA, Start_Position=178952085.0, Sex=Female, Exon_Number=21/21, End_Position=178952085.0, Support: 6',
 'Breast Carcinoma: Sex=Female, Site1_Hugo_Symbol=BRCA1, Diagnosis Age=31-40, Site2_Hugo_Symbol=BRCA1, Event_Info=BRCA1-Intragenic, Support: 4',
 'Breast Carcinoma: TMB (nonsynonymous)=0.766666667, Sex=Female, Diagnosis Age=51-60, Event_Info=LRP1B-Intragenic, Support: 4',
 'Breast

Lift Calculation

In [80]:
cancer_type_dummy = data_for_lift['Cancer Type'].str.get_dummies().groupby(level=0).sum()
data_for_lift = data_for_lift.join(cancer_type_dummy)

In [81]:
data_for_lift.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Esophageal Carcinoma,Extrahepatic Cholangiocarcinoma,Gallbladder Carcinoma,Gastric Cancer,Intrahepatic Cholangiocarcinoma,Liver Hepatocellular Carcinoma,Non Small Cell Lung Cancer,Pancreatic Cancer,Small Cell Lung Cancer,Soft Tissue Sarcoma
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KMT2C,...,0,0,0,0,0,0,0,0,0,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KRAS,...,0,0,0,0,0,0,0,0,0,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,SOX9,...,0,0,0,0,0,0,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,0,0,0,0,0,0,0,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# Combine columns to create specific mutation identifiers
# data_for_lift['Mutation'] = data_for_lift['Chromosome'] + "_" + data_for_lift['Start_Position'].astype(str) + "_" + data_for_lift['Variant_Type']
data_for_lift['Position'] = data_for_lift['Start_Position'].astype(str) + "-" + data_for_lift['End_Position'].astype(str)

In [89]:
data_for_lift.to_csv("data_for_lift.csv", index=False)

In [84]:
# Select a subset of columns to analyze (e.g., most relevant ones)
columns_to_combine = ['Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event', "Consequence", 'Exon_Number',
                      "Diagnosis Age", "TMB (nonsynonymous)", "Position", "Protein_position", "Codons", "VAR_TYPE_SX"]

In [97]:
cancer_probabilities = {cancer_type: data_for_lift[cancer_type].mean() for cancer_type in list(data_for_lift["Cancer Type"].unique())}

In [86]:
cancer_probabilities

{'Breast Carcinoma': np.float64(0.024886733456703464),
 'Colorectal Carcinoma': np.float64(0.3019271265394678),
 'Esophageal Carcinoma': np.float64(0.06688575500393508),
 'Extrahepatic Cholangiocarcinoma': np.float64(0.030534107587263097),
 'Gallbladder Carcinoma': np.float64(0.023302065386169784),
 'Gastric Cancer': np.float64(0.1149362942164933),
 'Intrahepatic Cholangiocarcinoma': np.float64(0.04178631442367005),
 'Liver Hepatocellular Carcinoma': np.float64(0.1023333971454704),
 'Non Small Cell Lung Cancer': np.float64(0.20719800906132346),
 'Pancreatic Cancer': np.float64(0.03129985323208474),
 'Small Cell Lung Cancer': np.float64(0.03211877565779678),
 'Soft Tissue Sarcoma': np.float64(0.02279156828962202)}

In [87]:
# Iterate over feature combinations
# for num_features in range(2, 6):
feature_combinations = list(combinations(columns_to_combine, 5))
feature_combinations

[('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Consequence'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Exon_Number'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Diagnosis Age'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'TMB (nonsynonymous)'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Position'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Protein_position'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Codons'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'VAR_TYPE_SX'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'Consequence'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'Exon_Number'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'Diagnosis Age'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'TMB (nonsynonymous)'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'Position'),
 ('Sex', 'Smoke St

In [88]:
# Precompute the mean probabilities for cancer types

lifts = []

for cancer_type, P_B in cancer_probabilities.items():
    for feature in feature_combinations:
        # Combine the selected features into a single feature
        combined_feature = data_for_lift[list(feature)].astype(str).agg('_'.join, axis=1)

        # Compute value counts for the combined feature
        combined_counts = combined_feature.value_counts()
        valid_features = combined_counts[combined_counts >= 100].index

        if valid_features.empty:
            continue  # Skip if no valid combined features

        # Filter the combined feature to include only valid entries
        filtered_data = combined_feature[combined_feature.isin(valid_features)]
        P_A = filtered_data.value_counts(normalize=True)

        # Compute joint probabilities for cancer type
        joint_prob = (
            filtered_data[data_for_lift[cancer_type] == 1]
            .value_counts(normalize=True)
            .reindex(P_A.index, fill_value=0)
        )

        # Calculate lift
        lift = (joint_prob / (P_A * P_B)).round(2)

        # Store results
        lifts.append((cancer_type, feature, lift))


KeyboardInterrupt: 

In [446]:
lifts = []
# Probability of the cancer type
for num_features in range(2, 6):
    feature_combinations = list(combinations(columns_to_combine, num_features))
    for cancer_type in cancer_type_dummy.columns:
        P_B = data_for_lift[cancer_type].mean()
        for feature in feature_combinations:
            # Create a combined feature from three columns
            combined_feature = data_for_lift[feature[0]].astype(str)

            for f in feature[1:]:
                combined_feature += "_" + data_for_lift[f].astype(str)
            # combined_feature = "_".join(data_for_lift[feature].astype(str) for feature in feature_combinations)

            min_count = 100
            P_A_counts = combined_feature.value_counts()

            # Filter combined features based on minimum count
            valid_features = P_A_counts[P_A_counts >= min_count].index
            filtered_data = combined_feature[combined_feature.isin(valid_features)]

            # Probability of the combined feature
            P_A = filtered_data.value_counts(normalize=True)

            # Joint probability of the combined feature and cancer type
            joint = (filtered_data[data_for_lift[cancer_type] == 1].value_counts(normalize=True).reindex(P_A.index, fill_value=0))

            # Calculate lift
            lift = (joint / (P_A * P_B)).round(2)  # Round lift to 2 decimal places for readability

            # Append results as a tuple of the feature triplet and their associated lift values
            lifts.append((cancer_type, feature, lift))

KeyboardInterrupt: 

In [39]:
# Flatten the results for easy visualization
lift_results = []

for cancer_type, feature_pair, lift in lifts:
    for feature_value, lift_value in lift.items():
        lift_results.append({
            'Cancer Type': cancer_type,
            'Feature Pair': feature_pair,
            'Feature Value': feature_value,
            'Lift': lift_value
        })

lift_df = pd.DataFrame(lift_results)
lift_df = lift_df.sort_values(by='Lift', ascending=False)

In [348]:
lift_df.sort_values(by='Lift', ascending=False)

Unnamed: 0,Cancer Type,Feature Pair,Feature Value,Lift
1078,Breast Carcinoma,"(Sex, Chromosome, Exon_Number)",Female_3_21/21,1075.31
1223,Breast Carcinoma,"(Sex, Hugo_Symbol, Exon_Number)",Female_PIK3CA_21/21,1056.89
1287,Breast Carcinoma,"(Sex, SNP_event, Exon_Number)",Female_A>G_21/21,1056.07
436,Breast Carcinoma,"(Sex, Smoke Status, Exon_Number)",Female_Unknown_21/21,1000.34
24665,Small Cell Lung Cancer,"(Smoke Status, Chromosome, Hugo_Symbol)",Smoker_13_RB1,943.76
...,...,...,...,...
27943,Soft Tissue Sarcoma,"(Hugo_Symbol, SNP_event, Exon_Number)",APC_C>T_16/16,
27944,Soft Tissue Sarcoma,"(Hugo_Symbol, SNP_event, Exon_Number)",APC_G>T_16/16,
27947,Soft Tissue Sarcoma,"(Hugo_Symbol, SNP_event, Exon_Number)",EGFR_GGAATTAAGAGAAGC>-_19/28,
27948,Soft Tissue Sarcoma,"(Hugo_Symbol, SNP_event, Exon_Number)",EGFR_T>G_21/28,


In [136]:
filter_triple_data = data[data["Smoke Status"] == "Nonsmoker"]
filter_triple_data = filter_triple_data[filter_triple_data["Hugo_Symbol"] == "TP53"]
filter_triple_data = filter_triple_data[filter_triple_data["SNP_event"] == "G>A"]

In [40]:
def combine_features(data, feature_combination):
    """
    Combine selected features into a single feature by joining their values.
    """
    return data[list(feature_combination)].astype(str).agg('_'.join, axis=1)

In [41]:
def filter_and_compute_probabilities(combined_feature, data_for_lift, cancer_type, min_count):
    """
    Filter valid features, compute P(A), and joint probabilities for a cancer type.
    """
    # Step 1: Compute value counts
    combined_counts = combined_feature.value_counts()
    valid_features = combined_counts[combined_counts >= min_count].index

    # Step 2: Skip if no valid combined features
    if valid_features.empty:
        return None, None

    # Step 3: Filter the combined feature
    filtered_data = combined_feature[combined_feature.isin(valid_features)]

    # Step 4: Compute probabilities
    P_A = filtered_data.value_counts(normalize=True)
    joint_prob = (
        filtered_data[data_for_lift[cancer_type] == 1]
        .value_counts(normalize=True)
        .reindex(P_A.index, fill_value=0)
    )

    return P_A, joint_prob


In [42]:
def calculate_lifts_for_cancer_type(data_for_lift, cancer_type, P_B, feature_combinations, min_count):
    """
    Compute lifts for a single cancer type across all feature combinations.
    """
    lifts = []

    for feature_combination in feature_combinations:
        # Step 1: Combine features
        combined_feature = combine_features(data_for_lift, feature_combination)

        # Step 2: Filter and compute probabilities
        P_A, joint_prob = filter_and_compute_probabilities(
            combined_feature, data_for_lift, cancer_type, min_count
        )

        if P_A is None or joint_prob is None:
            continue  # Skip if no valid combined features

        # Step 3: Calculate lift
        lift = (joint_prob / (P_A * P_B)).round(2)

        # Step 4: Store result
        lifts.append((cancer_type, feature_combination, lift))

    return lifts

In [43]:
def compute_all_lifts(data_for_lift, cancer_probabilities, feature_combinations, min_count=100):
    """
    Main function to compute lifts for all cancer types.
    """
    all_lifts = []

    for cancer_type, P_B in cancer_probabilities.items():
        lifts = calculate_lifts_for_cancer_type(
            data_for_lift, cancer_type, P_B, feature_combinations, min_count
        )
        all_lifts.extend(lifts)

    return all_lifts

In [44]:
lifts = compute_all_lifts(data_for_lift, cancer_probabilities, feature_combinations, min_count=100)

KeyboardInterrupt: 

In [47]:
data_dt = pd.read_csv('decision_tree_sentences.csv')

In [48]:
data_dt.head()

Unnamed: 0,Cancer Type,Sentence
0,Non Small Cell Lung Cancer,If Smoke Status is 0 AND TMB (nonsynonymous) <...
1,Small Cell Lung Cancer,If Smoke Status is 0 AND TMB (nonsynonymous) <...
2,Non Small Cell Lung Cancer,If Smoke Status is 0 AND TMB (nonsynonymous) <...
3,Non Small Cell Lung Cancer,If Smoke Status is 0 AND TMB (nonsynonymous) <...
4,Small Cell Lung Cancer,If Smoke Status is 0 AND Sex is 0 AND Hugo_Sym...
