#  DataBase contruction

In [302]:
import pandas as pd
import itertools
from itertools import combinations

from pandas.core.nanops import nanall
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import tree
from XGBoost_Model import *
import sns
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency

In [243]:
# reading all the data files
data_clinical_patient = pd.read_csv('pan_origimed_2020/data_clinical_patient.txt', sep="\t")
data_clinical_sample = pd.read_csv('pan_origimed_2020/data_clinical_sample.txt', sep="\t")
data_cna_log2 = pd.read_csv('pan_origimed_2020/data_cna_log2.txt', sep="\t")
data_cna = pd.read_csv('pan_origimed_2020/data_cna.txt', sep="\t")
data_mutations = pd.read_csv('pan_origimed_2020/data_mutations.txt', sep="\t", header=2, dtype={"Exon_Number": "string"})
data_sv = pd.read_csv('pan_origimed_2020/data_sv.txt', sep="\t")

In [244]:
# removing bad rows
data_clinical_sample = data_clinical_sample[4:]
data_clinical_patient = data_clinical_patient[4:]

In [245]:
data_clinical_patient = data_clinical_patient.rename(columns={'#Patient Identifier': 'PATIENT_ID'})

In [246]:
data_clinical_patient.head()

Unnamed: 0,PATIENT_ID,Sex,Diagnosis Age,Smoke Status,Treatment
4,Patient0001,Female,67,Unknown,Other_Treatments
5,Patient0002,Male,75,Unknown,Treatment-naive
6,Patient0003,Female,45,Unknown,Treatment-naive
7,Patient0004,Male,70,Unknown,Treatment-naive
8,Patient0005,Male,53,Unknown,Treatment-naive


In [247]:
# matching the sample id to match other tables
data_clinical_patient["SAMPLE_ID"] = data_clinical_patient["PATIENT_ID"].apply(lambda x: "P-" + x[7:])

In [248]:
# make all sample id header name the same - "SAMPLE_ID"
data_clinical_sample.rename(columns={"Sample Identifier": 'SAMPLE_ID'}, inplace=True)
data_mutations.rename(columns={"Tumor_Sample_Barcode": 'SAMPLE_ID'}, inplace=True)
data_sv.rename(columns={"Sample_Id": 'SAMPLE_ID'}, inplace=True)

In [249]:
# merge everything
merged_clinical_data = data_clinical_patient.merge(data_clinical_sample, on="SAMPLE_ID", how='outer')
merged_mutations_data = merged_clinical_data.merge(data_mutations, on="SAMPLE_ID", how='outer')
merged_all_data = merged_mutations_data.merge(data_sv, on="SAMPLE_ID", how='outer')

In [250]:
merged_all_data["SNP_event"] = merged_all_data["Reference_Allele"].fillna("").astype(str) + ">" + merged_all_data["Tumor_Seq_Allele2"].fillna("").astype(str)


In [251]:
data_for_model = merged_all_data[["PATIENT_ID", "Cancer Type", 'Cancer Type Detailed', 'Tumor Stage',
                                'Sample Type', "Sex", "Diagnosis Age", "Smoke Status", "TMB (nonsynonymous)",
                                "Hugo_Symbol", "Chromosome", "Start_Position", "End_Position",
                                "Consequence", "Variant_Type", "SNP_event", "Protein_position", "Codons",
                                "Exon_Number","VAR_TYPE_SX", "Site1_Hugo_Symbol", "Site2_Hugo_Symbol","Event_Info"]]

In [252]:
data_for_model["Exon_Number"].isnull().sum()

np.int64(15278)

In [253]:
data_for_model.head(20)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,5-Feb,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,3-Mar,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,16-Jul,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,QKI,...,missense_variant,SNP,G>A,47.0,Gaa/Aaa,8-Jan,Substitution/Indel,,,
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,FBXW7,...,missense_variant,SNP,G>A,505.0,Cgc/Tgc,12-Oct,Substitution/Indel,,,
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,PTPN11,...,intron_variant,DEL,TTTC>-,,,,Substitution/Indel,,,
8,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,GLI1,...,missense_variant,SNP,G>T,488.0,Gct/Tct,12-Nov,Substitution/Indel,,,
9,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,ATM,...,"splice_region_variant,intron_variant",SNP,G>A,1026.0,,,Substitution/Indel,,,


In [254]:
# Function to handle the conversion
def convert_exon_number(val):
    try:
        # First, try to convert to 'Month-Year' format (e.g., 'Sep-89' -> '09/89')
        return pd.to_datetime(val, format='%b-%y').strftime('%m/%y')
    except ValueError:
        pass

    try:
        # Then, try to convert to 'DD-Mon' format (e.g., '14-Sep' -> '09/14')
        date_obj = pd.to_datetime(val, format='%d-%b', errors='raise')
        return date_obj.strftime('%m/%d')
    except ValueError:
        # If neither format matches, return the value as is (non-date-like string)
        return val

In [255]:
# Apply the function to the column
data_for_model.loc[:, 'Exon_Number'] = data_for_model['Exon_Number'].apply(convert_exon_number)

In [256]:
data_for_model["Exon_Number"].isnull().sum()

np.int64(15278)

In [257]:
data_for_model["Cancer Type"].value_counts()

Cancer Type
Colorectal Carcinoma                     28396
Non Small Cell Lung Cancer               19526
Gastric Cancer                           10832
Liver Hepatocellular Carcinoma            9633
Esophageal Carcinoma                      6298
Intrahepatic Cholangiocarcinoma           3939
Small Cell Lung Cancer                    3025
Pancreatic Cancer                         2957
Extrahepatic Cholangiocarcinoma           2883
Breast Carcinoma                          2350
Soft Tissue Sarcoma                       2270
Gallbladder Carcinoma                     2192
Ovarian Carcinoma                         1851
Urothelial Carcinoma                      1631
Kidney Renal Cell Carcinoma               1500
Uterine Corpus Endometrial Carcinoma      1358
Cancer of Unknown Primary                 1068
Head and Neck Carcinoma                    997
Carcinoma of Uterine Cervix                787
Small Bowel Carcinoma                      755
Bone Sarcoma                               601
G

In [149]:
def create_age_range(x):
    if x <= 10:
        return "0-10"
    elif x <= 20:
        return "11-20"
    elif x <= 30:
        return "21-30"
    elif x <= 40:
        return "31-40"
    elif x <= 50:
        return "41-50"
    elif x <= 60:
        return "51-60"
    elif x <= 70:
        return "61-70"
    elif x <= 80:
        return "71-80"
    else:
        return "80+"

In [150]:
data_for_model.loc[:, 'Diagnosis Age'] = data_for_model['Diagnosis Age'].astype(int).apply(create_age_range).astype("category")

In [151]:
data_for_model

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,02/05,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,03/03,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,07/16,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105901,Patient9998,Gastric Cancer,Gastric Adenocarcinoma,III,Primary,Male,61-70,Unknown,0.066666667,RHOA,...,missense_variant,SNP,T>C,59.0,gAc/gGc,03/05,Substitution/Indel,,,
105902,Patient9999,Gastric Cancer,Gastric Adenocarcinoma,III,Primary,Male,61-70,Unknown,0.133333333,PREX2,...,missense_variant,SNP,G>T,930.0,aGc/aTc,24/40,Substitution/Indel,,,
105903,Patient9999,Gastric Cancer,Gastric Adenocarcinoma,III,Primary,Male,61-70,Unknown,0.133333333,CTNNB1,...,missense_variant,SNP,C>T,582.0,Cgg/Tgg,11/15,Substitution/Indel,,,
105904,Patient9999,Gastric Cancer,Gastric Adenocarcinoma,III,Primary,Male,61-70,Unknown,0.133333333,TP53,...,missense_variant,SNP,T>C,246.0,Atg/Gtg,07/11,Substitution/Indel,,,


In [258]:
tt = dict(enumerate(data_for_model["Exon_Number"].astype('category').cat.categories))
tt.values()
# data[object_columns] = data[object_columns].astype('category')

dict_values(['01/01', '01/02', '01/03', '01/04', '01/05', '01/06', '01/07', '01/08', '01/09', '01/10', '01/11', '01/12', '01/13', '01/14', '01/15', '01/16', '01/17', '01/18', '01/19', '01/20', '01/21', '01/22', '01/23', '01/24', '01/25', '01/26', '01/27', '01/28', '01/29', '01/30', '01/31', '01/33', '01/34', '01/35', '01/36', '01/37', '01/38', '01/40', '01/43', '01/45', '01/47', '01/49', '01/51', '01/52', '01/54', '01/57', '01/59', '01/79', '01/87', '01/89', '01/91', '02/02', '02/03', '02/04', '02/05', '02/06', '02/07', '02/08', '02/09', '02/10', '02/11', '02/12', '02/13', '02/14', '02/15', '02/16', '02/17', '02/18', '02/19', '02/20', '02/21', '02/22', '02/23', '02/24', '02/25', '02/26', '02/27', '02/28', '02/29', '02/30', '02/31', '02/32', '02/33', '02/34', '02/35', '02/36', '02/37', '02/38', '02/39', '02/40', '02/42', '02/43', '02/44', '02/45', '02/46', '02/49', '02/51', '02/52', '02/54', '02/57', '02/58', '02/59', '02/63', '02/79', '02/87', '02/89', '02/91', '03/03', '03/04', '03/05

In [259]:
# Filter cancer types with at least 2000 samples
cancer_counts = data_for_model['Cancer Type'].value_counts()
valid_cancer_types = cancer_counts[cancer_counts >= 2000].index
data_for_model = data_for_model[data_for_model['Cancer Type'].isin(valid_cancer_types)]
data_for_model = data_for_model[data_for_model['Chromosome'].notnull()]

In [260]:
data_for_lift = data_for_model.copy()

In [188]:
data_for_lift["Consequence"].unique()

array(['missense_variant', 'frameshift_variant',
       'stop_gained,splice_region_variant', 'stop_gained',
       'intron_variant', 'splice_region_variant,intron_variant',
       'inframe_deletion', 'intron_variant,non_coding_transcript_variant',
       'missense_variant,splice_region_variant',
       'downstream_gene_variant',
       'splice_region_variant,synonymous_variant', 'synonymous_variant',
       'splice_acceptor_variant', 'upstream_gene_variant',
       'inframe_insertion', 'splice_donor_variant',
       'splice_acceptor_variant,intron_variant',
       'inframe_deletion,splice_region_variant',
       'frameshift_variant,splice_region_variant', '5_prime_UTR_variant',
       'splice_acceptor_variant,coding_sequence_variant,intron_variant',
       nan, 'start_lost', 'protein_altering_variant', 'stop_lost',
       'splice_donor_variant,coding_sequence_variant,intron_variant',
       'splice_acceptor_variant,coding_sequence_variant',
       'start_lost,5_prime_UTR_variant',
    

In [23]:
data_for_model['Consequence'].str.split(',')
dummy_vars = data_for_model['Consequence'].str.split(',').explode().str.get_dummies().groupby(level=0).sum()
data_for_model = data_for_model.join(dummy_vars)
data_for_model.drop('Consequence', axis=1, inplace=True)

In [24]:
data_for_model[data_for_model["PATIENT_ID"] == "Patient8178"]

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,start_retained_variant,stop_gained,stop_lost,stop_retained_variant,synonymous_variant,upstream_gene_variant
87594,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,TGFBR2,...,0,0,0,0,0,0,0,0,0,0
87595,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,POLE,...,0,0,0,0,0,0,0,0,0,0
87596,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,CTNNA1,...,0,0,0,0,0,0,0,0,0,0
87597,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,PTPN11,...,0,0,0,0,0,0,0,0,0,0
87598,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,SETD2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87797,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,BRAF,...,0,0,0,0,0,0,0,0,0,0
87798,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,PIK3C2B,...,0,0,0,0,0,0,0,0,0,0
87799,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,RECQL,...,0,0,0,0,0,0,0,0,0,0
87800,Patient8178,Colorectal Carcinoma,Colorectal Adenocarcinoma,I,Primary,Female,71-80,Unknown,6.166666667,NSD1,...,0,0,0,0,0,0,0,0,0,0


In [25]:
data_for_model.to_csv("pan_cancer_data_for_model.csv", index=False)

In [26]:
hypo_data = pd.read_csv("hypotheses.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'hypotheses.csv'

In [76]:
hypo_data_updates = hypo_data[hypo_data["support"] > 2].sort_values(["cancer_type", 'support'], ascending=[True, False])
hypo_data_updates.head()

Unnamed: 0,cancer_type,Smoke Status,Hugo_Symbol,TMB (nonsynonymous),Start_Position,Protein_position,Sex,SNP_event,Site1_Hugo_Symbol,Diagnosis Age,...,upstream_gene_variant,Site2_Hugo_Symbol,Event_Info,Exon_Number,splice_region_variant,End_Position,5_prime_UTR_variant,missense_variant,splice_donor_variant,support
137,Bone Sarcoma,,,0.033333,,,,,,11-20,...,,,,,,,,,,8
174,Bone Sarcoma,,,0.0,,,,,,11-20,...,,,,,,,,,,6
190,Bone Sarcoma,,,0.2,,,,,,11-20,...,,,,,,,,,,6
274,Bone Sarcoma,,,0.1,,,,,,11-20,...,,,,,,,,,,3
112,Breast Carcinoma,,,,,,Female,,,41-50,...,,NCOR1,NCOR1-Intragenic,,,,,,,11


In [77]:
def cancer_type_correlations(df):
    """
    Print cancer type and non-null feature-value pairs for each row in the DataFrame.

    Parameters:
    - df: DataFrame with columns "cancer_type", feature columns, and "support".
    """
    corr_list = []
    # Iterate through each row
    for index, row in df.iterrows():
        # Extract cancer type and support
        cancer_type = row["cancer_type"]
        support = row["support"]

        # Get feature-value pairs where the feature value is not null
        features = [
            f"{feature}={row[feature]}"
            for feature in df.columns
            if feature not in {"cancer_type", "support"} and not pd.isnull(row[feature])
        ]

        # Format and print the result
        features_str = ", ".join(features)
        corr_list.append(f"{cancer_type}: {features_str}, Support: {support}")
    return corr_list


In [78]:
corr_list = cancer_type_correlations(hypo_data_updates)

In [79]:
corr_list

['Bone Sarcoma: TMB (nonsynonymous)=0.033333333, Diagnosis Age=11-20, Support: 8',
 'Bone Sarcoma: TMB (nonsynonymous)=0.0, Diagnosis Age=11-20, Support: 6',
 'Bone Sarcoma: TMB (nonsynonymous)=0.2, Diagnosis Age=11-20, Support: 6',
 'Bone Sarcoma: TMB (nonsynonymous)=0.1, Diagnosis Age=11-20, Support: 3',
 'Breast Carcinoma: Sex=Female, Diagnosis Age=41-50, Site2_Hugo_Symbol=NCOR1, Event_Info=NCOR1-Intragenic, Support: 11',
 'Breast Carcinoma: TMB (nonsynonymous)=0.333333333, Sex=Female, Site1_Hugo_Symbol=BRCA2, Site2_Hugo_Symbol=BRCA2, Event_Info=BRCA2-BRCA2, Support: 7',
 'Breast Carcinoma: Hugo_Symbol=PIK3CA, Start_Position=178952085.0, Sex=Female, Exon_Number=21/21, End_Position=178952085.0, Support: 6',
 'Breast Carcinoma: Sex=Female, Site1_Hugo_Symbol=BRCA1, Diagnosis Age=31-40, Site2_Hugo_Symbol=BRCA1, Event_Info=BRCA1-Intragenic, Support: 4',
 'Breast Carcinoma: TMB (nonsynonymous)=0.766666667, Sex=Female, Diagnosis Age=51-60, Event_Info=LRP1B-Intragenic, Support: 4',
 'Breast

Lift Calculation

In [27]:
cancer_type_dummy = data_for_lift['Cancer Type'].str.get_dummies().groupby(level=0).sum()
data_for_lift = data_for_lift.join(cancer_type_dummy)

In [28]:
data_for_lift.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Esophageal Carcinoma,Extrahepatic Cholangiocarcinoma,Gallbladder Carcinoma,Gastric Cancer,Intrahepatic Cholangiocarcinoma,Liver Hepatocellular Carcinoma,Non Small Cell Lung Cancer,Pancreatic Cancer,Small Cell Lung Cancer,Soft Tissue Sarcoma
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KMT2C,...,0,0,0,0,0,0,0,0,0,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KRAS,...,0,0,0,0,0,0,0,0,0,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,SOX9,...,0,0,0,0,0,0,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,0,0,0,0,0,0,0,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Combine columns to create specific mutation identifiers
# data_for_lift['Mutation'] = data_for_lift['Chromosome'] + "_" + data_for_lift['Start_Position'].astype(str) + "_" + data_for_lift['Variant_Type']
data_for_lift['Position'] = data_for_lift['Start_Position'].astype(str) + "-" + data_for_lift['End_Position'].astype(str)

In [30]:
data_for_lift.to_csv("data_for_lift.csv", index=False)

In [84]:
# Select a subset of columns to analyze (e.g., most relevant ones)
columns_to_combine = ['Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event', "Consequence", 'Exon_Number',
                      "Diagnosis Age", "TMB (nonsynonymous)", "Position", "Protein_position", "Codons", "VAR_TYPE_SX"]

In [97]:
cancer_probabilities = {cancer_type: data_for_lift[cancer_type].mean() for cancer_type in list(data_for_lift["Cancer Type"].unique())}

In [99]:
cancer_probabilities

{'Colorectal Carcinoma': np.float64(0.3019271265394678),
 'Liver Hepatocellular Carcinoma': np.float64(0.1023333971454704),
 'Soft Tissue Sarcoma': np.float64(0.02279156828962202),
 'Gastric Cancer': np.float64(0.1149362942164933),
 'Pancreatic Cancer': np.float64(0.03129985323208474),
 'Breast Carcinoma': np.float64(0.024886733456703464),
 'Extrahepatic Cholangiocarcinoma': np.float64(0.030534107587263097),
 'Esophageal Carcinoma': np.float64(0.06688575500393508),
 'Small Cell Lung Cancer': np.float64(0.03211877565779678),
 'Intrahepatic Cholangiocarcinoma': np.float64(0.04178631442367005),
 'Non Small Cell Lung Cancer': np.float64(0.20719800906132346),
 'Gallbladder Carcinoma': np.float64(0.023302065386169784)}

In [87]:
    # Iterate over feature combinations
# for num_features in range(2, 6):
feature_combinations = list(combinations(columns_to_combine, 5))
feature_combinations

[('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Consequence'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Exon_Number'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Diagnosis Age'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'TMB (nonsynonymous)'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Position'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Protein_position'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'Codons'),
 ('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'VAR_TYPE_SX'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'Consequence'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'Exon_Number'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'Diagnosis Age'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'TMB (nonsynonymous)'),
 ('Sex', 'Smoke Status', 'Chromosome', 'SNP_event', 'Position'),
 ('Sex', 'Smoke St

In [88]:
# Precompute the mean probabilities for cancer types

lifts = []

for cancer_type, P_B in cancer_probabilities.items():
    for feature in feature_combinations:
        # Combine the selected features into a single feature
        combined_feature = data_for_lift[list(feature)].astype(str).agg('_'.join, axis=1)

        # Compute value counts for the combined feature
        combined_counts = combined_feature.value_counts()
        valid_features = combined_counts[combined_counts >= 100].index

        if valid_features.empty:
            continue  # Skip if no valid combined features

        # Filter the combined feature to include only valid entries
        filtered_data = combined_feature[combined_feature.isin(valid_features)]
        P_A = filtered_data.value_counts(normalize=True)

        # Compute joint probabilities for cancer type
        joint_prob = (
            filtered_data[data_for_lift[cancer_type] == 1]
            .value_counts(normalize=True)
            .reindex(P_A.index, fill_value=0)
        )

        # Calculate lift
        lift = (joint_prob / (P_A * P_B)).round(2)

        # Store results
        lifts.append((cancer_type, feature, lift))


KeyboardInterrupt: 

In [446]:
lifts = []
# Probability of the cancer type
for num_features in range(2, 6):
    feature_combinations = list(combinations(columns_to_combine, num_features))
    for cancer_type in cancer_type_dummy.columns:
        P_B = data_for_lift[cancer_type].mean()
        for feature in feature_combinations:
            # Create a combined feature from three columns
            combined_feature = data_for_lift[feature[0]].astype(str)

            for f in feature[1:]:
                combined_feature += "_" + data_for_lift[f].astype(str)
            # combined_feature = "_".join(data_for_lift[feature].astype(str) for feature in feature_combinations)

            min_count = 100
            P_A_counts = combined_feature.value_counts()

            # Filter combined features based on minimum count
            valid_features = P_A_counts[P_A_counts >= min_count].index
            filtered_data = combined_feature[combined_feature.isin(valid_features)]

            # Probability of the combined feature
            P_A = filtered_data.value_counts(normalize=True)

            # Joint probability of the combined feature and cancer type
            joint = (filtered_data[data_for_lift[cancer_type] == 1].value_counts(normalize=True).reindex(P_A.index, fill_value=0))

            # Calculate lift
            lift = (joint / (P_A * P_B)).round(2)  # Round lift to 2 decimal places for readability

            # Append results as a tuple of the feature triplet and their associated lift values
            lifts.append((cancer_type, feature, lift))

KeyboardInterrupt: 

In [39]:
# Flatten the results for easy visualization
lift_results = []

for cancer_type, feature_pair, lift in lifts:
    for feature_value, lift_value in lift.items():
        lift_results.append({
            'Cancer Type': cancer_type,
            'Feature Pair': feature_pair,
            'Feature Value': feature_value,
            'Lift': lift_value
        })

lift_df = pd.DataFrame(lift_results)
lift_df = lift_df.sort_values(by='Lift', ascending=False)

In [348]:
lift_df.sort_values(by='Lift', ascending=False)

Unnamed: 0,Cancer Type,Feature Pair,Feature Value,Lift
1078,Breast Carcinoma,"(Sex, Chromosome, Exon_Number)",Female_3_21/21,1075.31
1223,Breast Carcinoma,"(Sex, Hugo_Symbol, Exon_Number)",Female_PIK3CA_21/21,1056.89
1287,Breast Carcinoma,"(Sex, SNP_event, Exon_Number)",Female_A>G_21/21,1056.07
436,Breast Carcinoma,"(Sex, Smoke Status, Exon_Number)",Female_Unknown_21/21,1000.34
24665,Small Cell Lung Cancer,"(Smoke Status, Chromosome, Hugo_Symbol)",Smoker_13_RB1,943.76
...,...,...,...,...
27943,Soft Tissue Sarcoma,"(Hugo_Symbol, SNP_event, Exon_Number)",APC_C>T_16/16,
27944,Soft Tissue Sarcoma,"(Hugo_Symbol, SNP_event, Exon_Number)",APC_G>T_16/16,
27947,Soft Tissue Sarcoma,"(Hugo_Symbol, SNP_event, Exon_Number)",EGFR_GGAATTAAGAGAAGC>-_19/28,
27948,Soft Tissue Sarcoma,"(Hugo_Symbol, SNP_event, Exon_Number)",EGFR_T>G_21/28,


In [136]:
filter_triple_data = data[data["Smoke Status"] == "Nonsmoker"]
filter_triple_data = filter_triple_data[filter_triple_data["Hugo_Symbol"] == "TP53"]
filter_triple_data = filter_triple_data[filter_triple_data["SNP_event"] == "G>A"]

In [40]:
def combine_features(data, feature_combination):
    """
    Combine selected features into a single feature by joining their values.
    """
    return data[list(feature_combination)].astype(str).agg('_'.join, axis=1)

In [41]:
def filter_and_compute_probabilities(combined_feature, data_for_lift, cancer_type, min_count):
    """
    Filter valid features, compute P(A), and joint probabilities for a cancer type.
    """
    # Step 1: Compute value counts
    combined_counts = combined_feature.value_counts()
    valid_features = combined_counts[combined_counts >= min_count].index

    # Step 2: Skip if no valid combined features
    if valid_features.empty:
        return None, None

    # Step 3: Filter the combined feature
    filtered_data = combined_feature[combined_feature.isin(valid_features)]

    # Step 4: Compute probabilities
    P_A = filtered_data.value_counts(normalize=True)
    joint_prob = (
        filtered_data[data_for_lift[cancer_type] == 1]
        .value_counts(normalize=True)
        .reindex(P_A.index, fill_value=0)
    )

    return P_A, joint_prob


In [42]:
def calculate_lifts_for_cancer_type(data_for_lift, cancer_type, P_B, feature_combinations, min_count):
    """
    Compute lifts for a single cancer type across all feature combinations.
    """
    lifts = []

    for feature_combination in feature_combinations:
        # Step 1: Combine features
        combined_feature = combine_features(data_for_lift, feature_combination)

        # Step 2: Filter and compute probabilities
        P_A, joint_prob = filter_and_compute_probabilities(
            combined_feature, data_for_lift, cancer_type, min_count
        )

        if P_A is None or joint_prob is None:
            continue  # Skip if no valid combined features

        # Step 3: Calculate lift
        lift = (joint_prob / (P_A * P_B)).round(2)

        # Step 4: Store result
        lifts.append((cancer_type, feature_combination, lift))

    return lifts

In [43]:
def compute_all_lifts(data_for_lift, cancer_probabilities, feature_combinations, min_count=100):
    """
    Main function to compute lifts for all cancer types.
    """
    all_lifts = []

    for cancer_type, P_B in cancer_probabilities.items():
        lifts = calculate_lifts_for_cancer_type(
            data_for_lift, cancer_type, P_B, feature_combinations, min_count
        )
        all_lifts.extend(lifts)

    return all_lifts

In [44]:
lifts = compute_all_lifts(data_for_lift, cancer_probabilities, feature_combinations, min_count=100)

KeyboardInterrupt: 

In [100]:
data_dr = pd.read_csv('data_for_rules.csv')

In [102]:
data_dr.columns

Index(['PATIENT_ID', 'Cancer Type', 'Cancer Type Detailed', 'Tumor Stage',
       'Sample Type', 'Sex', 'Diagnosis Age', 'Smoke Status',
       'TMB (nonsynonymous)', 'Hugo_Symbol', 'Start_Position', 'End_Position',
       'Variant_Type', 'SNP_event', 'Protein_position', 'Codons',
       'Exon_Number', 'VAR_TYPE_SX', '3_prime_UTR_variant',
       '5_prime_UTR_variant', 'NMD_transcript_variant',
       'coding_sequence_variant', 'downstream_gene_variant',
       'frameshift_variant', 'inframe_deletion', 'inframe_insertion',
       'intergenic_variant', 'intron_variant', 'mature_miRNA_variant',
       'missense_variant', 'non_coding_transcript_exon_variant',
       'non_coding_transcript_variant', 'protein_altering_variant',
       'splice_acceptor_variant', 'splice_donor_variant',
       'splice_region_variant', 'start_lost', 'start_retained_variant',
       'stop_gained', 'stop_lost', 'stop_retained_variant',
       'synonymous_variant', 'upstream_gene_variant', 'chr_1', 'chr_10',
    

In [55]:
narrowed = pd.read_csv("narrowed_cancers_data.csv")

In [60]:
narrowed[narrowed.index == 529]

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,start_retained_variant,stop_gained,stop_lost,stop_retained_variant,synonymous_variant,upstream_gene_variant
529,Patient0025,Liver Hepatocellular Carcinoma,Liver Hepatocellular Carcinoma,I,Primary,Male,61-70,Unknown,0.166667,TERT,...,0,0,0,0,0,0,0,0,0,1


In [63]:
from sklearn.impute import KNNImputer
narrowed[['Current_Exon', 'Total_Exons']] = narrowed['Exon_Number'].str.split('/', expand=True)
narrowed[['Current_Exon', 'Total_Exons']] = narrowed[['Current_Exon', 'Total_Exons']].astype(float)

imputer = KNNImputer(n_neighbors=5)
narrowed[["Current_Exon", "Total_Exons"]] = imputer.fit_transform(narrowed[["Current_Exon", "Total_Exons"]])
# ransform(narrowed[["Exon_Number"]])

In [66]:
narrowed[["Exon_Number", "Current_Exon", "Total_Exons"]]

Unnamed: 0,Exon_Number,Current_Exon,Total_Exons
0,57/59,57.000000,59.000000
1,02/05,2.000000,5.000000
2,03/03,3.000000,3.000000
3,07/16,7.000000,16.000000
4,16/16,16.000000,16.000000
...,...,...,...
94021,03/05,3.000000,5.000000
94022,24/40,24.000000,40.000000
94023,11/15,11.000000,15.000000
94024,07/11,7.000000,11.000000


In [33]:
df = pd.read_csv("data_for_rules.csv")

In [34]:
df.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,chr_21,chr_22,chr_3,chr_4,chr_5,chr_6,chr_7,chr_8,chr_9,chr_X
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,KMT2C,...,0,0,0,0,0,0,1,0,0,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,KRAS,...,0,0,0,0,0,0,0,0,0,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,SOX9,...,0,0,0,0,0,0,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,APC,...,0,0,0,0,1,0,0,0,0,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,APC,...,0,0,0,0,1,0,0,0,0,0


In [63]:
df = df[df["Diagnosis Age"] < 71.5]
df = df[df["Diagnosis Age"] > 47.5]
df = df[df["TMB (nonsynonymous)"] < 0.28]
df = df[df["Sex"] == "Female"]
df

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,chr_21,chr_22,chr_3,chr_4,chr_5,chr_6,chr_7,chr_8,chr_9,chr_X
410,Patient0009,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Metastasis,Female,65,Unknown,0.200000,FGF14,...,0,0,0,0,0,0,0,0,0,0
411,Patient0009,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Metastasis,Female,65,Unknown,0.200000,TP53,...,0,0,0,0,0,0,0,0,0,0
412,Patient0009,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Metastasis,Female,65,Unknown,0.200000,GATA6,...,0,0,0,0,0,0,0,0,0,0
413,Patient0009,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Metastasis,Female,65,Unknown,0.200000,GATA6,...,0,0,0,0,0,0,0,0,0,0
414,Patient0009,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Metastasis,Female,65,Unknown,0.200000,FAT1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80111,Patient9987,Extrahepatic Cholangiocarcinoma,Extrahepatic Cholangiocarcinoma,II,Primary,Female,65,Unknown,0.266667,TP53,...,0,0,0,0,0,0,0,0,0,0
80119,Patient9990,Extrahepatic Cholangiocarcinoma,Extrahepatic Cholangiocarcinoma,II,Primary,Female,67,Unknown,0.133333,CDKN2A,...,0,0,0,0,0,0,0,0,1,0
80120,Patient9990,Extrahepatic Cholangiocarcinoma,Extrahepatic Cholangiocarcinoma,II,Primary,Female,67,Unknown,0.133333,ACVR1B,...,0,0,0,0,0,0,0,0,0,0
80121,Patient9990,Extrahepatic Cholangiocarcinoma,Extrahepatic Cholangiocarcinoma,II,Primary,Female,67,Unknown,0.133333,TP53,...,0,0,0,0,0,0,0,0,0,0


In [62]:
df["Cancer Type"].drop_duplicates()

410                 Colorectal Carcinoma
442                     Breast Carcinoma
484                  Soft Tissue Sarcoma
661                    Pancreatic Cancer
678                       Gastric Cancer
781      Extrahepatic Cholangiocarcinoma
938      Intrahepatic Cholangiocarcinoma
1038          Non Small Cell Lung Cancer
1257               Gallbladder Carcinoma
2366      Liver Hepatocellular Carcinoma
6585                Esophageal Carcinoma
31460             Small Cell Lung Cancer
Name: Cancer Type, dtype: object

In [64]:
df[["PATIENT_ID", "Cancer Type"]].drop_duplicates()["Cancer Type"].value_counts()

Cancer Type
Non Small Cell Lung Cancer         525
Colorectal Carcinoma               156
Intrahepatic Cholangiocarcinoma    134
Breast Carcinoma                   130
Pancreatic Cancer                  127
Gastric Cancer                     117
Soft Tissue Sarcoma                 94
Gallbladder Carcinoma               78
Extrahepatic Cholangiocarcinoma     77
Liver Hepatocellular Carcinoma      56
Esophageal Carcinoma                45
Small Cell Lung Cancer               5
Name: count, dtype: int64

In [45]:
df

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,chr_21,chr_22,chr_3,chr_4,chr_5,chr_6,chr_7,chr_8,chr_9,chr_X
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,KMT2C,...,0,0,0,0,0,0,1,0,0,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,KRAS,...,0,0,0,0,0,0,0,0,0,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,SOX9,...,0,0,0,0,0,0,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,APC,...,0,0,0,0,1,0,0,0,0,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333,APC,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80148,Patient9998,Gastric Cancer,Gastric Adenocarcinoma,III,Primary,Male,66,Unknown,0.066667,CDH1,...,0,0,0,0,0,0,0,0,0,0
80149,Patient9998,Gastric Cancer,Gastric Adenocarcinoma,III,Primary,Male,66,Unknown,0.066667,RHOA,...,0,0,1,0,0,0,0,0,0,0
80150,Patient9999,Gastric Cancer,Gastric Adenocarcinoma,III,Primary,Male,63,Unknown,0.133333,PREX2,...,0,0,0,0,0,0,0,1,0,0
80151,Patient9999,Gastric Cancer,Gastric Adenocarcinoma,III,Primary,Male,63,Unknown,0.133333,CTNNB1,...,0,0,1,0,0,0,0,0,0,0


In [279]:
df_rules = pd.read_csv("data_for_rules.csv")

In [280]:
df_rules['Exon_Number'].value_counts()

Exon_Number
02/05    1848
05/11    1664
07/11    1415
16/16    1378
08/11    1356
         ... 
03/58       1
14/44       1
13/44       1
06/79       1
28/51       1
Name: count, Length: 1841, dtype: int64

In [281]:
# Smoke Status - Convert to dummies
dummy_smoking = df_rules['Smoke Status'].str.get_dummies().groupby(level=0).sum()

# Hugo Symbol - Convert to dummies
dummy_hugo_symbol = df_rules['Hugo_Symbol'].str.get_dummies().groupby(level=0).sum()

# Variant Type - Convert to dummies
dummy_Variant_Type = df_rules['Variant_Type'].str.get_dummies().groupby(level=0).sum()

# SNP_event - Keep only top 100 most frequent values
top_100 = df_rules['SNP_event'].value_counts().nlargest(100).index
df_rules['SNP_event'] = df_rules['SNP_event'].where(df_rules['SNP_event'].isin(top_100), other=None)
dummy_snp_event = df_rules['SNP_event'].str.get_dummies().groupby(level=0).sum()

# Combine all dummy variables
dummy_vars = pd.concat([dummy_smoking, dummy_hugo_symbol, dummy_snp_event, dummy_Variant_Type], axis=1)

# Join with original DataFrame
df_rules = df_rules.join(dummy_vars)

# Drop original categorical columns
df_rules.drop(['Smoke Status', 'SNP_event', 'Hugo_Symbol', 'Variant_Type'], axis=1, inplace=True)

In [282]:
df_rules.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,TMB (nonsynonymous),Start_Position,End_Position,...,TGTT>-,TT>-,TTAAGAGAAGCAACATCT>-,TTC>-,TTG>-,DEL,DNP,INS,SNP,TNP
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,151836340.0,151836340.0,...,0,0,0,0,0,0,0,0,1,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,25398285.0,25398285.0,...,0,0,0,0,0,0,0,0,1,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,70119705.0,70119705.0,...,0,0,0,0,0,1,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,112128143.0,112128143.0,...,0,0,0,0,0,0,0,0,1,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,112175147.0,112175147.0,...,0,0,0,0,0,0,0,0,1,0


In [53]:
len(list(data_for_model["Event_Info"].unique()))

1263

In [64]:
df_rules["Cancer Type"].unique()

array(['Colorectal Carcinoma', 'Liver Hepatocellular Carcinoma',
       'Soft Tissue Sarcoma', 'Gastric Cancer', 'Pancreatic Cancer',
       'Breast Carcinoma', 'Extrahepatic Cholangiocarcinoma',
       'Esophageal Carcinoma', 'Small Cell Lung Cancer',
       'Intrahepatic Cholangiocarcinoma', 'Non Small Cell Lung Cancer',
       'Gallbladder Carcinoma'], dtype=object)

In [283]:
df_rules['Exon_Number'] = df_rules['Exon_Number'].str.split('/').str[0].astype(int)

In [284]:
df_rules.drop('Codons', axis=1, inplace=True)

In [285]:
df_rules.head(10)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,TMB (nonsynonymous),Start_Position,End_Position,...,TGTT>-,TT>-,TTAAGAGAAGCAACATCT>-,TTC>-,TTG>-,DEL,DNP,INS,SNP,TNP
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,151836340.0,151836340.0,...,0,0,0,0,0,0,0,0,1,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,25398285.0,25398285.0,...,0,0,0,0,0,0,0,0,1,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,70119705.0,70119705.0,...,0,0,0,0,0,1,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,112128143.0,112128143.0,...,0,0,0,0,0,0,0,0,1,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,112175147.0,112175147.0,...,0,0,0,0,0,0,0,0,1,0
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,163836364.0,163836364.0,...,0,0,0,0,0,0,0,0,1,0
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,153247289.0,153247289.0,...,0,0,0,0,0,0,0,0,1,0
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,57863367.0,57863367.0,...,0,0,0,0,0,0,0,0,1,0
8,Patient0002,Colorectal Carcinoma,Colorectal Adenocarcinoma,II,Primary,Male,75,0.3,70120165.0,70120165.0,...,0,0,0,0,0,1,0,0,0,0
9,Patient0002,Colorectal Carcinoma,Colorectal Adenocarcinoma,II,Primary,Male,75,0.3,134920413.0,134920413.0,...,0,0,0,0,0,0,0,0,1,0


In [286]:
df_rules.to_csv("data_for_decision.csv", index=False)

In [265]:
df = data_for_model.copy()
df.drop(['Site1_Hugo_Symbol', 'Site2_Hugo_Symbol', 'Event_Info'], axis=1, inplace=True)

In [266]:
data_for_model.dropna(inplace=True)
data_for_model['Exon_Number'] = data_for_model['Exon_Number'].str.split('/').str[0].astype(int)

AttributeError: Can only use .str accessor with string values!

In [278]:
# Step 1: Prepare your categorical features
categorical_features = ['Sex', 'VAR_TYPE_SX', 'Smoke Status', 'Hugo_Symbol', 'Variant_Type', 'SNP_event', 'Consequence', 'Chromosome']

# Step 2: Handle the special case of Codons
def prepare_data(df, columns):
    # For high-cardinality features like Codons, we can group them
    # Example: Group by first letter of codon or by some domain knowledge

    # Method 1: Keep only the most frequent codons and group others
    for column in columns:
        top_codons = df[column].value_counts().nlargest(100).index.tolist()
        df[f'{column}_grouped'] = df[column].apply(lambda x: x if x in top_codons else 'Other')

    # OR Method 2: Group by first nucleotide
    # df['Codons_grouped'] = df['Codons'].apply(lambda x: x[0] + '_codons' if isinstance(x, str) else 'Unknown')

    return df

# Step 3: Encoding categorical features
def encode_features(df, categorical_cols):
    label_encoders = {}

    # Store original values for interpretation
    feature_values = {}

    for col in categorical_cols + ['SNP_event_grouped']:
        if col in df.columns:
            le = LabelEncoder()
            df[col + '_encoded'] = le.fit_transform(df[col])

            # Store mapping for interpretation
            label_encoders[col] = le
            feature_values[col] = dict(zip(le.transform(le.classes_), le.classes_))

    return df, label_encoders, feature_values

def extract_rules(clf, feature_names, class_names, feature_values):
    tree_ = clf.tree_

    feature_name = [
        feature_names[i] if i != tree._tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []

    def recurse(node, path, paths):
        if tree_.feature[node] != tree._tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]

            # Special handling for chromosome dummy variables
            if name.startswith('chr_'):
                # Extract chromosome number
                chr_num = name.split('_')[1]

                # For dummy variables, typically threshold is 0.5
                if threshold <= 0.5:
                    # chr_X ≤ 0.5 means the mutation is NOT on this chromosome
                    path.append((name, "chromosome", {"excluded": chr_num}, f"Chromosome is not {chr_num}"))
                    recurse(tree_.children_left[node], path, paths)
                    path.pop()

                    # chr_X > 0.5 means the mutation IS on this chromosome
                    path.append((name, "chromosome", {"included": chr_num}, f"Chromosome is {chr_num}"))
                    recurse(tree_.children_right[node], path, paths)
                    path.pop()
            # Handle categorical features
            elif name.endswith('_encoded'):
                original_name = name.replace('_encoded', '')

                if original_name in feature_values:
                    # Handle categorical feature
                    left_values = [feature_values[original_name][i] for i in range(len(feature_values[original_name]))
                                  if i <= threshold]
                    right_values = [feature_values[original_name][i] for i in range(len(feature_values[original_name]))
                                   if i > threshold]

                    # Store as tuples: (feature_name, "categorical", values_list, readable_condition)
                    if len(left_values) <= 3:
                        left_condition = f"{original_name} is {' or '.join(map(str, left_values))}"
                    else:
                        left_condition = f"{original_name} is in a group of {len(left_values)} values"

                   # Handle special case for Codons
                    if original_name == 'SNP_event_grouped' and 'Other' in left_values:
                        left_condition = f"SNP event is among the less common types"
                    if original_name == 'SNP_event_grouped' and 'Other' in right_values:
                        right_condition = f"SNP event is among the less common types"

                    path.append((original_name, "categorical", set(left_values), left_condition))
                    recurse(tree_.children_left[node], path, paths)
                    path.pop()

                    if len(right_values) <= 3:
                        right_condition = f"{original_name} is {' or '.join(map(str, right_values))}"
                    else:
                        right_condition = f"{original_name} is in a group of {len(right_values)} values"

                    path.append((original_name, "categorical", set(right_values), right_condition))
                    recurse(tree_.children_right[node], path, paths)
                    path.pop()
                else:
                    # Standard case for encoded features without mapping
                    path.append((original_name, "categorical", {f"≤ category {threshold:.0f}"},
                                f"{original_name} ≤ category {threshold:.0f}"))
                    recurse(tree_.children_left[node], path, paths)
                    path.pop()

                    path.append((original_name, "categorical", {f"> category {threshold:.0f}"},
                                f"{original_name} > category {threshold:.0f}"))
                    recurse(tree_.children_right[node], path, paths)
                    path.pop()
            else:
                # Numerical features - ensure we use consistent 4-tuple format
                left_condition = f"{name} ≤ {threshold:.2f}"
                path.append((name, "numerical", {"min": float("-inf"), "max": threshold}, left_condition))
                recurse(tree_.children_left[node], path, paths)
                path.pop()

                right_condition = f"{name} > {threshold:.2f}"
                path.append((name, "numerical", {"min": threshold, "max": float("inf")}, right_condition))
                recurse(tree_.children_right[node], path, paths)
                path.pop()
        else:
            class_idx = np.argmax(tree_.value[node][0])
            paths.append((path.copy(), class_names[class_idx]))

    recurse(0, [], paths)

    # Generate human-readable sentences with consolidated features
    rules = []
    for path, outcome in paths:
        if path:
            # Group by feature name
            feature_groups = {}
            for condition in path:
                feature, cond_type, value_info, readable = condition  # Now this should always work
                if feature not in feature_groups:
                    feature_groups[feature] = []
                feature_groups[feature].append((cond_type, value_info, readable))

            # Process chromosome features
            chromosomes_included = []
            chromosomes_excluded = []
            other_feature_groups = {}

            for feature, conditions in feature_groups.items():
                if any(c[0] == "chromosome" for c in conditions):
                    for cond_type, value_info, _ in conditions:
                        if "included" in value_info:
                            chromosomes_included.append(value_info["included"])
                        if "excluded" in value_info:
                            chromosomes_excluded.append(value_info["excluded"])
                else:
                    other_feature_groups[feature] = conditions

            # Create consolidated conditions
            consolidated_conditions = []

            # Add chromosome conditions
            if chromosomes_included:
                if len(chromosomes_included) == 1:
                    consolidated_conditions.append(f"Chromosome is {chromosomes_included[0]}")
                else:
                    consolidated_conditions.append(f"Chromosome is one of {', '.join(chromosomes_included)}")

            if chromosomes_excluded:
                if len(chromosomes_excluded) <= 3:
                    consolidated_conditions.append(f"Chromosome is not {', '.join(chromosomes_excluded)}")

            # Process other features
            for feature, conditions in other_feature_groups.items():
                if all(c[0] == "numerical" for c in conditions):
                    # For numerical features
                    min_val = float("-inf")
                    max_val = float("inf")

                    for _, value_info, _ in conditions:
                        min_val = max(min_val, value_info.get("min", float("-inf")))
                        max_val = min(max_val, value_info.get("max", float("inf")))

                    if min_val > float("-inf") and max_val < float("inf"):
                        consolidated_conditions.append(f"{feature} is between {min_val:.2f} and {max_val:.2f}")
                    elif min_val > float("-inf"):
                        consolidated_conditions.append(f"{feature} > {min_val:.2f}")
                    elif max_val < float("inf"):
                        consolidated_conditions.append(f"{feature} ≤ {max_val:.2f}")

                elif all(c[0] == "categorical" for c in conditions):
                    # For categorical features - find intersection of values
                    value_sets = [c[1] for c in conditions]

                    # Find intersection of all sets
                    common_values = set.intersection(*value_sets) if value_sets else set()

                    # If intersection is non-empty, it's the stricter condition
                    if common_values:
                        if len(common_values) <= 3:
                            consolidated_conditions.append(f"{feature} is {' or '.join(map(str, common_values))}")
                        else:
                            consolidated_conditions.append(f"{feature} is in a group of {len(common_values)} values")
                    else:
                        # If no intersection (shouldn't happen in a valid tree), use original conditions
                        for _, _, readable in conditions:
                            consolidated_conditions.append(readable)

            # Create the final rule
            rule = "If " + " AND ".join(consolidated_conditions) + f", THEN cancer type is {outcome}"
            rules.append(rule)

    return rules

In [299]:
df = df_rules.copy()
df.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,TMB (nonsynonymous),Start_Position,End_Position,...,TGTT>-,TT>-,TTAAGAGAAGCAACATCT>-,TTC>-,TTG>-,DEL,DNP,INS,SNP,TNP
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,151836340.0,151836340.0,...,0,0,0,0,0,0,0,0,1,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,25398285.0,25398285.0,...,0,0,0,0,0,0,0,0,1,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,70119705.0,70119705.0,...,0,0,0,0,0,1,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,112128143.0,112128143.0,...,0,0,0,0,0,0,0,0,1,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,112175147.0,112175147.0,...,0,0,0,0,0,0,0,0,1,0


In [300]:
# d = prepare_data(df)
df, label_encoders, feature_values = encode_features(df, ['Sex', 'VAR_TYPE_SX'])
df.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,TMB (nonsynonymous),Start_Position,End_Position,...,TTAAGAGAAGCAACATCT>-,TTC>-,TTG>-,DEL,DNP,INS,SNP,TNP,Sex_encoded,VAR_TYPE_SX_encoded
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,151836340.0,151836340.0,...,0,0,0,0,0,0,1,0,0,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,25398285.0,25398285.0,...,0,0,0,0,0,0,1,0,0,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,70119705.0,70119705.0,...,0,0,0,1,0,0,0,0,0,1
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,112128143.0,112128143.0,...,0,0,0,0,0,0,1,0,0,1
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,0.333333,112175147.0,112175147.0,...,0,0,0,0,0,0,1,0,0,1


In [301]:
features_to_drop = ['Cancer Type', 'Cancer Type Detailed', 'Tumor Stage', 'Sample Type', 'Sex', 'VAR_TYPE_SX']
                    # 'Smoke Status', 'Hugo_Symbol', 'Variant_Type', 'SNP_event', 'SNP_event_grouped', 'Codons', 'Consequence', 'Chromosome']
y = df['Cancer Type']
X = df.drop(features_to_drop, axis=1)
X_train, X_test, y_train, y_test, X_test_with_id = stratified_split_by_patient(X, y)
feature_names = list(X_train.columns)
class_names = list(df['Cancer Type'].unique())
clf = tree.DecisionTreeClassifier(random_state=39)#, min_samples_leaf=10)#, max_depth=1000)
clf.fit(X_train, y_train)
sentences = extract_rules(clf, feature_names, class_names, feature_values)

# for sentence in sentences:
#     print(sentence)
sentences

['If Unknown ≤ 0.50 AND TMB (nonsynonymous) ≤ 0.25 AND Smoker ≤ 0.50 AND NOTCH2 ≤ 0.50 AND RPTOR ≤ 0.50 AND RUNX1T1 ≤ 0.50 AND PRDM1 ≤ 0.50 AND CYP17A1 ≤ 0.50 AND MYCN ≤ 0.50 AND PDGFRB ≤ 0.50 AND Sex is Female AND End_Position ≤ 7574008.50 AND stop_gained ≤ 0.50, THEN cancer type is Small Cell Lung Cancer',
 'If Unknown ≤ 0.50 AND TMB (nonsynonymous) ≤ 0.25 AND Smoker ≤ 0.50 AND NOTCH2 ≤ 0.50 AND RPTOR ≤ 0.50 AND RUNX1T1 ≤ 0.50 AND PRDM1 ≤ 0.50 AND CYP17A1 ≤ 0.50 AND MYCN ≤ 0.50 AND PDGFRB ≤ 0.50 AND Sex is Female AND End_Position ≤ 7574008.50 AND stop_gained > 0.50 AND Diagnosis Age ≤ 68.50, THEN cancer type is Small Cell Lung Cancer',
 'If Unknown ≤ 0.50 AND TMB (nonsynonymous) ≤ 0.25 AND Smoker ≤ 0.50 AND NOTCH2 ≤ 0.50 AND RPTOR ≤ 0.50 AND RUNX1T1 ≤ 0.50 AND PRDM1 ≤ 0.50 AND CYP17A1 ≤ 0.50 AND MYCN ≤ 0.50 AND PDGFRB ≤ 0.50 AND Sex is Female AND End_Position ≤ 7574008.50 AND stop_gained > 0.50 AND Diagnosis Age > 68.50, THEN cancer type is Non Small Cell Lung Cancer',
 'If Unknown ≤

In [274]:
feature_names

['Diagnosis Age',
 'TMB (nonsynonymous)',
 'Start_Position',
 'End_Position',
 'Protein_position',
 'Exon_Number',
 'Sex_encoded',
 'VAR_TYPE_SX_encoded',
 'Smoke Status_encoded',
 'Hugo_Symbol_encoded',
 'Variant_Type_encoded',
 'SNP_event_encoded',
 'Consequence_encoded',
 'Chromosome_encoded',
 'SNP_event_grouped_encoded']

In [277]:
data_for_model.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,SNP_event_grouped,Sex_encoded,VAR_TYPE_SX_encoded,Smoke Status_encoded,Hugo_Symbol_encoded,Variant_Type_encoded,SNP_event_encoded,Consequence_encoded,Chromosome_encoded,SNP_event_grouped_encoded
19,Patient0003,Colorectal Carcinoma,Colorectal Adenocarcinoma,III,Primary,Female,45,Unknown,0.5,APC,...,C>T,0,1,2,10,3,220,13,17,43
21,Patient0003,Colorectal Carcinoma,Colorectal Adenocarcinoma,III,Primary,Female,45,Unknown,0.5,APC,...,C>T,0,1,2,10,3,220,13,17,43
22,Patient0003,Colorectal Carcinoma,Colorectal Adenocarcinoma,III,Primary,Female,45,Unknown,0.5,ETV5,...,C>T,0,0,2,122,3,220,8,15,43
24,Patient0003,Colorectal Carcinoma,Colorectal Adenocarcinoma,III,Primary,Female,45,Unknown,0.5,AMER1,...,G>A,0,1,2,9,3,320,13,22,61
25,Patient0003,Colorectal Carcinoma,Colorectal Adenocarcinoma,III,Primary,Female,45,Unknown,0.5,LRP1B,...,T>C,0,0,2,233,3,462,9,11,81


In [None]:
df = prepare_data(data_for_model, ['SNP_event', 'Codons',])

In [307]:
df = pd.read_csv('models_hypotheses/combined_hypotheses.csv')
df['plausibility'] = None
df['novelty'] = None
df['comments'] = None


In [312]:
df[df['rank'] <= 10].to_excel('models_hypotheses/hypotheses_for_professional_evaluation.xlsx', index=False)