#  DataBase contruction

In [1]:
import pandas as pd
import itertools
from itertools import combinations
import sns
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency

In [2]:
# reading all the data files
data_clinical_patient = pd.read_csv('pan_origimed_2020/data_clinical_patient.txt', sep="\t")
data_clinical_sample = pd.read_csv('pan_origimed_2020/data_clinical_sample.txt', sep="\t")
data_cna_log2 = pd.read_csv('pan_origimed_2020/data_cna_log2.txt', sep="\t")
data_cna = pd.read_csv('pan_origimed_2020/data_cna.txt', sep="\t")
data_mutations = pd.read_csv('pan_origimed_2020/data_mutations.txt', sep="\t", header=2, dtype={"Exon_Number": "string"})
data_sv = pd.read_csv('pan_origimed_2020/data_sv.txt', sep="\t")

In [3]:
# removing bad rows
data_clinical_sample = data_clinical_sample[4:]
data_clinical_patient = data_clinical_patient[4:]

In [4]:
data_clinical_patient = data_clinical_patient.rename(columns={'#Patient Identifier': 'PATIENT_ID'})

In [5]:
data_clinical_patient.head()

Unnamed: 0,PATIENT_ID,Sex,Diagnosis Age,Smoke Status,Treatment
4,Patient0001,Female,67,Unknown,Other_Treatments
5,Patient0002,Male,75,Unknown,Treatment-naive
6,Patient0003,Female,45,Unknown,Treatment-naive
7,Patient0004,Male,70,Unknown,Treatment-naive
8,Patient0005,Male,53,Unknown,Treatment-naive


In [6]:
# matching the sample id to match other tables
data_clinical_patient["SAMPLE_ID"] = data_clinical_patient["PATIENT_ID"].apply(lambda x: "P-" + x[7:])

In [7]:
# make all sample id header name the same - "SAMPLE_ID"
data_clinical_sample.rename(columns={"Sample Identifier": 'SAMPLE_ID'}, inplace=True)
data_mutations.rename(columns={"Tumor_Sample_Barcode": 'SAMPLE_ID'}, inplace=True)
data_sv.rename(columns={"Sample_Id": 'SAMPLE_ID'}, inplace=True)

In [8]:
# merge everything
merged_clinical_data = data_clinical_patient.merge(data_clinical_sample, on="SAMPLE_ID", how='outer')
merged_mutations_data = merged_clinical_data.merge(data_mutations, on="SAMPLE_ID", how='outer')
merged_all_data = merged_mutations_data.merge(data_sv, on="SAMPLE_ID", how='outer')

In [9]:
merged_all_data.to_csv("pan_cancer_db_merged.csv")

In [10]:
merged_all_data["SNP_event"] = merged_all_data["Reference_Allele"].fillna("").astype(str) + ">" + merged_all_data["Tumor_Seq_Allele2"].fillna("").astype(str)


In [11]:
merged_all_data.head()

Unnamed: 0,PATIENT_ID,Sex,Diagnosis Age,Smoke Status,Treatment,SAMPLE_ID,#Patient Identifier,Cancer Type,Cancer Type Detailed,Tumor Stage,...,Site2_Hugo_Symbol,Center_y,Event_Info,DNA_support,RNA_support,Method,Connection_Type,SV_Status,Group,SNP_event
0,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,C>T
1,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,C>A
2,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,C>-
3,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,C>T
4,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,G>T


In [12]:
data_for_model = merged_all_data[["PATIENT_ID", "Cancer Type", 'Cancer Type Detailed', 'Tumor Stage',
                                'Sample Type', "Sex", "Diagnosis Age", "Smoke Status", "TMB (nonsynonymous)",
                                "Hugo_Symbol", "Chromosome", "Start_Position", "End_Position",
                                "Consequence", "Variant_Type", "SNP_event", "Protein_position", "Codons",
                                "Exon_Number","VAR_TYPE_SX", "Site1_Hugo_Symbol", "Site2_Hugo_Symbol","Event_Info"]]

In [13]:
data_for_model.to_csv("pan_cancer_db_for_model.csv")

In [14]:
data_for_model.shape

(105906, 23)

In [15]:
data_for_model.head(20)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,5-Feb,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,3-Mar,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,16-Jul,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,QKI,...,missense_variant,SNP,G>A,47.0,Gaa/Aaa,8-Jan,Substitution/Indel,,,
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,FBXW7,...,missense_variant,SNP,G>A,505.0,Cgc/Tgc,12-Oct,Substitution/Indel,,,
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,PTPN11,...,intron_variant,DEL,TTTC>-,,,,Substitution/Indel,,,
8,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,GLI1,...,missense_variant,SNP,G>T,488.0,Gct/Tct,12-Nov,Substitution/Indel,,,
9,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,ATM,...,"splice_region_variant,intron_variant",SNP,G>A,1026.0,,,Substitution/Indel,,,


In [16]:
# checking if the dates in Exon number are dd/mm or mm/dd by comparing the dates and not dates for gene APC
# data_for_model[data_for_model["Hugo_Symbol"].str.contains("APC", na=False)]
# result - the format of the dates are total exons-exon number

In [17]:
# Function to handle the conversion
def convert_exon_number(val):
    try:
        # First, try to convert to 'Month-Year' format (e.g., 'Sep-89' -> '09/89')
        return pd.to_datetime(val, format='%b-%y').strftime('%m/%y')
    except ValueError:
        pass

    try:
        # Then, try to convert to 'DD-Mon' format (e.g., '14-Sep' -> '09/14')
        date_obj = pd.to_datetime(val, format='%d-%b', errors='raise')
        return date_obj.strftime('%m/%d')
    except ValueError:
        # If neither format matches, return the value as is (non-date-like string)
        return val

In [18]:
# Apply the function to the column
data_for_model.loc[:, 'Exon_Number'] = data_for_model['Exon_Number'].apply(convert_exon_number)

In [19]:
data_for_model.head(10)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,02/05,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,03/03,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,07/16,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,QKI,...,missense_variant,SNP,G>A,47.0,Gaa/Aaa,01/08,Substitution/Indel,,,
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,FBXW7,...,missense_variant,SNP,G>A,505.0,Cgc/Tgc,10/12,Substitution/Indel,,,
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,PTPN11,...,intron_variant,DEL,TTTC>-,,,,Substitution/Indel,,,
8,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,GLI1,...,missense_variant,SNP,G>T,488.0,Gct/Tct,11/12,Substitution/Indel,,,
9,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,ATM,...,"splice_region_variant,intron_variant",SNP,G>A,1026.0,,,Substitution/Indel,,,


In [20]:
data_for_model["Hugo_Symbol"].value_counts()

Hugo_Symbol
TP53            6719
LRP1B           2138
APC             1882
KRAS            1781
EGFR            1402
                ... 
STK24-AS1          1
EGFR-AS1           1
DPYD-AS1           1
MIR4466            1
RP11-770J1.3       1
Name: count, Length: 479, dtype: int64

In [21]:
data_for_model["Chromosome"].value_counts()

Chromosome
17    11798
2      8468
1      7888
7      7764
3      7694
12     7669
5      6356
4      5822
8      4713
19     4491
X      4341
11     4282
9      3657
16     3449
6      3194
13     2753
10     2587
20     1711
22     1663
14     1648
15     1583
18     1516
21      446
Name: count, dtype: int64

In [22]:
data_for_model["Cancer Type"].value_counts()


Cancer Type
Colorectal Carcinoma                     28396
Non Small Cell Lung Cancer               19526
Gastric Cancer                           10832
Liver Hepatocellular Carcinoma            9633
Esophageal Carcinoma                      6298
Intrahepatic Cholangiocarcinoma           3939
Small Cell Lung Cancer                    3025
Pancreatic Cancer                         2957
Extrahepatic Cholangiocarcinoma           2883
Breast Carcinoma                          2350
Soft Tissue Sarcoma                       2270
Gallbladder Carcinoma                     2192
Ovarian Carcinoma                         1851
Urothelial Carcinoma                      1631
Kidney Renal Cell Carcinoma               1500
Uterine Corpus Endometrial Carcinoma      1358
Cancer of Unknown Primary                 1068
Head and Neck Carcinoma                    997
Carcinoma of Uterine Cervix                787
Small Bowel Carcinoma                      755
Bone Sarcoma                               601
G

In [23]:
data_for_model['Consequence'].str.split(',')
dummy_vars = data_for_model['Consequence'].str.split(',').explode().str.get_dummies().groupby(level=0).sum()
data_for_model = data_for_model.join(dummy_vars)
data_for_model.drop('Consequence', axis=1, inplace=True)



In [24]:
# data_for_model = pd.get_dummies(data_for_model, columns=['Smoke Status'], drop_first=False)
# data_for_model.drop('Smoke Status_Unknown', axis=1, inplace=True)

In [25]:
def create_age_range(x):
    if x <= 10:
        return "0-10"
    elif x <= 20:
        return "11-20"
    elif x <= 30:
        return "21-30"
    elif x <= 40:
        return "31-40"
    elif x <= 50:
        return "41-50"
    elif x <= 60:
        return "51-60"
    elif x <= 70:
        return "61-70"
    elif x <= 80:
        return "71-80"
    else:
        return "80+"

In [26]:
data_for_model['Diagnosis Age'] = data_for_model['Diagnosis Age'].astype(int).apply(create_age_range).astype("category")

In [27]:
data_for_model.to_csv("pan_cancer_data_for_model.csv", index=False)

In [28]:
data_for_model.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,start_retained_variant,stop_gained,stop_lost,stop_retained_variant,synonymous_variant,upstream_gene_variant
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KMT2C,...,0,0,0,0,0,0,0,0,0,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KRAS,...,0,0,0,0,0,0,0,0,0,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,SOX9,...,0,0,0,0,0,0,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,1,0,0,1,0,0,0,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,0,0,0,1,0,0,0,0


In [29]:
hypo_data = pd.read_csv("hypotheses.csv")

In [30]:
hypo_data_updates = hypo_data[hypo_data["support"] > 2].sort_values(["cancer_type", 'support'], ascending=[True, False])
hypo_data_updates.head()

Unnamed: 0,cancer_type,Smoke Status,Hugo_Symbol,TMB (nonsynonymous),Start_Position,Protein_position,Sex,SNP_event,Site1_Hugo_Symbol,Diagnosis Age,...,upstream_gene_variant,Site2_Hugo_Symbol,Event_Info,Exon_Number,splice_region_variant,End_Position,5_prime_UTR_variant,missense_variant,splice_donor_variant,support
137,Bone Sarcoma,,,0.033333,,,,,,11-20,...,,,,,,,,,,8
174,Bone Sarcoma,,,0.0,,,,,,11-20,...,,,,,,,,,,6
190,Bone Sarcoma,,,0.2,,,,,,11-20,...,,,,,,,,,,6
274,Bone Sarcoma,,,0.1,,,,,,11-20,...,,,,,,,,,,3
112,Breast Carcinoma,,,,,,Female,,,41-50,...,,NCOR1,NCOR1-Intragenic,,,,,,,11


In [31]:
def cancer_type_correlations(df):
    """
    Print cancer type and non-null feature-value pairs for each row in the DataFrame.

    Parameters:
    - df: DataFrame with columns "cancer_type", feature columns, and "support".
    """
    corr_list = []
    # Iterate through each row
    for index, row in df.iterrows():
        # Extract cancer type and support
        cancer_type = row["cancer_type"]
        support = row["support"]

        # Get feature-value pairs where the feature value is not null
        features = [
            f"{feature}={row[feature]}"
            for feature in df.columns
            if feature not in {"cancer_type", "support"} and not pd.isnull(row[feature])
        ]

        # Format and print the result
        features_str = ", ".join(features)
        corr_list.append(f"{cancer_type}: {features_str}, Support: {support}")
    return corr_list


In [32]:
corr_list = cancer_type_correlations(hypo_data_updates)

In [33]:
corr_list

['Bone Sarcoma: TMB (nonsynonymous)=0.033333333, Diagnosis Age=11-20, Support: 8',
 'Bone Sarcoma: TMB (nonsynonymous)=0.0, Diagnosis Age=11-20, Support: 6',
 'Bone Sarcoma: TMB (nonsynonymous)=0.2, Diagnosis Age=11-20, Support: 6',
 'Bone Sarcoma: TMB (nonsynonymous)=0.1, Diagnosis Age=11-20, Support: 3',
 'Breast Carcinoma: Sex=Female, Diagnosis Age=41-50, Site2_Hugo_Symbol=NCOR1, Event_Info=NCOR1-Intragenic, Support: 11',
 'Breast Carcinoma: TMB (nonsynonymous)=0.333333333, Sex=Female, Site1_Hugo_Symbol=BRCA2, Site2_Hugo_Symbol=BRCA2, Event_Info=BRCA2-BRCA2, Support: 7',
 'Breast Carcinoma: Hugo_Symbol=PIK3CA, Start_Position=178952085.0, Sex=Female, Exon_Number=21/21, End_Position=178952085.0, Support: 6',
 'Breast Carcinoma: Sex=Female, Site1_Hugo_Symbol=BRCA1, Diagnosis Age=31-40, Site2_Hugo_Symbol=BRCA1, Event_Info=BRCA1-Intragenic, Support: 4',
 'Breast Carcinoma: TMB (nonsynonymous)=0.766666667, Sex=Female, Diagnosis Age=51-60, Event_Info=LRP1B-Intragenic, Support: 4',
 'Breast

In [34]:
data_for_model.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,start_retained_variant,stop_gained,stop_lost,stop_retained_variant,synonymous_variant,upstream_gene_variant
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KMT2C,...,0,0,0,0,0,0,0,0,0,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KRAS,...,0,0,0,0,0,0,0,0,0,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,SOX9,...,0,0,0,0,0,0,0,0,0,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,1,0,0,1,0,0,0,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,0,0,0,1,0,0,0,0


In [35]:
from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [36]:
# Example for a single categorical feature and target
contingency_table = pd.crosstab(data_for_model['Diagnosis Age'], data_for_model['Cancer Type'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square statistic: {chi2}, p-value: {p}")


Chi-square statistic: 52578.23608677788, p-value: 0.0


In [37]:
# Example: Feature1 and Target are categorical
contingency_table = pd.crosstab(data_for_model['Sex'], data_for_model['Cancer Type'])
print(contingency_table)

Cancer Type  Bone Sarcoma  Breast Carcinoma  Cancer of Unknown Primary  \
Sex                                                                      
Female                254              2340                        378   
Male                  347                10                        690   

Cancer Type  Carcinoma of Uterine Cervix  Colorectal Carcinoma  \
Sex                                                              
Female                               787                 10721   
Male                                   0                 17675   

Cancer Type  Esophageal Carcinoma  Extrahepatic Cholangiocarcinoma  \
Sex                                                                  
Female                       1173                             1313   
Male                         5125                             1570   

Cancer Type  Gallbladder Carcinoma  Gastric Cancer  \
Sex                                                  
Female                        1362            300

In [38]:
# Normalize to get probabilities
joint_distribution = contingency_table / contingency_table.sum().sum()
print(joint_distribution)

Cancer Type  Bone Sarcoma  Breast Carcinoma  Cancer of Unknown Primary  \
Sex                                                                      
Female           0.002398          0.022095                   0.003569   
Male             0.003276          0.000094                   0.006515   

Cancer Type  Carcinoma of Uterine Cervix  Colorectal Carcinoma  \
Sex                                                              
Female                          0.007431              0.101231   
Male                            0.000000              0.166893   

Cancer Type  Esophageal Carcinoma  Extrahepatic Cholangiocarcinoma  \
Sex                                                                  
Female                   0.011076                         0.012398   
Male                     0.048392                         0.014824   

Cancer Type  Gallbladder Carcinoma  Gastric Cancer  \
Sex                                                  
Female                    0.012860        0.02832

In [39]:
data = data_for_model

In [40]:
data["Cancer Type"].value_counts()

Cancer Type
Colorectal Carcinoma                     28396
Non Small Cell Lung Cancer               19526
Gastric Cancer                           10832
Liver Hepatocellular Carcinoma            9633
Esophageal Carcinoma                      6298
Intrahepatic Cholangiocarcinoma           3939
Small Cell Lung Cancer                    3025
Pancreatic Cancer                         2957
Extrahepatic Cholangiocarcinoma           2883
Breast Carcinoma                          2350
Soft Tissue Sarcoma                       2270
Gallbladder Carcinoma                     2192
Ovarian Carcinoma                         1851
Urothelial Carcinoma                      1631
Kidney Renal Cell Carcinoma               1500
Uterine Corpus Endometrial Carcinoma      1358
Cancer of Unknown Primary                 1068
Head and Neck Carcinoma                    997
Carcinoma of Uterine Cervix                787
Small Bowel Carcinoma                      755
Bone Sarcoma                               601
G

In [41]:
# Example: Combine columns to create specific mutation identifiers
data['Mutation'] = data['Chromosome'] + "_" + data['Start_Position'].astype(str) + "_" + data['Variant_Type']

# Target variable (e.g., specific cancer type)
data['Is_Lung_Cancer'] = (data['Cancer Type'] == 'Non Small Cell Lung Cancer').astype(int)


In [42]:
data.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,splice_region_variant,start_lost,start_retained_variant,stop_gained,stop_lost,stop_retained_variant,synonymous_variant,upstream_gene_variant,Mutation,Is_Lung_Cancer
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KMT2C,...,0,0,0,0,0,0,0,0,7_151836340.0_SNP,0
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,KRAS,...,0,0,0,0,0,0,0,0,12_25398285.0_SNP,0
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,SOX9,...,0,0,0,0,0,0,0,0,17_70119705.0_DEL,0
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,1,0,0,1,0,0,0,0,5_112128143.0_SNP,0
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,Unknown,0.333333333,APC,...,0,0,0,1,0,0,0,0,5_112175147.0_SNP,0


In [43]:
# Select a subset of columns to analyze (e.g., most relevant ones)
columns_to_combine = ['Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event', 'Exon_Number']

# Generate pairwise combinations of features
feature_combinations = list(combinations(columns_to_combine, 2))


In [44]:
feature_combinations

[('Sex', 'Smoke Status'),
 ('Sex', 'Chromosome'),
 ('Sex', 'Hugo_Symbol'),
 ('Sex', 'SNP_event'),
 ('Sex', 'Exon_Number'),
 ('Smoke Status', 'Chromosome'),
 ('Smoke Status', 'Hugo_Symbol'),
 ('Smoke Status', 'SNP_event'),
 ('Smoke Status', 'Exon_Number'),
 ('Chromosome', 'Hugo_Symbol'),
 ('Chromosome', 'SNP_event'),
 ('Chromosome', 'Exon_Number'),
 ('Hugo_Symbol', 'SNP_event'),
 ('Hugo_Symbol', 'Exon_Number'),
 ('SNP_event', 'Exon_Number')]

In [45]:
lifts = []

P_B = data['Is_Lung_Cancer'].mean()
print("P_B:", P_B)

for feature_pair in feature_combinations:
    # Create a combined feature
    combined_feature = data[feature_pair[0]].astype(str) + "_" + data[feature_pair[1]].astype(str)
    # Filter out combinations with very low counts
    min_count = 3  # Adjust as needed
    P_A_counts = combined_feature.value_counts()

    # Filter combined features based on minimum count
    valid_features = P_A_counts[P_A_counts >= min_count].index
    filtered_data = combined_feature[combined_feature.isin(valid_features)]

    # Recalculate probabilities with filtered data
    P_A = filtered_data.value_counts(normalize=True)
    P_A_and_B = filtered_data[data['Is_Lung_Cancer'] == 1].value_counts(normalize=True)
    lift = (P_A_and_B / (P_A * P_B)).round(2)
    lifts.append((feature_pair, lift))

P_B: 0.18437104602194399


In [46]:
# Flatten the results for easy visualization
lift_results = []

for feature_pair, lift in lifts:
    for feature_value, lift_value in lift.items():
        lift_results.append({
            'Feature Pair': feature_pair,
            'Feature Value': feature_value,
            'Lift': lift_value
        })

lift_df = pd.DataFrame(lift_results)
lift_df = lift_df.sort_values(by='Lift', ascending=False)


In [47]:
lift_df["Lift"].value_counts()

Lift
9.83    636
4.90    514
4.20    434
9.81    426
7.37    424
       ... 
5.39      1
4.92      1
4.87      1
0.23      1
0.28      1
Name: count, Length: 1636, dtype: int64

In [48]:
lift_df.head(100)

Unnamed: 0,Feature Pair,Feature Value,Lift
6988,"(Smoke Status, Exon_Number)",Smoker_77/91,31.02
6989,"(Smoke Status, Exon_Number)",Smoker_78/91,31.02
6990,"(Smoke Status, Exon_Number)",Smoker_79/91,31.02
6991,"(Smoke Status, Exon_Number)",Smoker_81/91,31.02
6993,"(Smoke Status, Exon_Number)",Smoker_84/91,31.02
...,...,...,...
6808,"(Smoke Status, Exon_Number)",Smoker_20/28,31.02
6809,"(Smoke Status, Exon_Number)",Smoker_20/29,31.02
6810,"(Smoke Status, Exon_Number)",Smoker_20/30,31.02
6781,"(Smoke Status, Exon_Number)",Smoker_18/24,31.02


In [49]:
feature_combinations_3 = list(combinations(columns_to_combine, 3))
feature_combinations_3

[('Sex', 'Smoke Status', 'Chromosome'),
 ('Sex', 'Smoke Status', 'Hugo_Symbol'),
 ('Sex', 'Smoke Status', 'SNP_event'),
 ('Sex', 'Smoke Status', 'Exon_Number'),
 ('Sex', 'Chromosome', 'Hugo_Symbol'),
 ('Sex', 'Chromosome', 'SNP_event'),
 ('Sex', 'Chromosome', 'Exon_Number'),
 ('Sex', 'Hugo_Symbol', 'SNP_event'),
 ('Sex', 'Hugo_Symbol', 'Exon_Number'),
 ('Sex', 'SNP_event', 'Exon_Number'),
 ('Smoke Status', 'Chromosome', 'Hugo_Symbol'),
 ('Smoke Status', 'Chromosome', 'SNP_event'),
 ('Smoke Status', 'Chromosome', 'Exon_Number'),
 ('Smoke Status', 'Hugo_Symbol', 'SNP_event'),
 ('Smoke Status', 'Hugo_Symbol', 'Exon_Number'),
 ('Smoke Status', 'SNP_event', 'Exon_Number'),
 ('Chromosome', 'Hugo_Symbol', 'SNP_event'),
 ('Chromosome', 'Hugo_Symbol', 'Exon_Number'),
 ('Chromosome', 'SNP_event', 'Exon_Number'),
 ('Hugo_Symbol', 'SNP_event', 'Exon_Number')]

In [50]:
lifts = []
# Probability of the cancer type
P_B = data['Is_Lung_Cancer'].mean()

for feature_triplet in feature_combinations_3:
    # Create a combined feature from three columns
    combined_feature = (
        data[feature_triplet[0]].astype(str) + "_" +
        data[feature_triplet[1]].astype(str) + "_" +
        data[feature_triplet[2]].astype(str)
    )
    min_count = 50
    P_A_counts = combined_feature.value_counts()

    # Filter combined features based on minimum count
    valid_features = P_A_counts[P_A_counts >= min_count].index
    filtered_data = combined_feature[combined_feature.isin(valid_features)]

    # Probability of the combined feature
    P_A = filtered_data.value_counts(normalize=True)

    # Joint probability of the combined feature and cancer type
    joint = filtered_data[data['Is_Lung_Cancer'] == 1].value_counts(normalize=True)

    # Calculate lift
    lift = (joint / (P_A * P_B)).round(2)  # Round lift to 2 decimal places for readability

    # Append results as a tuple of the feature triplet and their associated lift values
    lifts.append((feature_triplet, lift))

# Example output of top lift values
# sorted_lifts = sorted(lifts, key=lambda x: x[1].max() if not x[1].empty else 0, reverse=True)

# for triplet, lift_values in sorted_lifts[:10]:  # Top 10 triplets
#     print(f"Feature Triplet: {triplet}")
#     print(f"Lift Values:\n{lift_values}\n")


In [51]:
# Flatten the results for easy visualization
lift_results = []

for feature_triplet, lift in lifts:
    for feature_value, lift_value in lift.items():
        lift_results.append({
            'Feature Triplet': feature_triplet,
            'Feature Value': feature_value,
            'Lift': lift_value
        })

lift_df = pd.DataFrame(lift_results)
lift_df = lift_df.sort_values(by='Lift', ascending=False)


In [57]:
lift_df.sort_values(by='Lift', ascending=True)

Unnamed: 0,Feature Triplet,Feature Value,Lift
4137,"(Smoke Status, Hugo_Symbol, SNP_event)",Unknown_APC_C>T,0.11
680,"(Sex, Smoke Status, SNP_event)",Female_Unknown_A>-,0.13
3729,"(Smoke Status, Chromosome, SNP_event)",Unknown_2_A>-,0.14
748,"(Sex, Smoke Status, SNP_event)",Male_Unknown_T>-,0.17
4658,"(Smoke Status, SNP_event, Exon_Number)",Unknown_A>-_<NA>,0.18
...,...,...,...
5485,"(Chromosome, SNP_event, Exon_Number)",17_C>-_09/10,
5572,"(Chromosome, SNP_event, Exon_Number)",4_T>-_<NA>,
5588,"(Chromosome, SNP_event, Exon_Number)",7_A>-_<NA>,
5674,"(Hugo_Symbol, SNP_event, Exon_Number)",RNF43_C>-_09/10,


In [53]:
filter_triple_data = data[data["Smoke Status"] == "Nonsmoker"]
filter_triple_data = filter_triple_data[filter_triple_data["Hugo_Symbol"] == "TP53"]
filter_triple_data = filter_triple_data[filter_triple_data["SNP_event"] == "G>A"]

In [54]:
filter_triple_data

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,splice_region_variant,start_lost,start_retained_variant,stop_gained,stop_lost,stop_retained_variant,synonymous_variant,upstream_gene_variant,Mutation,Is_Lung_Cancer
2475,Patient0210,Non Small Cell Lung Cancer,Lung Adenocarcinoma,IV,Primary,Female,61-70,Nonsmoker,0.2,TP53,...,0,0,0,0,0,0,0,0,17_7578389.0_SNP,1
2854,Patient0258,Non Small Cell Lung Cancer,Lung Adenocarcinoma,I,Primary,Male,51-60,Nonsmoker,0.1,TP53,...,0,0,0,0,0,0,0,0,17_7577539.0_SNP,1
4546,Patient0461,Non Small Cell Lung Cancer,Lung Adenocarcinoma,II,Primary,Female,41-50,Nonsmoker,0.233333333,TP53,...,0,0,0,0,0,0,0,0,17_7574018.0_SNP,1
6611,Patient0645,Non Small Cell Lung Cancer,Lung Adenocarcinoma,I,Primary,Female,71-80,Nonsmoker,0.4,TP53,...,0,0,0,0,0,0,0,0,17_7578517.0_SNP,1
6997,Patient0681,Non Small Cell Lung Cancer,Lung Adenocarcinoma,IV,Primary,Female,51-60,Nonsmoker,0.2,TP53,...,0,0,0,1,0,0,0,0,17_7578524.0_SNP,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88745,Patient8267,Small Cell Lung Cancer,Small Cell Lung Cancer,IV,Primary,Female,41-50,Nonsmoker,1.066666667,TP53,...,0,0,0,0,0,0,0,0,17_7578479.0_SNP,0
89303,Patient8330,Non Small Cell Lung Cancer,Lung Adenocarcinoma,III,Primary,Female,61-70,Nonsmoker,0.2,TP53,...,1,0,0,0,0,0,0,0,17_7576926.0_SNP,1
90975,Patient8492,Small Cell Lung Cancer,Small Cell Lung Cancer,III,Primary,Female,51-60,Nonsmoker,0.266666667,TP53,...,0,0,0,1,0,0,0,0,17_7578275.0_SNP,0
93404,Patient8807,Head and Neck Carcinoma,Head and Neck Squamous Cell Carcinoma,I,Primary,Male,41-50,Nonsmoker,0.1,TP53,...,0,0,0,0,0,0,0,0,17_7577121.0_SNP,0
