In [314]:
import pandas as pd
from itertools import combinations
from tqdm import tqdm
import numpy as np

In [264]:
def data_prep_lift_fix(data_for_lift):
    columns_to_combine = ['Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event',
                          "Consequence", 'Exon_Number', "Diagnosis Age", "TMB (nonsynonymous)",
                          "Position", "Protein_position", "Codons", "VAR_TYPE_SX"]

    # Calculate cancer probabilities using the correct dataframe
    cancer_probabilities = {
        cancer_type: data_for_lift[cancer_type].mean()
        for cancer_type in data_for_lift.select_dtypes(include=['bool', 'int']).columns
    }

    # Create feature combinations
    feature_combinations = list(combinations(columns_to_combine, 5))

    return cancer_probabilities, feature_combinations

In [199]:
def data_prep_lift(df):
    # Select a subset of columns to analyze (e.g., most relevant ones)
    columns_to_combine = ['Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event', "Consequence", 'Exon_Number', "Diagnosis Age", "TMB (nonsynonymous)", "Position", "Protein_position", "Codons", "VAR_TYPE_SX"]

    cancer_probabilities = {cancer_type: data_for_lift[cancer_type].mean() for cancer_type in list(df["Cancer Type"].unique())}
    feature_combinations = list(combinations(columns_to_combine, 5))

    return cancer_probabilities, feature_combinations

In [296]:
P_B = cancer_probabilities['Gallbladder Carcinoma']
data_for_lift = pd.read_csv("./pan_cancer/data_for_lift.csv", index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: './pan_cancer/data_for_lift.csv'

In [301]:
lifts = []
feature = ('Smoke Status', 'Consequence', 'Diagnosis Age', 'TMB (nonsynonymous)', 'VAR_TYPE_SX')
cancer_type = 'Gallbladder Carcinoma'

# Combine the selected features into a single feature
combined_feature = data_for_lift[list(feature)].astype(str).agg('_'.join, axis=1)
combined_feature.reset_index().drop_duplicates().set_index("PATIENT_ID", inplace=True)

# Compute value counts for the combined feature
combined_counts = combined_feature.value_counts()
valid_features = combined_counts[combined_counts >= 50].index

# Filter the combined feature to include only valid entries
filtered_data = combined_feature[combined_feature.isin(valid_features)]

# Reset index to align with the original DataFrame for filtering the cancer type
filtered_data = filtered_data.reset_index(drop=True)
cancer_data = data_for_lift[cancer_type].reset_index(drop=True)  # Make sure cancer_data has the same index

# Compute joint probabilities for cancer type
joint_prob = (
    filtered_data[cancer_data == 1]
    .value_counts(normalize=True)
    .reindex(filtered_data.value_counts(normalize=True).index, fill_value=0)
)

# Calculate lift
P_A = filtered_data.value_counts(normalize=True)
lift = (joint_prob / (P_A * P_B)).round(2)

In [443]:
lifts = []
feature = ('Smoke Status', 'Consequence', 'Diagnosis Age', 'TMB (nonsynonymous)', 'VAR_TYPE_SX')
cancer_type = 'Gallbladder Carcinoma'

# Count the number of unique PATIENT_IDs per feature combination
feature_counts = (
    data_for_lift.reset_index()
    .groupby(list(feature))['PATIENT_ID']
    .nunique()
    .reset_index(name='patient_count')
)

# Create the feature combination column
feature_counts["feature_combination"] = feature_counts[list(feature)].fillna('missing').astype(str).agg('_'.join, axis=1)

feature_counts = feature_counts[feature_counts["patient_count"] >= 50]
# Filter valid feature combinations with at least 50 unique patients
valid_features = set(feature_counts["feature_combination"].astype(str).unique())

# Combine features in the original dataset
combined_feature = data_for_lift[list(feature)].fillna('missing').astype(str).agg('_'.join, axis=1)

# Apply mask for valid features
valid_mask = combined_feature.astype(str).isin(valid_features)
filtered_data = combined_feature[valid_mask]
cancer_data = data_for_lift.loc[valid_mask, cancer_type]

# Compute probabilities
P_A = filtered_data.value_counts(normalize=True)

# Calculate joint probability more efficiently
joint_counts = pd.Series(0, index=P_A.index)
positive_counts = filtered_data[cancer_data == 1].value_counts()
joint_counts.update(positive_counts)
joint_prob = joint_counts / len(filtered_data)

# Calculate lift
lift = (joint_prob / (P_A * P_B)).round(2)


In [448]:
lift#[lift.index == "Unknown_missense_variant_51-60_3.233333333_Substitution/Indel"]
# Create a valid_mask ensuring both sets match correctly
# combined_feature.name ==


Unknown_missense_variant_61-70_0.266666667_Substitution/Indel    2.33
Unknown_missense_variant_51-60_0.3_Substitution/Indel            3.46
Unknown_missense_variant_51-60_0.233333333_Substitution/Indel    1.56
Unknown_missense_variant_61-70_0.233333333_Substitution/Indel    2.49
Unknown_missense_variant_51-60_0.2_Substitution/Indel            1.18
                                                                 ... 
Unknown_stop_gained_41-50_0.166666667_Truncation                 0.65
Unknown_stop_gained_51-60_0.1_Truncation                         0.68
Unknown_frameshift_variant_51-60_0.1_Truncation                  3.41
Unknown_missense_variant_51-60_0.033333333_Substitution/Indel    0.74
Unknown_missense_variant_41-50_0.033333333_Substitution/Indel    1.68
Length: 82, dtype: float64

In [None]:
if __name__ == "__main__":
    data_for_lift = pd.read_csv("./pan_cancer/data_for_lift.csv", index_col=0)
    cancer_prob, features_comb = data_prep_lift(data_for_lift)
    lifts_df = calculate_lift(data_for_lift, cancer_prob, features_comb)
    lifts_df.to_csv("lifts.csv", index=False)

In [310]:
combined_feature[combined_feature.index == "Unknown_missense_variant_51-60_3.233333333_Substitution/Indel"]
# combined_counts
# combined_feature[combined_feature.index == "Female_Unknown_71-80_6.166666667_Substitution/Indel"]
# combined_feature[combined_feature.index == "Patient8178"]
# joint_prob[joint_prob.index == "Female_Unknown_71-80_6.166666667_Substitution/Indel"]
# P_A[P_A.index == "Female_Unknown_71-80_6.166666667_Substitution/Indel"]
# joint_prob / (P_A * P_B)

Series([], dtype: object)

In [292]:
cancer_probabilities, features_combinations = data_prep_lift_fix(data_for_lift)

In [270]:
lift_df = pd.read_csv("lifts.csv", index_col=0)
data_for_lift = pd.read_csv("data_for_lift.csv", index_col=0)

In [370]:
# try_df = data_for_lift[data_for_lift["Cancer Type"] == "Colorectal Carcinoma"]
try_df = data_for_lift[data_for_lift['Smoke Status'] == "Nonsmoker"]
try_df = try_df[try_df['Diagnosis Age'] == "31-40"]
try_df = try_df[try_df['Consequence'] == "frameshift_variant"]
try_df = try_df[try_df['TMB (nonsynonymous)'] == 0.266666667]
try_df = try_df[try_df['VAR_TYPE_SX'] == "Truncation"]

# set(try_df.index)
try_df

Unnamed: 0_level_0,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,Chromosome,...,Gallbladder Carcinoma,Gastric Cancer,Intrahepatic Cholangiocarcinoma,Liver Hepatocellular Carcinoma,Non Small Cell Lung Cancer,Pancreatic Cancer,Small Cell Lung Cancer,Soft Tissue Sarcoma,Position,combined_feature
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Patient2805,Non Small Cell Lung Cancer,Lung Adenocarcinoma,IV,Metastasis,Male,31-40,Nonsmoker,0.266667,PTEN,10,...,0,0,0,0,1,0,0,0,89693003.0-89693003.0,Nonsmoker_frameshift_variant_31-40_0.266666667...
Patient3738,Non Small Cell Lung Cancer,Lung Adenocarcinoma,I,Primary,Male,31-40,Nonsmoker,0.266667,RANBP2,2,...,0,0,0,0,1,0,0,0,109379868.0-109379883.0,Nonsmoker_frameshift_variant_31-40_0.266666667...
Patient4008,Non Small Cell Lung Cancer,Lung Adenocarcinoma,I,Primary,Female,31-40,Nonsmoker,0.266667,DICER1,14,...,0,0,0,0,1,0,0,0,95570210.0-95570219.0,Nonsmoker_frameshift_variant_31-40_0.266666667...


In [271]:
lift_df.sort_values(by="Lift Value", ascending=False).head(1000)

Unnamed: 0_level_0,Feature Combination,Lift Value,Feature
Cancer Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pancreatic Cancer,"('Sex', 'Smoke Status', 'Diagnosis Age', 'TMB ...",834.90,Female_Unknown_71-80_6.166666667_Substitution/...
Soft Tissue Sarcoma,"('Smoke Status', 'Consequence', 'Diagnosis Age...",734.73,Unknown_missense_variant_41-50_3.266666667_Sub...
Soft Tissue Sarcoma,"('Sex', 'Smoke Status', 'Diagnosis Age', 'TMB ...",604.14,Male_Unknown_31-40_4.366666667_Substitution/Indel
Extrahepatic Cholangiocarcinoma,"('Sex', 'Smoke Status', 'Diagnosis Age', 'TMB ...",560.69,Female_Unknown_51-60_3.5_Substitution/Indel
Intrahepatic Cholangiocarcinoma,"('Sex', 'Smoke Status', 'Diagnosis Age', 'TMB ...",468.76,Male_Unknown_31-40_4.266666667_Substitution/Indel
...,...,...,...
Breast Carcinoma,"('Smoke Status', 'Consequence', 'Diagnosis Age...",122.45,Unknown_missense_variant_51-60_0.066666667_Sub...
Gallbladder Carcinoma,"('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Sy...",122.45,Male_Unknown_19_BRD4_Substitution/Indel
Gallbladder Carcinoma,"('Sex', 'Smoke Status', 'SNP_event', 'TMB (non...",122.43,Female_Unknown_G>A_0.333333333_Substitution/Indel
Gallbladder Carcinoma,"('Exon_Number', 'Diagnosis Age', 'TMB (nonsyno...",122.42,nan_61-70_0.533333333_nan_Substitution/Indel


In [275]:
lifts_par = pd.read_parquet('lifts.parquet', engine='pyarrow')

In [278]:
lifts_par.sort_values(by="Lift Value", ascending=False).head(1000)


Unnamed: 0,Cancer Type,Feature Combination,Feature,Lift Value
317280,Gallbladder Carcinoma,"('Smoke Status', 'Consequence', 'Diagnosis Age...",Unknown_missense_variant_51-60_3.233333333_Sub...,42.91
281107,Gallbladder Carcinoma,"('Sex', 'Smoke Status', 'Consequence', 'TMB (n...",Male_Unknown_missense_variant_3.233333333_Subs...,42.91
299290,Gallbladder Carcinoma,"('Sex', 'Consequence', 'Diagnosis Age', 'TMB (...",Male_missense_variant_51-60_3.233333333_Substi...,42.91
280166,Gallbladder Carcinoma,"('Sex', 'Smoke Status', 'Consequence', 'Diagno...",Male_Unknown_missense_variant_51-60_3.233333333,42.91
281142,Gallbladder Carcinoma,"('Sex', 'Smoke Status', 'Consequence', 'TMB (n...",Female_Unknown_missense_variant_1.4_Substituti...,42.91
...,...,...,...,...
12926,Breast Carcinoma,"('Sex', 'Smoke Status', 'Consequence', 'Diagno...",Female_Unknown_missense_variant_31-40_0.1,17.22
37683,Breast Carcinoma,"('Smoke Status', 'Chromosome', 'SNP_event', 'E...",Unknown_3_A>G_21/21_cAt/cGt,17.22
39029,Breast Carcinoma,"('Smoke Status', 'Chromosome', 'Consequence', ...",Unknown_3_missense_variant_21/21_cAt/cGt,17.22
45430,Breast Carcinoma,"('Smoke Status', 'SNP_event', 'Consequence', '...",Unknown_A>G_missense_variant_21/21_cAt/cGt,17.22


In [280]:
# try_df = data_for_lift[data_for_lift["Cancer Type"] == "Colorectal Carcinoma"]
try_df = data_for_lift[data_for_lift['Smoke Status'] == "Unknown"]
try_df = try_df[try_df['Diagnosis Age'] == "51-60"]
try_df = try_df[try_df['Consequence'] == "missense_variant"]
try_df = try_df[try_df['TMB (nonsynonymous)'] == 3.233333333]
try_df = try_df[try_df['VAR_TYPE_SX'] == "Substitution/Indel"]

set(try_df.index)
# try_df

{'Patient6054'}