In [2]:
import pandas as pd
import numpy as np
from itertools import combinations
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import warnings

warnings.filterwarnings('ignore')


In [14]:
def data_prep_lift(data_for_lift):
    columns_to_combine = ['Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event',
                          "Consequence", 'Exon_Number', "Diagnosis Age", "TMB (nonsynonymous)",
                          "Position", "Protein_position", "Codons", "VAR_TYPE_SX"]

    # Calculate cancer probabilities using the correct dataframe
    cancer_probabilities = {
        cancer_type: data_for_lift[cancer_type].mean()
        for cancer_type in data_for_lift.select_dtypes(include=['bool', 'int']).columns
    }

    # Create feature combinations
    feature_combinations = list(combinations(columns_to_combine, 5))

    return cancer_probabilities, feature_combinations

In [15]:
def process_combination(args):
    data_for_lift, cancer_type, P_B, feature = args

    # Count the number of unique PATIENT_IDs per feature combination
    feature_counts = (
        data_for_lift.reset_index()
        .groupby(list(feature))['PATIENT_ID']
        .nunique()
        .reset_index(name='patient_count')
    )

    # Create the feature combination column
    feature_counts["feature_combination"] = feature_counts[list(feature)].apply(tuple, axis=1)

    feature_counts = feature_counts[feature_counts["patient_count"] >= 50]
    # Filter valid feature combinations with at least 50 unique patients
    valid_features = set(feature_counts["feature_combination"].astype(str).unique())

    # Combine features in the original dataset
    combined_feature = data_for_lift[list(feature)].apply(tuple, axis=1)

    # Apply mask for valid features
    valid_mask = combined_feature.astype(str).isin(valid_features)
    filtered_data = combined_feature[valid_mask]
    cancer_data = data_for_lift.loc[valid_mask, cancer_type]

    # Compute probabilities
    P_A = filtered_data.value_counts(normalize=True)

    # Calculate joint probability more efficiently
    joint_counts = pd.Series(0, index=P_A.index)
    positive_counts = filtered_data[cancer_data == 1].value_counts()
    joint_counts.update(positive_counts)
    joint_prob = joint_counts / len(filtered_data)

    # Calculate lift
    lift = (joint_prob / (P_A * P_B)).round(2)

    # Return results for valid calculations
    return [(cancer_type, feature, idx, lift_val)
            for idx, lift_val in lift.items()
            if not np.isnan(lift_val) and not np.isinf(lift_val)]

In [16]:
def calculate_lift(data_for_lift, cancer_probabilities, feature_combinations):
    # Prepare arguments for parallel processing
    args_list = [
        (data_for_lift, cancer_type, P_B, feature)
        for cancer_type, P_B in cancer_probabilities.items()
        for feature in feature_combinations
    ]

    # Process combinations in parallel
    lift_data = []
    with ProcessPoolExecutor() as executor:
        for result in tqdm(executor.map(process_combination, args_list),
                           total=len(args_list),
                           desc="Processing combinations"):
            if result:
                lift_data.extend([{
                    "Cancer Type": cancer_type,
                    "Feature Combination": feature,
                    "Feature": tuple(idx),  # Ensure "Feature" is stored as a tuple
                    "Lift Value": lift_val
                } for cancer_type, feature, idx, lift_val in result])

    return pd.DataFrame(lift_data)

In [18]:
data_for_lift = pd.read_csv("data_for_lift.csv", index_col=0)
# cancer_prob, features_comb = data_prep_lift(data_for_lift)
# lifts_df = calculate_lift(data_for_lift, cancer_prob, features_comb)
# lifts_df.to_csv("lifts.csv", index=False)

In [19]:
data_for_lift.head()

oid sha256:1e95199cc1ac049f84662b4d46ef75562d54d2e02482136c45a23e9aef4f557f
size 23634849


In [301]:
lifts = []
feature = ('Smoke Status', 'Consequence', 'Diagnosis Age', 'TMB (nonsynonymous)', 'VAR_TYPE_SX')
cancer_type = 'Gallbladder Carcinoma'

# Combine the selected features into a single feature
combined_feature = data_for_lift[list(feature)].astype(str).agg('_'.join, axis=1)
combined_feature.reset_index().drop_duplicates().set_index("PATIENT_ID", inplace=True)

# Compute value counts for the combined feature
combined_counts = combined_feature.value_counts()
valid_features = combined_counts[combined_counts >= 50].index

# Filter the combined feature to include only valid entries
filtered_data = combined_feature[combined_feature.isin(valid_features)]

# Reset index to align with the original DataFrame for filtering the cancer type
filtered_data = filtered_data.reset_index(drop=True)
cancer_data = data_for_lift[cancer_type].reset_index(drop=True)  # Make sure cancer_data has the same index

# Compute joint probabilities for cancer type
joint_prob = (
    filtered_data[cancer_data == 1]
    .value_counts(normalize=True)
    .reindex(filtered_data.value_counts(normalize=True).index, fill_value=0)
)

# Calculate lift
P_A = filtered_data.value_counts(normalize=True)
lift = (joint_prob / (P_A * P_B)).round(2)

In [443]:
lifts = []
feature = ('Smoke Status', 'Consequence', 'Diagnosis Age', 'TMB (nonsynonymous)', 'VAR_TYPE_SX')
cancer_type = 'Gallbladder Carcinoma'

# Count the number of unique PATIENT_IDs per feature combination
feature_counts = (
    data_for_lift.reset_index()
    .groupby(list(feature))['PATIENT_ID']
    .nunique()
    .reset_index(name='patient_count')
)

# Create the feature combination column
feature_counts["feature_combination"] = feature_counts[list(feature)].fillna('missing').astype(str).agg('_'.join, axis=1)

feature_counts = feature_counts[feature_counts["patient_count"] >= 50]
# Filter valid feature combinations with at least 50 unique patients
valid_features = set(feature_counts["feature_combination"].astype(str).unique())

# Combine features in the original dataset
combined_feature = data_for_lift[list(feature)].fillna('missing').astype(str).agg('_'.join, axis=1)

# Apply mask for valid features
valid_mask = combined_feature.astype(str).isin(valid_features)
filtered_data = combined_feature[valid_mask]
cancer_data = data_for_lift.loc[valid_mask, cancer_type]

# Compute probabilities
P_A = filtered_data.value_counts(normalize=True)

# Calculate joint probability more efficiently
joint_counts = pd.Series(0, index=P_A.index)
positive_counts = filtered_data[cancer_data == 1].value_counts()
joint_counts.update(positive_counts)
joint_prob = joint_counts / len(filtered_data)

# Calculate lift
lift = (joint_prob / (P_A * P_B)).round(2)


In [448]:
lift#[lift.index == "Unknown_missense_variant_51-60_3.233333333_Substitution/Indel"]
# Create a valid_mask ensuring both sets match correctly
# combined_feature.name ==


Unknown_missense_variant_61-70_0.266666667_Substitution/Indel    2.33
Unknown_missense_variant_51-60_0.3_Substitution/Indel            3.46
Unknown_missense_variant_51-60_0.233333333_Substitution/Indel    1.56
Unknown_missense_variant_61-70_0.233333333_Substitution/Indel    2.49
Unknown_missense_variant_51-60_0.2_Substitution/Indel            1.18
                                                                 ... 
Unknown_stop_gained_41-50_0.166666667_Truncation                 0.65
Unknown_stop_gained_51-60_0.1_Truncation                         0.68
Unknown_frameshift_variant_51-60_0.1_Truncation                  3.41
Unknown_missense_variant_51-60_0.033333333_Substitution/Indel    0.74
Unknown_missense_variant_41-50_0.033333333_Substitution/Indel    1.68
Length: 82, dtype: float64

In [None]:
if __name__ == "__main__":
    data_for_lift = pd.read_csv("./pan_cancer/data_for_lift.csv", index_col=0)
    cancer_prob, features_comb = data_prep_lift(data_for_lift)
    lifts_df = calculate_lift(data_for_lift, cancer_prob, features_comb)
    # lifts_df.to_csv("lifts.csv", index=False)

In [310]:
combined_feature[combined_feature.index == "Unknown_missense_variant_51-60_3.233333333_Substitution/Indel"]
# combined_counts
# combined_feature[combined_feature.index == "Female_Unknown_71-80_6.166666667_Substitution/Indel"]
# combined_feature[combined_feature.index == "Patient8178"]
# joint_prob[joint_prob.index == "Female_Unknown_71-80_6.166666667_Substitution/Indel"]
# P_A[P_A.index == "Female_Unknown_71-80_6.166666667_Substitution/Indel"]
# joint_prob / (P_A * P_B)

Series([], dtype: object)

In [292]:
cancer_probabilities, features_combinations = data_prep_lift_fix(data_for_lift)

In [3]:
lift_df = pd.read_csv("lifts.csv", index_col=0)
data_for_lift = pd.read_csv("data_for_lift.csv", index_col=0)

In [4]:
lift_df.head()

Unnamed: 0_level_0,Feature Combination,Feature,Lift Value
Cancer Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Breast Carcinoma,"('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Sy...","('Male', 'Unknown', '17', 'TP53', 'C>T')",0.0
Breast Carcinoma,"('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Sy...","('Male', 'Unknown', '17', 'TP53', 'G>A')",0.0
Breast Carcinoma,"('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Sy...","('Male', 'Unknown', '17', 'TP53', 'C>A')",0.0
Breast Carcinoma,"('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Sy...","('Male', 'Unknown', '5', 'TERT', 'G>A')",0.0
Breast Carcinoma,"('Sex', 'Smoke Status', 'Chromosome', 'Hugo_Sy...","('Male', 'Unknown', '12', 'KRAS', 'C>T')",0.0


In [17]:
# try_df = data_for_lift[data_for_lift["Cancer Type"] == "Colorectal Carcinoma"]
try_df = data_for_lift[data_for_lift['Smoke Status'] == "Unknown"]
try_df = try_df[try_df['SNP_event'] == "C>G"]
try_df = try_df[try_df['Exon_Number'] == "02/05"]
try_df = try_df[try_df['Protein_position'] == 12.0]
try_df = try_df[try_df['Codons'] == "Ggt/Cgt"]

# set(try_df.index)
try_df

Unnamed: 0_level_0,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,Chromosome,...,Extrahepatic Cholangiocarcinoma,Gallbladder Carcinoma,Gastric Cancer,Intrahepatic Cholangiocarcinoma,Liver Hepatocellular Carcinoma,Non Small Cell Lung Cancer,Pancreatic Cancer,Small Cell Lung Cancer,Soft Tissue Sarcoma,Position
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Patient0257,Pancreatic Cancer,Pancreatic Adenocarcinoma,IV,Metastasis,Male,61-70,Unknown,0.133333,KRAS,12,...,0,0,0,0,0,0,1,0,0,25398285.0-25398285.0
Patient0295,Non Small Cell Lung Cancer,Large Cell Lung Carcinoma,IV,Metastasis,Male,51-60,Unknown,1.866667,KRAS,12,...,0,0,0,0,0,1,0,0,0,25398285.0-25398285.0
Patient0378,Intrahepatic Cholangiocarcinoma,Intrahepatic Cholangiocarcinoma,Unknown,Primary,Male,51-60,Unknown,0.233333,KRAS,12,...,0,0,0,1,0,0,0,0,0,25398285.0-25398285.0
Patient0557,Pancreatic Cancer,Pancreatic Adenocarcinoma,IV,Metastasis,Male,71-80,Unknown,0.166667,KRAS,12,...,0,0,0,0,0,0,1,0,0,25398285.0-25398285.0
Patient0582,Breast Carcinoma,Breast Invasive Carcinoma,IV,Metastasis,Female,41-50,Unknown,1.200000,KRAS,12,...,0,0,0,0,0,0,0,0,0,25398285.0-25398285.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Patient8524,Pancreatic Cancer,Pancreatic Adenocarcinoma,II,Primary,Male,51-60,Unknown,0.166667,KRAS,12,...,0,0,0,0,0,0,1,0,0,25398285.0-25398285.0
Patient8538,Pancreatic Cancer,Pancreatic Adenocarcinoma,I,Primary,Male,41-50,Unknown,0.166667,KRAS,12,...,0,0,0,0,0,0,1,0,0,25398285.0-25398285.0
Patient8546,Pancreatic Cancer,Pancreatic Adenocarcinoma,II,Primary,Female,61-70,Unknown,0.100000,KRAS,12,...,0,0,0,0,0,0,1,0,0,25398285.0-25398285.0
Patient8549,Pancreatic Cancer,Pancreatic Adenocarcinoma,III,Primary,Female,51-60,Unknown,0.100000,KRAS,12,...,0,0,0,0,0,0,1,0,0,25398285.0-25398285.0


In [5]:
lift_df.sort_values(by="Lift Value", ascending=False).head(1000)

Unnamed: 0_level_0,Feature Combination,Feature,Lift Value
Cancer Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pancreatic Cancer,"('Smoke Status', 'Chromosome', 'SNP_event', 'P...","('Unknown', '12', 'C>G', 12.0, 'Ggt/Cgt')",27.03
Pancreatic Cancer,"('Smoke Status', 'Hugo_Symbol', 'Consequence',...","('Unknown', 'KRAS', 'missense_variant', 12.0, ...",27.03
Pancreatic Cancer,"('Smoke Status', 'Hugo_Symbol', 'SNP_event', '...","('Unknown', 'KRAS', 'C>G', '25398285.0-2539828...",27.03
Pancreatic Cancer,"('Smoke Status', 'Hugo_Symbol', 'SNP_event', '...","('Unknown', 'KRAS', 'C>G', 'Ggt/Cgt', 'Substit...",27.03
Pancreatic Cancer,"('Smoke Status', 'Chromosome', 'SNP_event', 'P...","('Unknown', '12', 'C>G', '25398285.0-25398285....",27.03
...,...,...,...
Pancreatic Cancer,"('Smoke Status', 'Consequence', 'Exon_Number',...","('Unknown', 'missense_variant', '02/05', '51-6...",16.64
Pancreatic Cancer,"('Chromosome', 'Hugo_Symbol', 'Exon_Number', '...","('12', 'KRAS', '02/05', 0.133333333, 12.0)",16.64
Pancreatic Cancer,"('Smoke Status', 'SNP_event', 'Diagnosis Age',...","('Unknown', 'C>A', '51-60', '25398284.0-253982...",16.64
Pancreatic Cancer,"('Hugo_Symbol', 'Consequence', 'Exon_Number', ...","('KRAS', 'missense_variant', '02/05', 0.133333...",16.64


In [6]:
df_combined = lift_df.groupby(["Cancer Type", "Lift Value"]).apply(
    lambda x: dict(zip(x["Feature Combination"], x["Feature"]))
).reset_index(name="Feature-Value Pairs")

In [9]:
df_combined.head()

Unnamed: 0,Cancer Type,Lift Value,Feature-Value Pairs
0,Breast Carcinoma,0.0,"{'('Sex', 'Smoke Status', 'Chromosome', 'Hugo_..."
1,Breast Carcinoma,0.01,"{'('Sex', 'Smoke Status', 'Chromosome', 'Conse..."
2,Breast Carcinoma,0.02,"{'('Sex', 'Smoke Status', 'Chromosome', 'Conse..."
3,Breast Carcinoma,0.03,"{'('Sex', 'Smoke Status', 'Chromosome', 'Conse..."
4,Breast Carcinoma,0.04,"{'('Sex', 'Smoke Status', 'Chromosome', 'Diagn..."


In [7]:
df = lift_df.copy()

In [8]:
# Convert feature sets from strings to actual sets for easier comparison
df["Feature Set"] = df["Feature Combination"].apply(eval)

# Group by Cancer Type, Lift, and Feature Values
grouped = df.groupby(["Cancer Type", "Lift Value", "Feature"])

# Function to merge feature sets that share identical feature values
def merge_identical_feature_values(group):
    merged_sets = []

    for feature_set in group["Feature Set"]:
        merged = False
        for i, existing_set in enumerate(merged_sets):
            if feature_set & existing_set:  # Overlapping features
                merged_sets[i] = existing_set | feature_set  # Merge sets
                merged = True
                break
        if not merged:
            merged_sets.append(feature_set)

    return pd.DataFrame({
        "Merged Features": [str(fs) for fs in merged_sets],
        "Feature": group["Feature"].iloc[0]  # Keep one unique value
    })

# Apply merging function to each group
merged_df = grouped.apply(merge_identical_feature_values).reset_index()

# Drop unnecessary index column
merged_df = merged_df.drop(columns=["level_3"])

ValueError: cannot insert Feature, already exists

In [29]:
grouped.sort_values(by="Lift Value", ascending=False).head(1000)


Unnamed: 0,Cancer Type,Lift Value,Merged Features,Merged Values
6529,Pancreatic Cancer,27.03,"[[Chromosome, Codons, Consequence, Exon_Number...",[[Unknown_02/05_12.0_Ggt/Cgt_Substitution/Inde...
6528,Pancreatic Cancer,26.69,"[[Chromosome, Codons, Consequence, Exon_Number...",[[02/05_25398285.0-25398285.0_12.0_Ggt/Cgt_Sub...
6527,Pancreatic Cancer,26.36,"[[Codons, Consequence, Protein_position, SNP_e...",[[C>G_missense_variant_12.0_Ggt/Cgt_Substituti...
999,Breast Carcinoma,26.22,"[[Chromosome, Codons, Consequence, Exon_Number...",[[Female_Unknown_1047.0_cAt/cGt_Substitution/I...
6526,Pancreatic Cancer,26.03,"[[Chromosome, Codons, Consequence, SNP_event, ...",[[12_C>G_missense_variant_Ggt/Cgt_Substitution...
...,...,...,...,...
710,Breast Carcinoma,7.86,"[[Chromosome, Codons, Diagnosis Age, SNP_event...",[[Female_Unknown_3_0.233333333_Substitution/In...
5960,Pancreatic Cancer,7.86,"[[Consequence, Diagnosis Age, Exon_Number, SNP...",[[C>T_missense_variant_02/05_41-50_Substitutio...
5959,Pancreatic Cancer,7.85,"[[Codons, Consequence, Diagnosis Age, Protein_...",[[C>A_missense_variant_61-70_12.0_Substitution...
4803,Liver Hepatocellular Carcinoma,7.85,"[[Chromosome, Consequence, Hugo_Symbol, Smoke ...",[[Unknown_16_AXIN1_stop_gained_Truncation]]


In [40]:
df = lift_df.copy()

In [12]:
def parse_features(row):
    feature_names = row["Feature Combination"].split("_")
    feature_values = row["Feature"].split("_")
    return dict(zip(feature_names, feature_values))

df["Feature Dict"] = df.apply(parse_features, axis=1)

In [13]:
# Function to find matching patient IDs
def find_matching_patients(row, data_for_lift):
    matching_ids = []

    for _, patient_row in data_for_lift.iterrows():
        match = all(patient_row.get(feature, None) == value for feature, value in row["Feature Dict"].items())
        if match:
            matching_ids.append(patient_row["PATIENT ID"])

    return matching_ids

# Apply matching function
df["Matching Patient IDs"] = df.apply(lambda row: find_matching_patients(row, data_for_lift), axis=1)

# Drop helper column
# df = df.drop(columns=["Feature Dict"])

KeyboardInterrupt: 

In [None]:
final_df