In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import warnings
import re
import ast

warnings.filterwarnings('ignore')


In [None]:
# Load the precomputed lift table and the patient-level table it refers to.
# Both CSVs use their first column as the index.
lift_df = pd.read_csv("lifts.csv", index_col=0)
data_for_lift = pd.read_csv("data_for_lift.csv", index_col=0)

In [None]:
# Display the lift table as loaded.
lift_df

In [None]:
# Disabled scratch filters on `data_for_lift`, kept for reference only.
# try_df = data_for_lift[data_for_lift["Cancer Type"] == "Colorectal Carcinoma"]
# try_df = data_for_lift[data_for_lift['Smoke Status'] == "Unknown"]
# try_df = try_df[try_df['SNP_event'] == "C>G"]
# try_df = try_df[try_df['Exon_Number'] == "02/05"]
# try_df = try_df[try_df['Protein_position'] == 12.0]
# try_df = try_df[try_df['Codons'] == "Ggt/Cgt"]
#
# set(try_df.index)
# try_df

In [None]:
# Show the 1000 rows with the highest lift value.
lift_df.sort_values(by="Lift Value", ascending=False).head(1000)

In [None]:
# Work on a copy so `lift_df` stays as loaded.
df = lift_df.copy()

In [None]:
# Move the index into a regular column.
# NOTE(review): in-place and not idempotent — running this cell twice resets the index a second time.
df.reset_index(inplace=True)

In [None]:
# Display only; the sorted frame is not assigned back to `df`.
df.sort_values(by="Lift Value", ascending=False)

In [None]:
# Improved function to safely parse tuple-like strings
def safe_eval(val):
    """Parse a tuple-like string such as ``"('Sex', 'Chromosome')"`` into a Python object.

    Three strategies are tried in order:

    1. ``ast.literal_eval`` on the text as given.
    2. ``ast.literal_eval`` after quoting bare words that precede ``,`` or ``)``
       and removing empty first/last items.
    3. A manual split of the text between the outermost parentheses, converting
       ``true``/``false`` and plain digit strings and keeping everything else
       as (unquoted) strings.

    Args:
        val: The value to parse. Strings are parsed; tuples and lists are
            returned as a tuple unchanged; other scalars are parsed from
            their ``str()`` form.

    Returns:
        The parsed literal (normally a tuple), or ``None`` when ``val`` is
        missing (``None``/NaN/``'None'``/``'nan'``) or nothing could be parsed.
    """
    # Already-parsed containers pass straight through; pd.isna() on them would
    # return an array and make the truth test below ambiguous.
    if isinstance(val, (tuple, list)):
        return tuple(val)
    if val is None:
        return None
    if not isinstance(val, str):
        if pd.isna(val):
            return None
        val = str(val)
    if val in ('None', 'nan'):
        return None

    # literal_eval may raise TypeError as well (e.g. unhashable dict keys).
    parse_errors = (ValueError, SyntaxError, TypeError)

    try:
        # Handle the case where it's already a proper string representation of a tuple
        return ast.literal_eval(val)
    except parse_errors:
        pass

    try:
        # Fix missing quotes around words and try again
        fixed_val = re.sub(r'(\b\w+\b)(?=\s*,|\s*\))', r"'\1'", val)
        fixed_val = re.sub(r'\(\s*,', '(', fixed_val)  # Fix empty first item
        fixed_val = re.sub(r',\s*\)', ')', fixed_val)  # Fix empty last item
        return ast.literal_eval(fixed_val)
    except parse_errors:
        pass

    # If still fails, try to extract content between parentheses
    match = re.search(r'\((.*)\)', val)
    if not match:
        return None

    processed_items = []
    for item in match.group(1).split(','):
        item = item.strip()
        if not item or item == 'None':
            continue
        lowered = item.lower()
        if lowered == 'true':
            processed_items.append(True)
        elif lowered == 'false':
            processed_items.append(False)
        elif item.replace('.', '', 1).isdigit():
            # str.isdigit() also accepts characters such as superscripts that
            # int()/float() reject, so fall back to the raw text in that case.
            try:
                processed_items.append(float(item) if '.' in item else int(item))
            except ValueError:
                processed_items.append(item.strip("'\""))
        else:
            processed_items.append(item.strip("'\""))

    return tuple(processed_items) if processed_items else None

# Apply parsing with tqdm progress bar
tqdm.pandas(desc="Parsing Features")


def _parse_feature_cell(x):
    """Parse one cell of the 'Feature' / 'Feature Combination' columns.

    Tuples are returned unchanged so that re-running this cell on
    already-parsed columns is a no-op; calling ``pd.isna`` on a tuple returns
    an array, whose truth value is ambiguous and would raise.
    """
    if isinstance(x, tuple):
        return x
    return safe_eval(str(x)) if not pd.isna(x) else None


df["Feature"] = df["Feature"].progress_apply(_parse_feature_cell)
df["Feature Combination"] = df["Feature Combination"].progress_apply(_parse_feature_cell)

In [None]:
# Collect every feature name that appears in any parsed 'Feature Combination' tuple
all_features = set()
for features in df["Feature Combination"].dropna():
    if isinstance(features, tuple):  # skip anything that did not parse into a tuple
        all_features.update(features)

# Build one wide record per lift row: feature name -> feature value
expanded_data = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
    names, values = row["Feature Combination"], row["Feature"]
    new_row = {"Cancer Type": row["Cancer Type"], "Lift Value": row["Lift Value"]}

    if isinstance(names, tuple) and isinstance(values, tuple):
        # zip() stops at the shorter tuple, so when the two lengths differ
        # only the leading, matching portion is used.
        new_row.update(zip(names, values))

    # Features absent from this row are filled with None (shown as NaN/None in the frame)
    for feature in all_features:
        new_row.setdefault(feature, None)

    # Every row is kept, including rows without any non-null feature value
    expanded_data.append(new_row)

# Convert to DataFrame
df_transformed = pd.DataFrame(expanded_data)

In [None]:
# d = df_transformed[df_transformed["Protein_position"] == 249]
# d[d["Cancer Type"] == "Intrahepatic Cholangiocarcinoma"]
# Remove exact duplicate rows, then keep only rows whose lift value exceeds 6.
df_transformed = (
    df_transformed
    .drop_duplicates()
    .loc[lambda frame: frame["Lift Value"] > 6]
)
df_transformed

In [None]:
# def find_matching_patients(df_transformed, data_for_lift):
#
#     # Ensure types match across dataframes (optional, depends on your data)
#     df_transformed = df_transformed.copy()
#     data_for_lift = data_for_lift.copy()
#
#     # Pre-index data_for_lift by Cancer Type to reduce row scans
#     cancer_type_groups = data_for_lift.groupby('Cancer Type')
#
#     # Result list to store matched PATIENT_IDs
#     matched_ids = []
#
#     # Iterate through df_transformed
#     for idx, row in df_transformed.iterrows():
#         cancer_type = row['Cancer Type']
#         if cancer_type not in cancer_type_groups.groups:
#             matched_ids.append("")
#             continue
#
#         subset = cancer_type_groups.get_group(cancer_type)
#
#         # Prepare filters: only columns with non-null values, excluding meta-columns
#         filter_columns = row.drop(labels=['Cancer Type', 'Lift Value']).dropna()
#
#         if filter_columns.empty:
#             # If no filters, return all patient IDs for this cancer type
#             matched = subset['PATIENT_ID'].dropna().unique()
#         else:
#             # Start with all True boolean mask
#             mask = pd.Series(True, index=subset.index)
#             for col, val in filter_columns.items():
#                 if col in subset.columns:
#                     mask &= subset[col] == val
#             matched = subset.loc[mask, 'PATIENT_ID'].dropna().unique()
#
#         # Append as comma-separated string
#         matched_ids.append(','.join(map(str, matched)))
#
#     # Add to DataFrame
#     df_transformed['Matched PATIENT_IDs'] = matched_ids
#     return df_transformed
#
#
# df_result = find_matching_patients(df_transformed, data_for_lift)

def find_matching_patients(df_transformed, data_for_lift):
    """Attach matching patient IDs and counts to every rule row.

    Each row of ``df_transformed`` is treated as a rule: its 'Cancer Type'
    plus every other non-null column that also exists in ``data_for_lift``
    forms an equality filter.

    Args:
        df_transformed: One rule per row. Must contain 'Cancer Type'. The
            'Lift Value' and 'PATIENT_ID' columns, when present, are treated
            as bookkeeping and never used as filters.
        data_for_lift: Patient-level table with 'Cancer Type' and 'PATIENT_ID'.

    Returns:
        A copy of ``df_transformed`` with three extra columns:
        'Matched PATIENT_IDs' (comma-joined unique IDs matching the rule within
        its cancer type), 'Matched_Count' (how many such IDs) and
        'Total_Matched_Feature_Count' (unique IDs matching the feature filters
        across all cancer types).
    """
    df_transformed = df_transformed.copy()
    # errors='ignore' below lets rule tables without a 'Lift Value' column work.
    # 'PATIENT_ID' in a rule table is not a feature (data_for_lift has a column
    # of that name, so it would otherwise be compared against real IDs).
    non_feature_cols = ['Cancer Type', 'Lift Value', 'PATIENT_ID']
    cancer_type_groups = data_for_lift.groupby('Cancer Type')
    known_cancer_types = cancer_type_groups.groups

    def _matching_ids(frame, filters):
        # Unique non-null PATIENT_IDs of the rows in `frame` satisfying every filter.
        # With no filters the mask stays all-True, i.e. every patient in `frame`.
        mask = pd.Series(True, index=frame.index)
        for col, val in filters.items():
            if col in frame.columns:
                mask &= frame[col] == val
        return frame.loc[mask, 'PATIENT_ID'].dropna().unique()

    matched_ids = []
    matched_counts = []
    total_matched_feature_counts = []

    for _, row in df_transformed.iterrows():
        cancer_type = row['Cancer Type']
        filter_columns = row.drop(labels=non_feature_cols, errors='ignore').dropna()

        # Find matched patient IDs for this cancer type
        if cancer_type in known_cancer_types:
            matched = _matching_ids(cancer_type_groups.get_group(cancer_type), filter_columns)
        else:
            matched = []
        matched_ids.append(','.join(map(str, matched)))
        matched_counts.append(len(matched))

        # Find total matched patient IDs with these features across all cancer types
        total_matched = _matching_ids(data_for_lift, filter_columns)
        total_matched_feature_counts.append(len(total_matched))

    df_transformed['Matched PATIENT_IDs'] = matched_ids
    df_transformed['Matched_Count'] = matched_counts
    df_transformed['Total_Matched_Feature_Count'] = total_matched_feature_counts
    return df_transformed
# df_result = find_matching_patients(df_transformed, data_for_lift)


In [None]:
# `df_result` is only ever assigned in commented-out lines earlier in the notebook,
# so build it here; otherwise this cell raises NameError on a fresh kernel.
df_result = find_matching_patients(df_transformed, data_for_lift)

# Calculate unique patient counts per cancer type
cancer_type_patient_counts = data_for_lift.groupby('Cancer Type')['PATIENT_ID'].nunique()
# Map to your result DataFrame
df_result['Total_Unique_Patients_Cancer_Type'] = df_result['Cancer Type'].map(cancer_type_patient_counts)
total_patients = data_for_lift['PATIENT_ID'].nunique()
df_result['Total_Patient_IDs'] = total_patients
# lift = P(A and B) / (P(A) * P(B)), with A = cancer type and B = feature combination
P_A = df_result['Total_Unique_Patients_Cancer_Type'] / total_patients
P_B = df_result['Total_Matched_Feature_Count'] / total_patients
P_A_B = df_result['Matched_Count'] / total_patients
df_result["lift_try"] = P_A_B / (P_A * P_B)
df_result

In [None]:
# Manual spot check: count unique patients matching one specific feature combination.
tp53_mask = (
    (data_for_lift["Smoke Status"] == "Unknown")
    & (data_for_lift["Chromosome"] == "17")
    & (data_for_lift["Hugo_Symbol"] == "TP53")
    & (data_for_lift["SNP_event"] == "C>A")
    & (data_for_lift["Sex"] == "Female")
)
# Further filters that were tried and are currently disabled:
# t = t[t["VAR_TYPE_SX"] == 'Substitution/Indel']
# t = t[t["Exon_Number"] == "21/21"]
# t = t[t["Codons"] == "cAt/cGt"]
# t = t[t["Consequence"] == "upstream_gene_variant"]
t = data_for_lift[tp53_mask]
len(list(t["PATIENT_ID"].unique()))

In [None]:
# Order rules by descending lift, then number each distinct matched-patient set
# in order of first appearance (1 = the set belonging to the highest-lift rule).
df_result = df_result.sort_values(by="lift_try", ascending=False)
unique_ids = df_result['Matched PATIENT_IDs'].drop_duplicates().reset_index(drop=True)
id_to_rank = {pid: rank for rank, pid in enumerate(unique_ids, start=1)}
df_result['patient_rank'] = df_result['Matched PATIENT_IDs'].map(id_to_rank)
df_result

In [None]:
def _join_unique_values(values):
    """Return the distinct non-null values of a column as a sorted, comma-joined string."""
    distinct = pd.unique(values.dropna())
    return ','.join(sorted(str(value) for value in distinct))


# One row per 'patient_rank'; every other column collapses to its distinct values
agg_df = df_result.groupby('patient_rank').agg(_join_unique_values).reset_index()

agg_df

In [None]:
# Support = number of patient IDs listed in 'Matched PATIENT_IDs'.
# Empty tokens are skipped so that a row with no matched patients (empty string)
# gets 0 instead of the 1 that len("".split(",")) would report.
agg_df["support"] = agg_df['Matched PATIENT_IDs'].map(
    lambda ids: sum(1 for pid in str(ids).split(",") if pid.strip())
)

In [None]:
agg_df

In [None]:
num_patient_cancer = len(list(data_for_lift[data_for_lift["Cancer Type"] == "Liver Hepatocellular Carcinoma"]["PATIENT_ID"].unique()))
total_patients = len(list(data_for_lift["PATIENT_ID"].unique()))
num_patient_cancer / total_patients



In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from typing import List, Tuple, Dict, Any

def combine_with_lift_tolerance(df: pd.DataFrame, lift_tolerance: float = 1.0) -> pd.DataFrame:
    """
    Combine compatible rows of the same cancer type whose lift values are close.

    Rows are grouped by 'Cancer Type', then by lift value (chained within
    ``lift_tolerance``), then by feature compatibility; each resulting group is
    merged into one row with an averaged lift value.

    Args:
        df: Frame containing 'Cancer Type' and 'Lift Value'. The first column
            is treated as an ID column (IDs are concatenated on merge) when
            its dtype is int64/float64/object, unless it is one of the two
            reserved columns.
        lift_tolerance: Maximum lift difference for two rows to be linked.

    Returns:
        A new frame with the same columns as ``df`` and one row per merged
        group. An empty input or result yields an empty frame that still has
        the input's columns.
    """

    # Create a copy to avoid modifying original data
    df = df.copy()
    reserved_cols = ['Cancer Type', 'Lift Value']

    if df.empty:
        return df

    # Identify the index-like first column to exclude from matching.
    # 'Cancer Type' / 'Lift Value' are never ID columns: treating 'Lift Value'
    # as an ID would concatenate lift values instead of averaging them.
    id_cols = []
    first_col = df.columns[0]
    if (first_col not in reserved_cols
            and df[first_col].dtype in ['int64', 'float64', 'object']):
        id_cols = [first_col]

    matching_cols = [col for col in df.columns if col not in id_cols + reserved_cols]
    id_col = id_cols[0] if id_cols else None

    combined_rows = []
    processed_indices = set()

    # Group by Cancer Type first
    for cancer_type, cancer_group in df.groupby('Cancer Type'):
        # Within each cancer type, find lift value groups with tolerance
        lift_groups = find_lift_value_groups_with_tolerance(cancer_group, lift_tolerance)

        for lift_group_indices in lift_groups:
            # Skip if any row in this lift group is already processed
            if any(idx in processed_indices for idx in lift_group_indices):
                continue

            lift_group = cancer_group.loc[lift_group_indices]

            # Find combinable row groups within this lift value group
            combinable_groups = find_combinable_groups_tolerance(lift_group, matching_cols)

            for row_group in combinable_groups:
                if any(idx in processed_indices for idx in row_group):
                    continue

                # Combine the rows in this group
                combined_rows.append(
                    combine_row_group_with_lift_avg(lift_group.loc[row_group], id_col)
                )
                processed_indices.update(row_group)

    # Convert back to DataFrame with the original column order; reindexing
    # unconditionally keeps the columns even when nothing was produced.
    return pd.DataFrame(combined_rows).reindex(columns=df.columns)

def find_lift_value_groups_with_tolerance(cancer_group: pd.DataFrame, lift_tolerance: float) -> List[List]:
    """
    Group rows by similar lift values within the tolerance.

    Two rows are linked when their 'Lift Value's differ by at most
    ``lift_tolerance``; groups are the connected components of those links, so
    a chain 1.0 - 1.8 - 2.6 forms one group with tolerance 1.0. Rows with a
    NaN lift value never link to anything and end up alone.

    Args:
        cancer_group: Frame with a numeric 'Lift Value' column.
        lift_tolerance: Maximum absolute difference for a direct link.

    Returns:
        A list of groups, each a list of index labels of ``cancer_group``.
        Groups are ordered by their first member's position and labels keep
        their original relative order. Inputs with 0 or 1 rows return a single
        group holding all labels.
    """

    rows = cancer_group.index.tolist()
    n = len(rows)

    if n <= 1:
        return [rows]

    lift_values = np.asarray(cancer_group['Lift Value'].values, dtype=float)

    # On a line, the components are exactly the runs of the sorted values whose
    # consecutive gaps are within tolerance, so one sorted pass replaces the
    # pairwise comparison. argsort places NaNs last; every comparison with NaN
    # is False, which starts a new component each time.
    order = np.argsort(lift_values, kind='stable')
    component = [0] * n
    current = 0
    for k in range(1, n):
        prev_pos, pos = order[k - 1], order[k]
        if not abs(lift_values[pos] - lift_values[prev_pos]) <= lift_tolerance:
            current += 1
        component[pos] = current

    # Collect labels per component in original row order
    groups = defaultdict(list)
    for i in range(n):
        groups[component[i]].append(rows[i])

    return list(groups.values())

def find_combinable_groups_tolerance(group: pd.DataFrame, matching_cols: List[str]) -> List[List]:
    """
    Partition the rows of ``group`` into sets that may be merged together.

    Every pair of rows is tested with ``can_combine_rows_tolerance``; compatible
    pairs are linked with a union-find structure. A linked cluster is kept only
    if ``verify_group_compatibility_tolerance`` accepts it as a whole; otherwise
    every row of that cluster is returned as its own single-row group.

    Returns a list of lists of index labels of ``group``.
    """

    labels = group.index.tolist()
    count = len(labels)

    if count <= 1:
        return [[label] for label in labels]

    # Union-find over row positions
    root_of = list(range(count))

    def find(node):
        if root_of[node] != node:
            root_of[node] = find(root_of[node])  # path compression
        return root_of[node]

    def union(first, second):
        root_first, root_second = find(first), find(second)
        if root_first != root_second:
            root_of[root_first] = root_second

    # Link every pair of rows that is pairwise compatible
    for i in range(count):
        for j in range(i + 1, count):
            pair = group.loc[[labels[i], labels[j]]]
            if can_combine_rows_tolerance(pair, matching_cols):
                union(i, j)

    # Collect labels per root, in original row order
    clusters = defaultdict(list)
    for position, label in enumerate(labels):
        clusters[find(position)].append(label)

    # Pairwise links do not guarantee that a whole cluster is consistent, so
    # each multi-row cluster is re-checked as a unit.
    final_groups = []
    for members in clusters.values():
        if len(members) == 1 or verify_group_compatibility_tolerance(group.loc[members], matching_cols):
            final_groups.append(members)
        else:
            # Not fully compatible: fall back to individual rows
            final_groups.extend([member] for member in members)

    return final_groups

def can_combine_rows_tolerance(rows: pd.DataFrame, matching_cols: List[str], min_matches: int = 3) -> bool:
    """
    Check if two or more rows can be combined based on matching criteria.

    A column "matches" when at least two rows have a non-null value in it and
    all of those values are equal. Columns with zero or one non-null value are
    neutral. Any column holding two different non-null values is a
    contradiction and makes the rows incompatible.

    Args:
        rows: The candidate rows.
        matching_cols: Columns to compare.
        min_matches: Minimum number of matching columns required (default 3).

    Returns:
        True when there is no contradiction and at least ``min_matches``
        columns match, False otherwise.
    """

    matching_count = 0

    for col in matching_cols:
        col_values = rows[col].dropna()

        if len(col_values) < 2:
            # Nothing to compare: no conflict, but no evidence either
            continue
        if len(set(col_values)) > 1:
            # Contradiction: different non-null values
            return False
        matching_count += 1

    return matching_count >= min_matches

def verify_group_compatibility_tolerance(group: pd.DataFrame, matching_cols: List[str], min_matches: int = 3) -> bool:
    """
    Verify that all rows in a group are mutually compatible.

    Applies the same rule as the pairwise check to the whole group at once:
    no column may hold two different non-null values, and at least
    ``min_matches`` columns must have two or more non-null values that all
    agree.

    Args:
        group: The rows to verify together.
        matching_cols: Columns to compare.
        min_matches: Minimum number of agreeing columns required (default 3).

    Returns:
        True when the group is contradiction-free and has enough agreeing
        columns, False otherwise.
    """

    matching_count = 0

    for col in matching_cols:
        col_values = group[col].dropna()

        if len(col_values) <= 1:
            # Zero or one non-null value: neither a conflict nor a match
            continue
        if len(set(col_values)) != 1:
            return False
        matching_count += 1

    return matching_count >= min_matches

def combine_row_group_with_lift_avg(group: pd.DataFrame, id_col: str = None) -> Dict[str, Any]:
    """
    Combine a group of compatible rows into a single row.

    Per column:
      * all values null          -> NaN
      * the ID column            -> comma-joined, de-duplicated, sorted IDs
      * 'Lift Value'             -> mean of the non-null values, rounded to 2 decimals
      * anything else            -> first non-null value (this includes 'Cancer Type')

    Args:
        group: Rows to merge.
        id_col: Name of the ID column, or None when there is none.

    Returns:
        A dict mapping each column of ``group`` to its combined value.
    """

    combined = {}

    for col in group.columns:
        non_null_values = group[col].dropna()

        if len(non_null_values) == 0:
            combined[col] = np.nan
        elif id_col is not None and col == id_col:
            ids = non_null_values.tolist()
            # Look at every value, not just the first: a group may mix single
            # IDs with already-combined "a,b" strings, and those must be split
            # before de-duplicating.
            if any(isinstance(value, str) and ',' in value for value in ids):
                all_ids = []
                for value in ids:
                    all_ids.extend(str(value).split(','))
                combined[col] = ','.join(sorted(set(all_ids)))
            else:
                # Plain IDs: sort in their native type (numeric IDs sort numerically)
                combined[col] = ','.join(map(str, sorted(set(ids))))
        elif col == 'Lift Value':
            # For Lift Value, take the average of all values
            combined[col] = round(non_null_values.mean(), 2)
        else:
            # For Cancer Type (same for all rows) and other columns, take the first non-null value
            combined[col] = non_null_values.iloc[0]

    return combined

# Example usage combining both functions:
def full_combine_pipeline(df: pd.DataFrame, lift_tolerance: float = 1.0) -> pd.DataFrame:
    """
    Run the tolerance-based combine on ``df`` and sort the result by lift.

    Only the lift-tolerance pass is executed here; the exact-match first pass
    is left as a commented-out placeholder below.

    Returns the combined rows ordered by descending 'Lift Value'.
    """

    # First pass (not run): combine exact matches with the original function
    # combined_df = combine_cancer_data_rows(df)  # Your original function

    # Second pass: combine with lift tolerance
    tolerance_combined = combine_with_lift_tolerance(df, lift_tolerance)
    return tolerance_combined.sort_values(by="Lift Value", ascending=False)


# NOTE(review): `combined_df` is not assigned anywhere above in this notebook, so this
# cell raises NameError on a fresh kernel — presumably it was the output of an earlier
# exact-match combine step; confirm and restore its definition.
df_after_first_combine = pd.DataFrame(combined_df)
final_result = combine_with_lift_tolerance(df_after_first_combine, lift_tolerance=1.0)
# Display only; `final_result` itself stays unsorted.
final_result.sort_values(by="Lift Value", ascending=False)

In [None]:
# NOTE(review): `merged_df` is not assigned in any cell above this one — TODO confirm
# which cell is expected to have produced it before this point.
merged_df.sort_values(by=["Lift Value", "Cancer Type"], ascending=[False, True])
# df_transformed.sort_values(by=["Lift Value", "Cancer Type"], ascending=[False, True])

In [None]:
def row_to_sentence(row):
    """Render a row as "<col> value is <val>" clauses joined by " AND ".

    Null values are skipped, as are the 'Cancer Type', 'Lift Value' and
    'hypo_factors' columns.
    """
    skipped = {"Cancer Type", "Lift Value", "hypo_factors"}
    clauses = []
    for col, val in row.items():
        if not (pd.notnull(val) and col not in skipped):
            continue
        clauses.append(f"{col} value is {val}")
    return " AND ".join(clauses)

In [None]:
# Build one sentence per row from its non-null feature columns.
merged_df['hypo_factors'] = merged_df.apply(row_to_sentence, axis=1)
# NOTE(review): in-place rename. After this, `merged_df` no longer has a 'Lift Value'
# column, so cells that sort or filter `merged_df` by 'Lift Value' only work before it,
# and re-running the line above would add "support value is ..." to every sentence
# because row_to_sentence only skips the name 'Lift Value'.
merged_df.rename(columns={'Lift Value': 'support'}, inplace=True)

In [None]:
merged_df

In [None]:
# Written with the DataFrame index as the first CSV column (default index=True).
merged_df.to_csv("models_hypotheses/LIFT_hypotheses_as_sentences.csv")

In [None]:
# Read the export back to check it; no index_col is given, so the saved index comes
# back as an ordinary column.
tal = pd.read_csv("models_hypotheses/LIFT_hypotheses_as_sentences.csv")
tal.sort_values(by="support", ascending=False)

In [None]:
# Manual spot check: rows of `data_for_lift` matching one fully specified KRAS rule.
kras_mask = (
    (data_for_lift["Smoke Status"] == "Unknown")
    & (data_for_lift["Chromosome"] == "12")
    & (data_for_lift["Hugo_Symbol"] == "KRAS")
    & (data_for_lift["VAR_TYPE_SX"] == 'Substitution/Indel')
    & (data_for_lift["Protein_position"] == 12)
    & (data_for_lift["Exon_Number"] == "02/05")
    & (data_for_lift["Codons"] == "Ggt/Cgt")
    & (data_for_lift["SNP_event"] == "C>G")
    & (data_for_lift["Consequence"] == "missense_variant")
    & (data_for_lift["Diagnosis Age"] == "51-60")
    & (data_for_lift["Position"] == "25398285.0-25398285.0")
)
t = data_for_lift[kras_mask]
t

In [None]:
# Persist the merged rule table (including whatever columns earlier cells added or renamed).
merged_df.to_csv("data_for_lift_merged.csv")

In [None]:
# Load a lift table from "lifts_merged.csv"; its first column becomes the index.
lift_merged = pd.read_csv("lifts_merged.csv", index_col=0)

In [None]:
lift_merged.sort_values(by="Lift Value", ascending=False)

In [None]:
# Spot check: merged rules for one cancer type at protein position 249.
tt = merged_df[merged_df["Cancer Type"] == "Intrahepatic Cholangiocarcinoma"]
# Disabled extra filters:
# tt = tt[tt["Hugo_Symbol"] == "KRAS"]
# tt = tt[tt["Smoke Status"] == "Unknown"]
# tt = tt[tt["Chromosome"] == "12"]
# tt = tt[tt["VAR_TYPE_SX"] == 'Substitution/Indel']
tt = tt[tt["Protein_position"] == 249]
tt

In [None]:
# Bare tuple literal: evaluating this cell only echoes the tuple; nothing is assigned.
('Female', 'Unknown', '12', 'KRAS', 'Substitution/Indel')

In [None]:
# Rows whose 'Feature' text contains "249" (substring match, so any value containing
# those digits qualifies), restricted to index label "Intrahepatic Cholangiocarcinoma".
l = lift_merged[lift_merged['Feature'].astype(str).str.contains('249', na=False)]
l = l[l.index == "Intrahepatic Cholangiocarcinoma"]
l

In [None]:
# NOTE(review): rebinds `df`, which earlier cells used for the parsed lift table.
df = lift_merged.copy()
df.reset_index(inplace=True)

In [None]:
# Exact float comparison — only matches rows whose stored value equals 27.03 exactly.
df[df["Lift Value"] == 27.03]

In [None]:
df_new = pd.read_csv("lifts_second_round.csv", index_col=0)

In [None]:
df_new

In [None]:
import pandas as pd
from itertools import combinations

columns = [
    'Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event',
    "Consequence", 'Exon_Number', "Diagnosis Age", "TMB (nonsynonymous)",
    "Position", "Protein_position", "Codons", "VAR_TYPE_SX"
]

COMBINATION_SIZE = 4   # number of feature columns per candidate rule
MIN_PATIENTS = 50      # minimum unique patients for a value combination to be kept

result_frames = []

for col_set in combinations(columns, COMBINATION_SIZE):
    # Group by the chosen columns and count unique PATIENT_IDs
    # (after this, the 'PATIENT_ID' column holds a count, not an ID)
    group = data_for_lift.groupby(list(col_set))['PATIENT_ID'].nunique().reset_index()
    # Filter for combinations with enough unique patients; copy so that the
    # column added below is written to an independent frame, not to a slice of `group`
    filtered = group[group['PATIENT_ID'] >= MIN_PATIENTS].copy()
    if not filtered.empty:
        filtered['Columns'] = ','.join(col_set)
        result_frames.append(filtered)

# Concatenate all results; pd.concat raises on an empty list, so fall back to
# an empty frame when no combination reaches the threshold
if result_frames:
    combinations_df = pd.concat(result_frames, ignore_index=True)
else:
    combinations_df = pd.DataFrame(columns=['PATIENT_ID', 'Columns'])
combinations_df.head()

In [None]:
# NOTE(review): with this rename disabled, `combinations_df` keeps a 'PATIENT_ID' column
# that holds the unique-patient *count* from the groupby above, not an ID — confirm that
# downstream matching does not use it as a filter against data_for_lift['PATIENT_ID'].
# combinations_df.rename(columns={'PATIENT_ID': 'patient_ids_count'}, inplace=True)
combinations_df

In [None]:
# Pair every feature combination with every cancer type (cross join).
# NOTE(review): reassigns `combinations_df`, so re-running this cell crosses the
# already-crossed frame again.
cancer_types = pd.DataFrame({'Cancer Type': data_for_lift['Cancer Type'].unique()})
combinations_df = combinations_df.merge(cancer_types, how='cross')

In [None]:
# NOTE(review): in top-to-bottom order this call runs before the tqdm version of
# find_matching_patients is defined further down, so it uses the earlier definition,
# which drops 'Lift Value' from every row; `combinations_df` as built above has no such
# column — confirm this cell is still needed (the same call is repeated after the redefinition).
df = find_matching_patients(combinations_df, data_for_lift)


In [None]:
from tqdm import tqdm

def find_matching_patients(df_transformed, data_for_lift):
    """Attach matching patient IDs and counts to every rule row (with a progress bar).

    Each row of ``df_transformed`` is treated as a rule: its 'Cancer Type'
    plus every other non-null column that also exists in ``data_for_lift``
    forms an equality filter.

    Args:
        df_transformed: One rule per row. Must contain 'Cancer Type'. The
            'Lift Value' and 'PATIENT_ID' columns, when present, are treated
            as bookkeeping and never used as filters.
        data_for_lift: Patient-level table with 'Cancer Type' and 'PATIENT_ID'.

    Returns:
        A copy of ``df_transformed`` with three extra columns:
        'Matched PATIENT_IDs' (comma-joined unique IDs matching the rule within
        its cancer type), 'Matched_Count' (how many such IDs) and
        'Total_Matched_Feature_Count' (unique IDs matching the feature filters
        across all cancer types).
    """
    df_transformed = df_transformed.copy()
    # A 'PATIENT_ID' column in a rule table is not a feature; data_for_lift has
    # a column of that name, so without excluding it the value would be compared
    # against real patient IDs. 'Lift Value' is excluded for the same reason as
    # in the non-progress-bar variant of this function.
    non_feature_cols = ['Cancer Type', 'Lift Value', 'PATIENT_ID']
    cancer_type_groups = data_for_lift.groupby('Cancer Type')
    known_cancer_types = cancer_type_groups.groups

    def _matching_ids(frame, filters):
        # Unique non-null PATIENT_IDs of the rows in `frame` satisfying every filter.
        # With no filters the mask stays all-True, i.e. every patient in `frame`.
        mask = pd.Series(True, index=frame.index)
        for col, val in filters.items():
            if col in frame.columns:
                mask &= frame[col] == val
        return frame.loc[mask, 'PATIENT_ID'].dropna().unique()

    matched_ids = []
    matched_counts = []
    total_matched_feature_counts = []

    for _, row in tqdm(df_transformed.iterrows(), total=len(df_transformed), desc="Matching patients"):
        cancer_type = row['Cancer Type']
        filter_columns = row.drop(labels=non_feature_cols, errors='ignore').dropna()

        # Find matched patient IDs for this cancer type
        if cancer_type in known_cancer_types:
            matched = _matching_ids(cancer_type_groups.get_group(cancer_type), filter_columns)
        else:
            matched = []
        matched_ids.append(','.join(map(str, matched)))
        matched_counts.append(len(matched))

        # Find total matched patient IDs with these features across all cancer types
        total_matched = _matching_ids(data_for_lift, filter_columns)
        total_matched_feature_counts.append(len(total_matched))

    df_transformed['Matched PATIENT_IDs'] = matched_ids
    df_transformed['Matched_Count'] = matched_counts
    df_transformed['Total_Matched_Feature_Count'] = total_matched_feature_counts
    return df_transformed

In [None]:
# Match every (feature combination, cancer type) row against the patient table.
# NOTE(review): rebinds `df`, which earlier cells used for other tables.
df = find_matching_patients(combinations_df, data_for_lift)

In [None]:
df

In [None]:
# Keep only rules matched by at least one patient. Copy the filtered frame so the
# column assignments below write to an independent frame rather than to a slice.
df = df[df["Matched_Count"] > 0].copy()
# Calculate unique patient counts per cancer type
cancer_type_patient_counts = data_for_lift.groupby('Cancer Type')['PATIENT_ID'].nunique()
df['Total_Unique_Patients_Cancer_Type'] = df['Cancer Type'].map(cancer_type_patient_counts)

total_patients = data_for_lift['PATIENT_ID'].nunique()

# lift = P(A and B) / (P(A) * P(B)), with A = cancer type and B = feature combination
P_A = df['Total_Unique_Patients_Cancer_Type'] / total_patients
P_B = df['Total_Matched_Feature_Count'] / total_patients
P_A_B = df['Matched_Count'] / total_patients

df["lift_try"] = P_A_B / (P_A * P_B)
df

In [None]:
# Order rules by descending lift, then number each distinct matched-patient set
# in order of first appearance (1 = the set belonging to the highest-lift rule).
df = df.sort_values(by="lift_try", ascending=False)
unique_ids = df['Matched PATIENT_IDs'].drop_duplicates().reset_index(drop=True)
id_to_rank = {pid: rank for rank, pid in enumerate(unique_ids, start=1)}
df['patient_rank'] = df['Matched PATIENT_IDs'].map(id_to_rank)
df

In [None]:
df[df["lift_try"] > 1]["lift_try"].describe()

In [None]:
dff = df.copy()
dff = dff[dff["lift_try"] > 4]
dff["Matched_Count"] = dff["Matched_Count"].astype(int)
dff = dff[dff["Matched_Count"] >= 15]
dff

In [None]:
def _join_unique_values(values):
    """Return the distinct non-null values of a column as a sorted, comma-joined string."""
    distinct = pd.unique(values.dropna())
    return ','.join(sorted(str(value) for value in distinct))


# One row per 'patient_rank'; every other column collapses to its distinct values
agg_df = dff.groupby('patient_rank').agg(_join_unique_values).reset_index()

def get_max_from_str(val):
    """Return the largest number in a comma-separated string as a float.

    Blank tokens are ignored. Returns None for a null input or when the
    string holds no non-blank token.
    """
    if pd.isna(val):
        return None
    tokens = str(val).split(',')
    return max((float(token) for token in tokens if token.strip()), default=None)

# Highest lift among the comma-joined 'lift_try' values of each aggregated row.
agg_df['max_lift'] = agg_df['lift_try'].apply(get_max_from_str)
# Empty strings (columns whose values were all null within a group) become None.
agg_df.replace({"": None}, inplace=True)
agg_df["Matched_Count"] = agg_df["Matched_Count"].astype(int)
# NOTE(review): this sets *every* column of the rows whose Smoke Status is "Unknown"
# to NaN (including patient_rank, Cancer Type and Matched_Count), not only the
# smoke-status value — confirm that blanking whole rows is intended.
agg_df.loc[agg_df["Smoke Status"] == "Unknown", :] = np.nan
agg_df

In [None]:
# import pandas as pd
# import numpy as np
# from tqdm import tqdm
#
# combine_cols = [
#     'Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event',
#     "Consequence", 'Exon_Number', "Diagnosis Age", "TMB (nonsynonymous)",
#     "Position", "Protein_position", "Codons", "VAR_TYPE_SX"
# ]
#
# def patient_id_overlap(row1, row2):
#     set1 = set(str(row1['Matched PATIENT_IDs']).split(','))
#     set2 = set(str(row2['Matched PATIENT_IDs']).split(','))
#     if not set1 or not set2:
#         return False
#     intersection = set1 & set2
#     union = set1 | set2
#     if len(union) == 0:
#         return False
#     return len(intersection) / len(union) >= 0.8
#
# def merge_rows(df):
#     result_rows = []
#     used = set()
#     df = df.sort_values('max_lift', ascending=False).reset_index(drop=True)
#
#     for cancer_type, group in tqdm(df.groupby('Cancer Type'), desc="Merging by Cancer Type"):
#         group = group.reset_index()  # preserve original index
#         n = len(group)
#
#         for i in range(n):
#             idx_i = group.loc[i, 'index']
#             if idx_i in used:
#                 continue
#
#             base_row = group.loc[i]
#             merge_candidates = [i]
#
#             for j in range(n):
#                 if i == j:
#                     continue
#
#                 idx_j = group.loc[j, 'index']
#                 if idx_j in used:
#                     continue
#
#                 comp_row = group.loc[j]
#
#                 if not patient_id_overlap(base_row, comp_row):
#                     continue
#                 matches = 0
#                 consistent = True
#                 for col in combine_cols:
#                     val1 = base_row[col]
#                     val2 = comp_row[col]
#                     if pd.notna(val1) and pd.notna(val2):
#                         if val1 != val2:
#                             consistent = False
#                             break
#                         matches += 1
#
#                 if consistent and matches >= 3 and patient_id_overlap(base_row, comp_row):
#                     merge_candidates.append(j)
#
#             merge_indices = [group.loc[k, 'index'] for k in merge_candidates]
#             if len(merge_candidates) > 1:
#                 subset = group.loc[merge_candidates]
#                 merged_row = base_row.copy()
#                 for col in subset.columns:
#                     vals = subset[col].dropna().astype(str).unique()
#                     if col in ["Columns", "Matched PATIENT_IDs"]:
#                         unique_vals = set()
#                         for v in vals:
#                             unique_vals.update(map(str.strip, v.split(',')))
#                         merged_row[col] = ','.join(sorted(unique_vals))
#                     else:
#                         merged_row[col] = ','.join(sorted(set(vals)))
#                 patient_ranks = subset['patient_rank'].dropna().astype(int)
#                 merged_row['patient_rank_agg'] = ','.join(sorted(map(str, set(patient_ranks))))
#                 merged_row['patient_rank_min'] = patient_ranks.min() if not patient_ranks.empty else None
#                 result_rows.append(merged_row.drop(labels='index'))
#                 used.update(merge_indices)
#             else:
#                 base_row = base_row.copy()
#                 if pd.notna(base_row['patient_rank']):
#                     base_row['patient_rank_agg'] = str(int(base_row['patient_rank']))
#                     base_row['patient_rank_min'] = int(base_row['patient_rank'])
#                 else:
#                     base_row['patient_rank_agg'] = ''
#                     base_row['patient_rank_max'] = None
#                 result_rows.append(base_row.drop(labels='index'))
#                 used.add(idx_i)
#
#     return pd.DataFrame(result_rows)
#
# merged_df = merge_rows(agg_df)

import pandas as pd
import numpy as np
from tqdm import tqdm

# Feature columns compared between two rows when deciding whether they can be merged.
# Same names as the `columns` list used to build the feature combinations.
combine_cols = [
    'Sex', 'Smoke Status', 'Chromosome', 'Hugo_Symbol', 'SNP_event',
    "Consequence", 'Exon_Number', "Diagnosis Age", "TMB (nonsynonymous)",
    "Position", "Protein_position", "Codons", "VAR_TYPE_SX"
]

def patient_id_overlap(set1, set2, threshold=0.8):
    """Return True when two patient-ID sets overlap strongly enough.

    Overlap is the Jaccard index ``|set1 & set2| / |set1 | set2|``.

    Args:
        set1: First set of patient IDs.
        set2: Second set of patient IDs.
        threshold: Minimum Jaccard index (inclusive) to count as overlapping.

    Returns:
        False if either set is empty, otherwise whether the Jaccard index is
        at least ``threshold``.
    """
    if not set1 or not set2:
        return False
    # Both sets are non-empty here, so the union cannot be empty.
    return len(set1 & set2) / len(set1 | set2) >= threshold

def merge_rows(df):
    """Greedily collapse near-duplicate rows within each cancer type.

    Rows are processed in descending ``max_lift`` order. Each row that has not
    been consumed yet becomes a "base" row and absorbs every later, unconsumed
    row of the same 'Cancer Type' that
      * shares enough patients with it (see ``patient_id_overlap``), and
      * agrees with it on every ``combine_cols`` column that is non-null in
        both rows, with at least three such agreeing columns.
    Candidates are compared against the base row only, not against each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'max_lift', 'Cancer Type', 'Matched PATIENT_IDs'
        (comma-separated IDs), 'patient_rank' and the ``combine_cols`` columns.
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per merged group or untouched row. In merged rows every column
        holds the sorted, comma-joined distinct string values of the group
        ('Columns' and 'Matched PATIENT_IDs' are additionally split on ','
        before de-duplication). Two columns are added: 'patient_rank_agg'
        (comma-joined ranks) and 'patient_rank_min'.
    """
    result_rows = []
    used = set()
    # sort_values/reset_index return new frames, so the helper column added
    # below never leaks into the caller's DataFrame.
    df = df.sort_values('max_lift', ascending=False).reset_index(drop=True)
    # Precompute patient ID sets for all rows. Tokens are stripped so that
    # "A, B" and "A,B" compare equal, matching the stripping done when the
    # merged ID list is rebuilt further down.
    df['_pid_set'] = df['Matched PATIENT_IDs'].apply(
        lambda x: {pid.strip() for pid in str(x).split(',')} if pd.notna(x) else set()
    )

    for cancer_type, group in tqdm(df.groupby('Cancer Type', sort=False), desc="Merging by Cancer Type"):
        group = group.reset_index()  # keeps the pre-group row label in column 'index'
        n = len(group)
        # Plain lists avoid repeated scalar .loc lookups inside the O(n^2) loop.
        pid_sets = group['_pid_set'].tolist()
        original_ids = group['index'].tolist()
        for i in range(n):
            idx_i = original_ids[i]
            if idx_i in used:
                continue
            base_row = group.loc[i]
            base_pid_set = pid_sets[i]
            merge_candidates = [i]
            for j in range(i + 1, n):  # rows before i were already bases or consumed
                if original_ids[j] in used:
                    continue
                if not patient_id_overlap(base_pid_set, pid_sets[j]):
                    continue
                comp_row = group.loc[j]
                matches = 0
                consistent = True
                for col in combine_cols:
                    val1 = base_row[col]
                    val2 = comp_row[col]
                    # Columns missing on either side neither block nor support a merge.
                    if pd.notna(val1) and pd.notna(val2):
                        if val1 != val2:
                            consistent = False
                            break
                        matches += 1
                if consistent and matches >= 3:
                    merge_candidates.append(j)

            if len(merge_candidates) > 1:
                subset = group.loc[merge_candidates]
                merged_row = base_row.copy()
                for col in subset.columns:
                    # Bookkeeping columns are dropped below; no need to aggregate them.
                    if col in ('index', '_pid_set'):
                        continue
                    vals = subset[col].dropna().astype(str).unique()
                    if col in ["Columns", "Matched PATIENT_IDs"]:
                        # These cells are themselves comma-separated lists.
                        unique_vals = set()
                        for v in vals:
                            unique_vals.update(map(str.strip, v.split(',')))
                        merged_row[col] = ','.join(sorted(unique_vals))
                    else:
                        merged_row[col] = ','.join(sorted(set(vals)))
                patient_ranks = subset['patient_rank'].dropna().astype(int)
                merged_row['patient_rank_agg'] = ','.join(sorted(map(str, set(patient_ranks))))
                merged_row['patient_rank_min'] = patient_ranks.min() if not patient_ranks.empty else None
                result_rows.append(merged_row.drop(labels=['index', '_pid_set']))
                used.update(original_ids[k] for k in merge_candidates)
            else:
                base_row = base_row.copy()
                if pd.notna(base_row['patient_rank']):
                    base_row['patient_rank_agg'] = str(int(base_row['patient_rank']))
                    base_row['patient_rank_min'] = int(base_row['patient_rank'])
                else:
                    base_row['patient_rank_agg'] = ''
                    # Same column name as every other branch; the previous
                    # 'patient_rank_max' key created a stray, always-empty column.
                    base_row['patient_rank_min'] = None
                result_rows.append(base_row.drop(labels=['index', '_pid_set']))
                used.add(idx_i)
    return pd.DataFrame(result_rows)

# Collapse near-duplicate rows of agg_df (defined outside this section) into merged_df.
merged_df = merge_rows(agg_df)

In [None]:
# merge_rows joins the distinct non-null values of each column with ','; a column
# with no non-null values in a merged group therefore becomes '' (as does
# 'patient_rank_agg' for unranked rows). Turn those empty strings back into NaN.
merged_df.replace("", np.nan, inplace=True)
# Merged rows carry 'max_lift' as a joined string, so recompute it from 'lift_try'.
# NOTE(review): get_max_from_str is defined outside this section — presumably it
# returns the largest number in a comma-joined string; verify.
merged_df['max_lift'] = merged_df['lift_try'].apply(get_max_from_str)

In [None]:
# Inspect the merged table after cleanup.
merged_df

In [None]:
# Show the rows of dff whose patient_rank is 1, 2, 3 or 4.
# NOTE(review): `dff` is not defined in the section under review — confirm it is
# created earlier so this cell survives Restart & Run All.
dff[dff["patient_rank"].isin([1,2,3,4])]

In [None]:
# Spot check: cancer-type distribution among the raw rows matching one feature
# combination. Commented-out terms are optional extra filters.
d = data_for_lift[
    (data_for_lift["Smoke Status"] == "Unknown")
    # & (data_for_lift["Chromosome"] == "13")
    & (data_for_lift["Consequence"] == "missense_variant")
    # & (data_for_lift["Diagnosis Age"] == '31-40')
    & (data_for_lift["Codons"] == "tGc/tAc")
    & (data_for_lift["SNP_event"] == "G>A")
]
d["Cancer Type"].value_counts()

In [None]:
def row_to_sentence(row):
    """Render the non-null feature values of a row as one 'A is x AND B is y' string.

    Parameters
    ----------
    row : pandas.Series or mapping
        Anything exposing ``.items()`` that yields (column name, value) pairs.

    Returns
    -------
    str
        ``"<column> is <value>"`` clauses joined by ``" AND "``, with underscores
        in column names replaced by spaces. Null values and the bookkeeping
        columns listed in ``excluded`` are skipped; an empty string is returned
        when nothing remains.
    """
    # Columns that are bookkeeping rather than hypothesis factors. 'hypo_factors'
    # itself is listed so the function can be re-applied to a frame that already
    # has its output column.
    excluded = {"patient_rank", "patient_ids_count", "Columns", "Cancer Type", "lift_try", "max_lift", "hypo_factors", 'patient_rank_agg', 'patient_rank_min', 'Matched PATIENT_IDs', 'Matched_Count', 'Total_Matched_Feature_Count',
       'Total_Unique_Patients_Cancer_Type'}
    return " AND ".join(
        # Single quotes inside the replacement field: reusing the enclosing
        # double quote is a SyntaxError on Python versions before 3.12.
        f"{col.replace('_', ' ')} is {val}"
        for col, val in row.items()
        if pd.notnull(val) and col not in excluded
    )

In [None]:
# List the available columns; row_to_sentence excludes a subset of them by name.
merged_df.columns

In [None]:
# Build one human-readable sentence per row from its non-null feature columns.
# Re-running this cell is safe: row_to_sentence skips the 'hypo_factors' column itself.
merged_df['hypo_factors'] = merged_df.apply(row_to_sentence, axis=1)
# merged_df.rename(columns={'Lift Value': 'support'}, inplace=True)

In [None]:
# Preview the first rows, now including 'hypo_factors'.
merged_df.head()

In [None]:
# Keep only the columns meant for export: strip ranking, ID and count bookkeeping.
final_lift = merged_df.drop(
    columns=[
        "patient_rank",
        "patient_ids_count",
        "Columns",
        "lift_try",
        'patient_rank_agg',
        'patient_rank_min',
        'Matched PATIENT_IDs',
        'Total_Matched_Feature_Count',
        'Total_Unique_Patients_Cancer_Type',
    ]
)

In [None]:
# Export 'Matched_Count' under the name 'support' (modifies final_lift in place).
final_lift.rename(columns={'Matched_Count':'support'}, inplace=True)

In [None]:
# Write the hypotheses table without the row index. to_csv does not create
# missing directories, so models_hypotheses/ must already exist.
final_lift.to_csv("models_hypotheses/LIFT_hypotheses_as_sentences.csv", index=False)