In [45]:
import pandas as pd

In [46]:
# Read the per-model hypothesis tables from CSV.
# The Random Forest and rules tables are currently left out of the combination:
# forest = pd.read_csv("models_hypotheses/Random_Forest_gpu_hypotheses_as_sentences.csv")
# rules = pd.read_csv("models_hypotheses/rules_df.csv")
xgb = pd.read_csv("models_hypotheses/XGBoost_hypotheses_as_sentences.csv")
lgb = pd.read_csv("models_hypotheses/LightGBM_hypotheses_as_sentences.csv")

# The LIFT file names its column 'Cancer Type'; rename it on load to
# 'cancer_type', the column name the ID-building cell reads.
lift = pd.read_csv("models_hypotheses/LIFT_hypotheses_as_sentences.csv").rename(
    columns={'Cancer Type': 'cancer_type'}
)

In [47]:
# Method label written into the 'method' column for each source table.
sources = {
    'lgb': 'LGBM',
    'xgb': 'XGBS',
    'lift': 'LIFT',
    # 'rules': 'RULE',
    # 'forest': 'FRST',
}

# Tables to combine, keyed by the same names as `sources`.
# The dict order is the concatenation order (xgb, lgb, lift).
frames = {'xgb': xgb, 'lgb': lgb, 'lift': lift}

# Tag every table with its method label and rename 'hypothesis' to
# 'hypo_factors'. assign/rename return new frames, so the loaded tables are
# not modified.
processed_dfs = [
    frame.assign(method=sources[name]).rename(columns={'hypothesis': 'hypo_factors'})
    for name, frame in frames.items()
]

# Stack all tables; columns missing from a given table are filled with NaN.
combined_df = pd.concat(processed_dfs, ignore_index=True)

# 'cancer_type' and 'support' are both needed to build the IDs below. Stop
# here with a readable message instead of failing later inside the rank cast
# or the string handling.
missing_counts = {
    col: int(combined_df[col].isna().sum()) for col in ('cancer_type', 'support')
}
missing_counts = {col: n for col, n in missing_counts.items() if n}
if missing_counts:
    raise ValueError(
        f"Cannot build hypo_id: missing values per column: {missing_counts}"
    )

# Order rows by method, then by descending support.
combined_df = combined_df.sort_values(by=['method', 'support'], ascending=[True, False])

# Rank within each method by descending support; method='first' breaks ties by
# row order, so ranks are unique within a method.
# NOTE(review): the rank is per method, not per cancer type, so an ID such as
# LGBM.GASTRIC_CANCER.3 is the 3rd LGBM row overall — confirm this is intended.
combined_df['rank'] = (
    combined_df.groupby('method')['support']
    .rank(ascending=False, method='first')
    .astype(int)
)

# hypo_id = <METHOD>.<CANCER_TYPE upper-cased, spaces -> underscores>.<rank>
cancer_slug = combined_df['cancer_type'].str.upper().str.replace(' ', '_', regex=False)
combined_df['hypo_id'] = (
    combined_df['method'] + '.' + cancer_slug + '.' + combined_df['rank'].astype(str)
)
assert combined_df['hypo_id'].is_unique, "hypo_id must be unique"

# Display the first few rows to verify the result
combined_df.head()

Unnamed: 0.1,cancer_type,Codons,Start_Position,Hugo_Symbol,End_Position,upstream_gene_variant,TMB (nonsynonymous),Sex,Event_Info,Site1_Hugo_Symbol,...,splice_acceptor_variant,support,hypo_factors,method,Unnamed: 0,Consequence,Position,VAR_TYPE_SX,rank,hypo_id
636,Colorectal Carcinoma,,,,,,8.766667,Male,,,...,,214.0,TMB (nonsynonymous) value is 8.766666667 AND D...,LGBM,,,,,1,LGBM.COLORECTAL_CARCINOMA.1
637,Colorectal Carcinoma,,,,,,3.333333,Female,,,...,,105.0,TMB (nonsynonymous) value is 3.333333333 AND D...,LGBM,,,,,2,LGBM.COLORECTAL_CARCINOMA.2
801,Gastric Cancer,,,,,,2.9,Male,,,...,,94.0,TMB (nonsynonymous) value is 2.9 AND Diagnosis...,LGBM,,,,,3,LGBM.GASTRIC_CANCER.3
638,Colorectal Carcinoma,,,,,,3.1,Female,,,...,,92.0,TMB (nonsynonymous) value is 3.1 AND Diagnosis...,LGBM,,,,,4,LGBM.COLORECTAL_CARCINOMA.4
802,Gastric Cancer,,,,,,2.833333,Male,,,...,,78.0,TMB (nonsynonymous) value is 2.833333333 AND D...,LGBM,,,,,5,LGBM.GASTRIC_CANCER.5


In [48]:
# Put the identifying columns first (hypo_id, cancer_type, hypo_factors);
# all remaining columns follow in their existing order.
leading_cols = ['hypo_id', 'cancer_type', 'hypo_factors']
trailing_cols = [name for name in combined_df.columns if name not in leading_cols]
combined_df = combined_df[leading_cols + trailing_cols]


In [49]:
# Sanity check: (rows, columns) of the combined table.
combined_df.shape

(1294, 29)

In [50]:
# Sanity check: number of rows carrying each method label.
combined_df['method'].value_counts()

method
LGBM    633
XGBS    621
LIFT     40
Name: count, dtype: int64

In [51]:
# Write the combined table to CSV; index=False keeps the row index out of the file.
combined_df.to_csv("models_hypotheses/combined_hypotheses.csv", index=False)