In [1]:
import pandas as pd

In [3]:
# Load the hypothesis tables, one CSV per method.
forest = pd.read_csv("models_hypotheses/Random_Forest_gpu_hypotheses_as_sentences.csv")
xgb = pd.read_csv("models_hypotheses/XGBoost_hypotheses_as_sentences.csv")
lgb = pd.read_csv("models_hypotheses/LightGBM_hypotheses_as_sentences.csv")
lift = pd.read_csv("models_hypotheses/LIFT_hypotheses_as_sentences.csv")

# rules_df.csv names its columns 'Rule' / 'Cancer Type'; rename them on load to
# the 'hypothesis' / 'cancer_type' names that the combining cell reads.
rules = pd.read_csv("models_hypotheses/rules_df.csv").rename(
    columns={'Rule': 'hypothesis', 'Cancer Type': 'cancer_type'}
)

In [4]:
# Map each source name to the short method tag that is written to the `method`
# column and used as the prefix of every hypo_id.
sources = {
    'lgb': 'LGBM',
    'xgb': 'XGBS',
    'lift': 'LIFT',
    'rules': 'RULE',
    'forest': 'FRST',
}


def _tag_hypotheses(frame, method_tag):
    """Return a copy of `frame` labelled with its method tag and per-row IDs.

    Adds two columns, in this order:
      * ``method``  - `method_tag` repeated on every row.
      * ``hypo_id`` - ``"<method_tag>.<CANCER_TYPE>.<index label>"``, where
        CANCER_TYPE is the row's ``cancer_type`` upper-cased with spaces
        replaced by underscores, and the index label is the row's label in
        `frame` (so numbering restarts per source, not per cancer type).

    The ``hypothesis`` column, if present, is renamed to ``hypo_factors``.
    All other columns are kept and `frame` itself is not modified.

    Raises:
        KeyError: if `frame` has no ``cancer_type`` column.
        ValueError: if any ``cancer_type`` value is missing, since no
            meaningful ID can be built for such a row.
    """
    if 'cancer_type' not in frame.columns:
        raise KeyError(f"{method_tag}: required column 'cancer_type' is missing")

    missing_type = frame['cancer_type'].isna()
    if missing_type.any():
        raise ValueError(
            f"{method_tag}: {int(missing_type.sum())} row(s) have no cancer_type; "
            "cannot build hypo_id"
        )

    tagged = frame.copy()
    tagged['method'] = method_tag

    # Vectorised slug instead of a row-wise apply: one pass over the column,
    # and it also works when the frame has zero rows.
    cancer_slug = (
        tagged['cancer_type']
        .astype(str)
        .str.upper()
        .str.replace(' ', '_', regex=False)
    )
    tagged['hypo_id'] = [
        f"{method_tag}.{slug}.{label}"
        for slug, label in zip(cancer_slug, tagged.index)
    ]

    return tagged.rename(columns={'hypothesis': 'hypo_factors'})


# Tag every source frame. The list order fixes the row order of combined_df.
processed_dfs = []
for df_name, df in [('forest', forest), ('xgb', xgb), ('rules', rules), ('lgb', lgb), ('lift', lift)]:
    processed_dfs.append(_tag_hypotheses(df, sources[df_name]))

# Stack all sources; columns that a source lacks are filled with NaN.
combined_df = pd.concat(processed_dfs, ignore_index=True)

# Each ID embeds the method tag and the row's label within its source frame,
# so a duplicate would mean a source frame has repeated index labels.
assert combined_df['hypo_id'].is_unique, "duplicate hypo_id values in combined_df"

# Display the first few rows to verify the result
combined_df.head()

Unnamed: 0,TMB (nonsynonymous),Smoke Status,Site2_Hugo_Symbol,cancer_type,Event_Info,Site1_Hugo_Symbol,Sex,Diagnosis Age,intergenic_variant,splice_donor_variant,...,stop_gained,inframe_insertion,Chromosome,start_lost,End_Position,NMD_transcript_variant,Start_Position,hypo_factors,method,hypo_id
0,0.766667,,,Breast Carcinoma,LRP1B-Intragenic,,Female,,,,...,,,,,,,,TMB (nonsynonymous) value is 0.766666667 AND E...,FRST,FRST.BREAST_CARCINOMA.0
1,,,,Breast Carcinoma,BRAF-Intragenic,BRAF,Female,41-50,,,...,,,,,,,,Event Info value is BRAF-Intragenic AND Site1 ...,FRST,FRST.BREAST_CARCINOMA.1
2,,,RARA,Breast Carcinoma,CACNB1-RARA,CACNB1,Female,,,,...,,,,,,,,Site2 Hugo Symbol value is RARA AND Event Info...,FRST,FRST.BREAST_CARCINOMA.2
3,0.333333,,ATR,Breast Carcinoma,,ATR,Female,41-50,,,...,,,,,,,,TMB (nonsynonymous) value is 0.333333333 AND S...,FRST,FRST.BREAST_CARCINOMA.3
4,0.333333,,ATR,Breast Carcinoma,,,Female,41-50,,,...,,,,,,,,TMB (nonsynonymous) value is 0.333333333 AND S...,FRST,FRST.BREAST_CARCINOMA.4


In [5]:
# Move the identifying columns to the front (hypo_id, cancer_type, hypo_factors,
# in that order); every other column follows in its existing order.
leading_cols = ['hypo_id', 'cancer_type', 'hypo_factors']
trailing_cols = [col for col in combined_df.columns if col not in leading_cols]
combined_df = combined_df[leading_cols + trailing_cols]


In [6]:
# Preview the first 20 rows of the combined table.
combined_df.head(n=20)

Unnamed: 0,hypo_id,cancer_type,hypo_factors,TMB (nonsynonymous),Smoke Status,Site2_Hugo_Symbol,Event_Info,Site1_Hugo_Symbol,Sex,Diagnosis Age,...,non_coding_transcript_variant,5_prime_UTR_variant,stop_gained,inframe_insertion,Chromosome,start_lost,End_Position,NMD_transcript_variant,Start_Position,method
0,FRST.BREAST_CARCINOMA.0,Breast Carcinoma,TMB (nonsynonymous) value is 0.766666667 AND E...,0.766667,,,LRP1B-Intragenic,,Female,,...,,,,,,,,,,FRST
1,FRST.BREAST_CARCINOMA.1,Breast Carcinoma,Event Info value is BRAF-Intragenic AND Site1 ...,,,,BRAF-Intragenic,BRAF,Female,41-50,...,,,,,,,,,,FRST
2,FRST.BREAST_CARCINOMA.2,Breast Carcinoma,Site2 Hugo Symbol value is RARA AND Event Info...,,,RARA,CACNB1-RARA,CACNB1,Female,,...,,,,,,,,,,FRST
3,FRST.BREAST_CARCINOMA.3,Breast Carcinoma,TMB (nonsynonymous) value is 0.333333333 AND S...,0.333333,,ATR,,ATR,Female,41-50,...,,,,,,,,,,FRST
4,FRST.BREAST_CARCINOMA.4,Breast Carcinoma,TMB (nonsynonymous) value is 0.333333333 AND S...,0.333333,,ATR,,,Female,41-50,...,,,,,,,,,,FRST
5,FRST.BREAST_CARCINOMA.5,Breast Carcinoma,Event Info value is ETV6-ETV62 AND Sex value i...,,,,ETV6-ETV62,,Female,41-50,...,,,,,,,,,,FRST
6,FRST.BREAST_CARCINOMA.6,Breast Carcinoma,TMB (nonsynonymous) value is 0.2 AND Event Inf...,0.2,,,ERBB2-Intragenic,,Female,,...,,,,,,,,,,FRST
7,FRST.BREAST_CARCINOMA.7,Breast Carcinoma,Event Info value is RBSN-RAF1 AND Sex value is...,,,,RBSN-RAF1,,Female,41-50,...,,,,,,,,,,FRST
8,FRST.BREAST_CARCINOMA.8,Breast Carcinoma,Site2 Hugo Symbol value is RB1 AND Event Info ...,,,RB1,LOC100507464-RB1,,Female,,...,,,,,,,,,,FRST
9,FRST.COLORECTAL_CARCINOMA.9,Colorectal Carcinoma,TMB (nonsynonymous) value is 2.8 AND Site1 Hug...,2.8,,,,FGFR2,Female,,...,,,,,,,,,,FRST


In [7]:
# Write the combined table to disk; index=False leaves the row index out of the CSV.
combined_df.to_csv(
    "models_hypotheses/combined_hypotheses.csv",
    index=False,
)