In [2]:
import pandas as pd

In [3]:
# Read the three hypothesis CSVs from the models_hypotheses directory.
forest = pd.read_csv("models_hypotheses/Random_Forest_gpu_hypotheses_as_sentences.csv")
xgb = pd.read_csv("models_hypotheses/XGBoost_hypotheses_as_sentences.csv")

# The rules file labels its columns 'Rule' and 'Cancer Type'; rename them to
# 'hypothesis' and 'cancer_type' so all three frames can be processed alike.
rules = pd.read_csv("models_hypotheses/rules_df.csv").rename(
    columns={'Rule': 'hypothesis', 'Cancer Type': 'cancer_type'}
)

In [8]:
# Map each source dataframe's name to the short method code that is stored in
# the 'method' column and used as the prefix of every hypothesis ID.
sources = {
    'forest': 'FRST',
    'xgb': 'XGBS',
    'rules': 'RULE'
}

# One normalized dataframe per source is collected here and concatenated once
# after the loop.
processed_dfs = []

for df_name, df in [('forest', forest), ('xgb', xgb), ('rules', rules)]:
    method = sources[df_name]

    # A missing cancer type cannot be turned into an ID component; fail with a
    # message that names the offending source rather than an opaque error.
    if df['cancer_type'].isna().any():
        raise ValueError(f"'{df_name}' has rows with a missing cancer_type")

    # Vectorized ID components: the cancer type upper-cased with spaces turned
    # into underscores, and the row's index label within its own source frame.
    cancer_slug = df['cancer_type'].str.upper().str.replace(' ', '_', regex=False)
    row_label = df.index.to_series().astype(str)

    # rename/assign return new frames, so the source dataframe is not modified.
    # hypo_id has the form <METHOD>.<CANCER_TYPE>.<row label>; the 'hypothesis'
    # column is exposed as 'hypo_factors' in the output.
    temp_df = (
        df.rename(columns={'hypothesis': 'hypo_factors'})
        .assign(
            method=method,
            hypo_id=method + '.' + cancer_slug + '.' + row_label,
        )
        [['hypo_id', 'hypo_factors', 'method', 'cancer_type']]
    )

    processed_dfs.append(temp_df)

# Combine all processed dataframes under a fresh 0..n-1 index
combined_df = pd.concat(processed_dfs, ignore_index=True)

# Every hypothesis must be addressable by its ID
assert combined_df['hypo_id'].is_unique, "duplicate hypo_id values"

# Display the first few rows to verify the result
combined_df.head()

Unnamed: 0,hypo_id,hypo_factors,method,cancer_type
0,FRST.BREAST_CARCINOMA.0,TMB (nonsynonymous) value is 0.766666667 AND E...,FRST,Breast Carcinoma
1,FRST.BREAST_CARCINOMA.1,Event Info value is BRAF-Intragenic AND Site1 ...,FRST,Breast Carcinoma
2,FRST.BREAST_CARCINOMA.2,Site2 Hugo Symbol value is RARA AND Event Info...,FRST,Breast Carcinoma
3,FRST.BREAST_CARCINOMA.3,TMB (nonsynonymous) value is 0.333333333 AND S...,FRST,Breast Carcinoma
4,FRST.BREAST_CARCINOMA.4,TMB (nonsynonymous) value is 0.333333333 AND S...,FRST,Breast Carcinoma


In [7]:
# Show up to the first 50 combined rows for a broader visual check.
combined_df.head(n=50)

Unnamed: 0,hypo_id,hypo,method,cancer_type
0,FRST.BREAST_CARCINOMA.0,TMB (nonsynonymous) value is 0.766666667 AND E...,FRST,Breast Carcinoma
1,FRST.BREAST_CARCINOMA.1,Event Info value is BRAF-Intragenic AND Site1 ...,FRST,Breast Carcinoma
2,FRST.BREAST_CARCINOMA.2,Site2 Hugo Symbol value is RARA AND Event Info...,FRST,Breast Carcinoma
3,FRST.BREAST_CARCINOMA.3,TMB (nonsynonymous) value is 0.333333333 AND S...,FRST,Breast Carcinoma
4,FRST.BREAST_CARCINOMA.4,TMB (nonsynonymous) value is 0.333333333 AND S...,FRST,Breast Carcinoma
5,FRST.BREAST_CARCINOMA.5,Event Info value is ETV6-ETV62 AND Sex value i...,FRST,Breast Carcinoma
6,FRST.BREAST_CARCINOMA.6,TMB (nonsynonymous) value is 0.2 AND Event Inf...,FRST,Breast Carcinoma
7,FRST.BREAST_CARCINOMA.7,Event Info value is RBSN-RAF1 AND Sex value is...,FRST,Breast Carcinoma
8,FRST.BREAST_CARCINOMA.8,Site2 Hugo Symbol value is RB1 AND Event Info ...,FRST,Breast Carcinoma
9,FRST.COLORECTAL_CARCINOMA.9,TMB (nonsynonymous) value is 2.8 AND Site1 Hug...,FRST,Colorectal Carcinoma


In [9]:
# Write the combined table to CSV; index=False leaves the row index out of the file.
combined_df.to_csv(
    "models_hypotheses/combined_hypotheses.csv",
    index=False,
)