In [16]:
import pandas as pd
from druxai.utils.evaluation import check_shared_cells
from scipy.stats import spearmanr
import os


In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Folder that holds the per-fold prediction CSV files.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
folder_path = "/Users/niklaskiermeyer/Desktop/Codespace/DruxAI/results/slurm/cv_results/predictions/"
fold_results = {}

# Load every CSV in the folder into a dict keyed by file name.
# - sorted(): os.listdir returns entries in arbitrary order; sorting gives a
#   reproducible fold order in every downstream loop and printout.
# - the ".csv" filter skips stray entries (e.g. ".DS_Store", sub-directories)
#   that would otherwise be handed to pd.read_csv and fail.
for file in sorted(os.listdir(folder_path)):
    if not file.lower().endswith(".csv"):
        continue
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    fold_results[file] = df

In [29]:
# Check whether any cell line appears in more than one fold.
# NOTE(review): presumably a guard against leakage between cross-validation
# folds; the exact criteria live in druxai.utils.evaluation.check_shared_cells
# -- confirm there.
check_shared_cells(fold_results)

'No shared cells found across folds.'

In [25]:
# Overall R score per fold: one Spearman rank correlation computed over all
# (prediction, ground_truth) rows of a fold, followed by the plain average of
# those per-fold coefficients.
overall_mean_rscore = []
for key, fold_df in fold_results.items():
    # spearmanr returns (correlation, p-value); only the correlation is kept.
    r_value, _ = spearmanr(fold_df["prediction"], fold_df["ground_truth"])
    overall_mean_rscore.append(r_value)
    print(f"Fold {key} - Mean R Score: {r_value}")

print("Cross Fold Mean R Score: ", sum(overall_mean_rscore) / len(overall_mean_rscore))

Fold prediction_fold_1_epoch_42.csv - Mean R Score: 0.4171068106762498
Fold prediction_fold_5_epoch_49.csv - Mean R Score: 0.4151277901031211
Fold prediction_fold_2_epoch_49.csv - Mean R Score: 0.42778145719689264
Fold prediction_fold_4_epoch_34.csv - Mean R Score: 0.43002046989426707
Fold prediction_fold_3_epoch_35.csv - Mean R Score: 0.4260456870993302
Cross Fold Mean R Score:  0.4232164429939722


In [21]:
# Per-drug R score: within each fold, compute one Spearman correlation per drug
# (rows grouped by the "drugs" column), report the fold's mean over drugs, then
# average each drug's score across folds.

# One {"Group", "Mean Drug R Score"} record per (fold, drug) pair. Kept under its
# own name so `entire_dataset_rscores` is only ever bound to the final DataFrame.
per_drug_records = []

for key in fold_results:
    fold_r_scores = []  # records for the current fold only
    for group_name, group_data in fold_results[key].groupby("drugs"):
        # spearmanr returns (correlation, p-value); only the correlation is kept.
        r_value, _ = spearmanr(group_data["prediction"], group_data["ground_truth"])
        fold_r_scores.append({"Group": group_name, "Mean Drug R Score": r_value})
    per_drug_records.extend(fold_r_scores)
    fold_mean_r_score = pd.DataFrame(fold_r_scores)["Mean Drug R Score"].mean()
    print(f"Fold {key} - Mean R Score per drug: {fold_mean_r_score}")

# Average each drug's score over the folds it appears in and rank drugs from
# best to worst; the result is indexed by drug name ("Group").
entire_dataset_rscores = (
    pd.DataFrame(per_drug_records)
    .groupby("Group")
    .mean()
    .sort_values("Mean Drug R Score", ascending=False)
)
print(" \n Overall per drug mean R Score:", entire_dataset_rscores["Mean Drug R Score"].mean())

Fold prediction_fold_1_epoch_42.csv - Mean R Score per drug: 0.06486027369240122
Fold prediction_fold_5_epoch_49.csv - Mean R Score per drug: 0.05317119969150639
Fold prediction_fold_2_epoch_49.csv - Mean R Score per drug: 0.06902745637260171
Fold prediction_fold_4_epoch_34.csv - Mean R Score per drug: 0.0651612373671573
Fold prediction_fold_3_epoch_35.csv - Mean R Score per drug: 0.02971280537107863
 
 Overall per drug mean R Score: 0.056386594498949054


In [22]:
# Mean of the "Mean Drug R Score" column over all drugs -- the same value that
# the per-drug cell prints as "Overall per drug mean R Score".
entire_dataset_rscores["Mean Drug R Score"].mean()

0.056386594498949054

In [26]:
# Count the drugs ("groups") whose cross-fold mean R score exceeds 0.2 by
# summing the boolean mask instead of materialising the filtered frame.
above_threshold = entire_dataset_rscores["Mean Drug R Score"] > 0.2
num_groups = int(above_threshold.sum())
print("Number of groups with mean drug R score over 0.2:", num_groups)


Number of groups with mean drug R score over 0.2: 39


In [28]:
# Display the drugs whose cross-fold mean R score exceeds 0.2 (row order is
# whatever order the frame already has).
entire_dataset_rscores.loc[lambda scores: scores["Mean Drug R Score"] > 0.2]

Unnamed: 0_level_0,Mean Drug R Score
Group,Unnamed: 1_level_1
DACOMITINIB,0.336999
VINDESINE,0.307724
VERUBULIN,0.306661
DANUSERTIB,0.300946
VINBLASTINE,0.283764
VINORELBINE,0.276414
AV-412,0.270411
10-HYDROXYCAMPTOTHECIN,0.265944
TOPOTECAN,0.259949
PF-03758309,0.258802


In [24]:
# Last 20 rows of the per-drug table. The frame was sorted by
# "Mean Drug R Score" in descending order when it was built, so these are the
# entries at the bottom of that ranking.
entire_dataset_rscores.tail(20)

Unnamed: 0_level_0,Mean Drug R Score
Group,Unnamed: 1_level_1
CHIR-124,-0.096588
TG100-115,-0.096816
CYPROTERONE-ACETATE,-0.09794
SORAFENIB,-0.098629
LY364947,-0.099609
RITA,-0.099953
MOCETINOSTAT,-0.100636
BIRINAPANT,-0.102216
G-1,-0.104626
HYDROCORTISONE,-0.105926
