In [1]:
import pandas as pd
from druxai.utils.evaluation import check_shared_cells
from scipy.stats import spearmanr
import os

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are re-imported
# before each cell execution, so edits to imported source files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Folder that holds the prediction CSV files (one file per cross-validation fold).
# NOTE(review): absolute, machine-specific path — adjust it when running elsewhere.
folder_path = "/Users/niklaskiermeyer/Desktop/Codespace/DruxAI/results/slurm/cv_results_new/predictions/"
fold_results = {}  # maps CSV filename -> DataFrame with that file's contents

# Load every CSV in the folder.
# - sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
#   fold order (and every printout derived from it) stable between runs.
# - the .csv filter skips stray entries such as .DS_Store or sub-directories,
#   which would otherwise make pd.read_csv fail or load garbage.
for file in sorted(os.listdir(folder_path)):
    if not file.lower().endswith(".csv"):
        continue
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    fold_results[file] = df

# Fail here with a clear message instead of a ZeroDivisionError in a later cell.
if not fold_results:
    raise FileNotFoundError(f"No prediction CSV files found in {folder_path}")

In [4]:
# Check if cell-lines exist in different folds.
# Passes the filename -> DataFrame mapping of all loaded folds to the project helper
# check_shared_cells (imported from druxai.utils.evaluation); its return value is
# shown as the cell output.
# NOTE(review): presumably this compares cell-line identifiers between folds to detect
# leakage — confirm against the helper's implementation.
check_shared_cells(fold_results)

'No shared cells found across folds.'

In [5]:
# Overall R score per fold: one Spearman rank correlation between predictions and
# ground truth, computed over all rows of the fold (not split by drug).
overall_mean_rscore = []  # one correlation value per fold
for key, fold_df in fold_results.items():
    # spearmanr returns (correlation, p-value); only the correlation is kept
    r_value, _ = spearmanr(fold_df["prediction"], fold_df["ground_truth"])
    overall_mean_rscore.append(r_value)
    print(f"Fold {key} - Mean R Score: {r_value}")

# Unweighted mean of the per-fold correlations
print("Cross Fold Mean R Score: ", sum(overall_mean_rscore) / len(overall_mean_rscore))

Fold prediction_fold_5_epoch_39.csv - Mean R Score: 0.4209302191481973
Fold prediction_fold_3_epoch_49.csv - Mean R Score: 0.4292651462005418
Fold prediction_fold_4_epoch_49.csv - Mean R Score: 0.43464250733919735
Fold prediction_fold_1_epoch_46.csv - Mean R Score: 0.4226696369172598
Fold prediction_fold_2_epoch_35.csv - Mean R Score: 0.4311486069533207
Cross Fold Mean R Score:  0.4277312233117033


In [7]:
# Per-drug R scores: within each fold, compute one Spearman correlation per drug,
# then average each drug's correlations across folds.
# The records are collected in their own list so that `entire_dataset_rscores`
# is only ever bound to the final DataFrame (never to a list first).
per_fold_drug_records = []  # one {"Group", "Mean Drug R Score"} record per (fold, drug)

for key, fold_df in fold_results.items():
    fold_r_scores = []  # records for the current fold only
    for group_name, group_data in fold_df.groupby("drugs"):
        # spearmanr returns (correlation, p-value); only the correlation is kept
        r_value, _ = spearmanr(group_data["prediction"], group_data["ground_truth"])
        fold_r_scores.append({"Group": group_name, "Mean Drug R Score": r_value})
    per_fold_drug_records.extend(fold_r_scores)
    # pandas' mean skips NaN by default, so undefined correlations do not poison the fold mean
    fold_mean_r_score = pd.DataFrame(fold_r_scores)["Mean Drug R Score"].mean()
    print(f"Fold {key} - Mean R Score per drug: {fold_mean_r_score}")

# Average each drug over the folds and rank drugs from best to worst.
# Result: DataFrame indexed by "Group" (drug) with a single "Mean Drug R Score" column.
entire_dataset_rscores = (
    pd.DataFrame(per_fold_drug_records)
    .groupby("Group")
    .mean()
    .sort_values("Mean Drug R Score", ascending=False)
)
print(" \n Overall per drug mean R Score:", entire_dataset_rscores["Mean Drug R Score"].mean())

Fold prediction_fold_5_epoch_39.csv - Mean R Score per drug: 0.08309847547808467
Fold prediction_fold_3_epoch_49.csv - Mean R Score per drug: 0.06756986594414148
Fold prediction_fold_4_epoch_49.csv - Mean R Score per drug: 0.06756579042480379
Fold prediction_fold_1_epoch_46.csv - Mean R Score per drug: 0.0842371612316657
Fold prediction_fold_2_epoch_35.csv - Mean R Score per drug: 0.07882246561255968
 
 Overall per drug mean R Score: 0.07625875173825107


In [8]:
# Mean over all drugs of the cross-fold-averaged per-drug R score; this is the same
# expression the previous cell prints as "Overall per drug mean R Score".
entire_dataset_rscores["Mean Drug R Score"].mean()

0.07625875173825107

In [9]:
# Count the drugs whose cross-fold mean R score exceeds the cut-off.
# The threshold is named once so the filter and the printed message cannot drift apart.
R_SCORE_THRESHOLD = 0.2
# Summing the boolean mask counts the True entries without building a filtered frame
num_groups = int((entire_dataset_rscores["Mean Drug R Score"] > R_SCORE_THRESHOLD).sum())
print(f"Number of groups with mean drug R score over {R_SCORE_THRESHOLD}:", num_groups)


Number of groups with mean drug R score over 0.2: 65


In [10]:
# Display only the drugs whose mean R score is above 0.2
entire_dataset_rscores.loc[lambda scores: scores["Mean Drug R Score"] > 0.2]

Unnamed: 0_level_0,Mean Drug R Score
Group,Unnamed: 1_level_1
AZD8330,0.488186
DACOMITINIB,0.370811
TAK-733,0.364630
RO-4987655,0.363987
MEK162,0.321173
...,...
AST-1306,0.201655
GZD824,0.201572
GSK2126458,0.200916
MK-3207,0.200229


In [11]:
# Last 20 rows of the frame. It was sorted by "Mean Drug R Score" in descending order
# when it was built, so these are the lowest-scoring drugs (NaN scores, if any, would
# also be placed at the end by sort_values' default na_position).
entire_dataset_rscores.tail(20)

Unnamed: 0_level_0,Mean Drug R Score
Group,Unnamed: 1_level_1
BVT-948,-0.070793
HEXACHLOROPHENE,-0.070916
EBASTINE,-0.07108
TANSHINONE-I,-0.071319
PHENAZONE,-0.073362
LENALIDOMIDE,-0.074156
BUCLADESINE,-0.074225
FLUDARABINE-PHOSPHATE,-0.074655
LONAFARNIB,-0.075254
DPI-201106,-0.076851
