In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import os

In [2]:
# Locations of the two input tables: ground-truth labels (sidereva) and
# model predictions (group2eva). Both live in the same data directory.
DATA_DIR = 'C:/Users/stdso/Documents/USTH/Med/BioAct-Het-main/Data'
sidereva_path = f'{DATA_DIR}/sidereva.csv'
group2eva_path = f'{DATA_DIR}/group2eva.csv'

In [3]:

# Read both CSV files into DataFrames (ground truth first, then predictions).
sidereva_df, group2eva_df = (
    pd.read_csv(csv_path) for csv_path in (sidereva_path, group2eva_path)
)

In [4]:
# Print the table read from sidereva.csv for a quick visual check of its
# columns and values.
print(sidereva_df)

   Unnamed: 0                                             smiles  \
0           0             CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl   
1           1           C1=C(N=C(S1)N=C(N)N)CSCCC(=NS(=O)(=O)N)N   
2           2                 C1=CC(=C(C(=C1)Cl)CC(=O)N=C(N)N)Cl   
3           3            CCN(CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl)CCO   
4           4              CCC(CC)OC1C=C(CC(C1NC(=O)C)N)C(=O)OCC   
5           5   CC12CC(C3C(C1CCC2(C(=O)CO)O)CCC4=CC(=O)C=CC34C)O   
6           6  CC(C)C1=NC(=CS1)CN(C)C(=O)NC(C(C)C)C(=O)NC(CC2...   

   Hepatobiliary disorders  Metabolism and nutrition disorders  \
0                        1                                   1   
1                        1                                   1   
2                        0                                   1   
3                        1                                   1   
4                        1                                   1   
5                        1                                 

In [5]:
# Print the table read from group2eva.csv for a quick visual check of its
# columns and values.
print(group2eva_df)

   Unnamed: 0                                             smiles  \
0           0             CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl   
1           1           C1=C(N=C(S1)N=C(N)N)CSCCC(=NS(=O)(=O)N)N   
2           2                 C1=CC(=C(C(=C1)Cl)CC(=O)N=C(N)N)Cl   
3           3            CCN(CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl)CCO   
4           4              CCC(CC)OC1C=C(CC(C1NC(=O)C)N)C(=O)OCC   
5           5   CC12CC(C3C(C1CCC2(C(=O)CO)O)CCC4=CC(=O)C=CC34C)O   
6           6  CC(C)C1=NC(=CS1)CN(C)C(=O)NC(C(C)C)C(=O)NC(CC2...   

   Hepatobiliary disorders  Metabolism and nutrition disorders  \
0                 0.967550                            0.993751   
1                 0.690025                            0.603870   
2                 0.156855                            0.857232   
3                 0.595787                            0.648333   
4                 0.600960                            0.743598   
5                 0.619850                            0.691

In [6]:
# Cast the 'Drug_Name' key to str in both tables so the two frames hold the
# key in the same type.
for frame in (sidereva_df, group2eva_df):
    frame['Drug_Name'] = frame['Drug_Name'].astype(str)


In [7]:
# Join the two tables on 'Drug_Name'. Columns present in both frames are
# disambiguated with '_gt' (from sidereva_df) and '_pred' (from group2eva_df).
merged_df = sidereva_df.merge(
    group2eva_df,
    on='Drug_Name',
    suffixes=('_gt', '_pred'),
)
print(merged_df)

   Unnamed: 0_gt                                          smiles_gt  \
0              0             CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl   
1              1           C1=C(N=C(S1)N=C(N)N)CSCCC(=NS(=O)(=O)N)N   
2              2                 C1=CC(=C(C(=C1)Cl)CC(=O)N=C(N)N)Cl   
3              3            CCN(CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl)CCO   
4              4              CCC(CC)OC1C=C(CC(C1NC(=O)C)N)C(=O)OCC   
5              5   CC12CC(C3C(C1CCC2(C(=O)CO)O)CCC4=CC(=O)C=CC34C)O   
6              6  CC(C)C1=NC(=CS1)CN(C)C(=O)NC(C(C)C)C(=O)NC(CC2...   

   Hepatobiliary disorders_gt  Metabolism and nutrition disorders_gt  \
0                           1                                      1   
1                           1                                      1   
2                           0                                      1   
3                           1                                      1   
4                           1                                      1   

In [8]:
# Ground-truth label columns: every merged column that ends with '_gt',
# except the two non-label columns "Unnamed: 0_gt" and "smiles_gt".
non_label_columns = {"Unnamed: 0_gt", "smiles_gt"}
gt_columns = [
    col for col in merged_df.columns
    if col.endswith('_gt') and col not in non_label_columns
]

# Matching prediction columns. Only the trailing '_gt' suffix is swapped:
# str.replace would also rewrite any '_gt' occurring inside a column name
# and silently point at the wrong (or a non-existent) column.
pred_columns = [col[:-len('_gt')] + '_pred' for col in gt_columns]

# Fail here, with a readable message, rather than later inside the
# evaluation loop if a ground-truth column has no prediction counterpart.
missing_pred_columns = [col for col in pred_columns if col not in merged_df.columns]
if missing_pred_columns:
    raise KeyError(f"Prediction columns missing from merged_df: {missing_pred_columns}")


In [9]:
# Show the selected ground-truth column names.
print(gt_columns)

['Hepatobiliary disorders_gt', 'Metabolism and nutrition disorders_gt', 'Product issues_gt', 'Eye disorders_gt', 'Investigations_gt', 'Musculoskeletal and connective tissue disorders_gt', 'Gastrointestinal disorders_gt', 'Social circumstances_gt', 'Immune system disorders_gt', 'Reproductive system and breast disorders_gt', 'Neoplasms benign, malignant and unspecified (incl cysts and polyps)_gt', 'General disorders and administration site conditions_gt', 'Endocrine disorders_gt', 'Surgical and medical procedures_gt', 'Vascular disorders_gt', 'Blood and lymphatic system disorders_gt', 'Skin and subcutaneous tissue disorders_gt', 'Congenital, familial and genetic disorders_gt', 'Infections and infestations_gt', 'Respiratory, thoracic and mediastinal disorders_gt', 'Psychiatric disorders_gt', 'Renal and urinary disorders_gt', 'Pregnancy, puerperium and perinatal conditions_gt', 'Ear and labyrinth disorders_gt', 'Cardiac disorders_gt', 'Nervous system disorders_gt', 'Injury, poisoning and p

In [10]:
# Count the ground-truth columns held in gt_columns.
num_gt_columns = len(gt_columns)

print("Number of columns related to disorders:", num_gt_columns)

Number of columns related to disorders: 27


In [11]:
# Show the prediction column names derived from gt_columns.
print(pred_columns)

['Hepatobiliary disorders_pred', 'Metabolism and nutrition disorders_pred', 'Product issues_pred', 'Eye disorders_pred', 'Investigations_pred', 'Musculoskeletal and connective tissue disorders_pred', 'Gastrointestinal disorders_pred', 'Social circumstances_pred', 'Immune system disorders_pred', 'Reproductive system and breast disorders_pred', 'Neoplasms benign, malignant and unspecified (incl cysts and polyps)_pred', 'General disorders and administration site conditions_pred', 'Endocrine disorders_pred', 'Surgical and medical procedures_pred', 'Vascular disorders_pred', 'Blood and lymphatic system disorders_pred', 'Skin and subcutaneous tissue disorders_pred', 'Congenital, familial and genetic disorders_pred', 'Infections and infestations_pred', 'Respiratory, thoracic and mediastinal disorders_pred', 'Psychiatric disorders_pred', 'Renal and urinary disorders_pred', 'Pregnancy, puerperium and perinatal conditions_pred', 'Ear and labyrinth disorders_pred', 'Cardiac disorders_pred', 'Nerv

In [12]:
# Count the prediction columns. pred_columns is built with one entry per
# element of gt_columns, so this equals len(gt_columns).
num_pred_columns = len(pred_columns)

print("Number of columns related to disorders:", num_pred_columns)

Number of columns related to disorders: 27


In [18]:
# Build one confusion matrix per drug and save heat maps for the first few.
# Relies on merged_df, gt_columns and pred_columns from the earlier cells.

# Both classes are always passed to confusion_matrix. Without explicit
# labels, a drug whose truth and predictions contain a single class yields
# a 1x1 matrix, so the heat map would be wrongly shaped and labelled.
class_labels = [0, 1]

# Predicted values at or above this cut-off count as the positive class.
decision_threshold = 0.5

# Characters that are not allowed in Windows file names; mapped to '_'.
_INVALID_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})


def _safe_filename(name):
    """Return `name` as a string with characters invalid in file names replaced by '_'."""
    return str(name).translate(_INVALID_FILENAME_CHARS)


confusion_matrices = {}

# groupby(sort=False) visits drugs in order of first appearance, like
# .unique(), but avoids re-filtering the whole frame once per drug.
for drug_name, drug_df in merged_df.groupby('Drug_Name', sort=False):
    # If a drug appears on several rows, only its first row is evaluated.
    y_true = drug_df[gt_columns].iloc[0].values.astype(int)
    y_pred_prob = drug_df[pred_columns].iloc[0].values
    y_pred = (y_pred_prob >= decision_threshold).astype(int)
    # Transposed so that rows are predicted labels and columns are true
    # labels, matching the axis titles used in the plots below.
    confusion_matrices[drug_name] = confusion_matrix(
        y_true, y_pred, labels=class_labels
    ).T

# Define the number of drugs to plot and select the drugs
num_drugs_to_plot = 7
selected_drugs = list(confusion_matrices.keys())[:num_drugs_to_plot]

# Set the output directory
output_directory = 'C:/Users/stdso/Documents/USTH/Med/BioAct-Het-main/Output/CF'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Plot the confusion matrix for each selected drug and save independently
for drug_name in selected_drugs:
    cm = confusion_matrices[drug_name]

    # One figure per drug, addressed explicitly through fig/ax.
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        annot_kws={"size": 16},
        cbar=False,
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {drug_name}', fontsize=20)
    ax.set_xlabel('True Label', fontsize=18)
    ax.set_ylabel('Predicted Label', fontsize=18)
    # Put class 0 at the bottom of the y axis.
    ax.invert_yaxis()

    # Save each figure independently. The drug name is sanitised so that
    # characters such as '/' or ':' cannot break or redirect the path.
    output_path = os.path.join(
        output_directory, f'{_safe_filename(drug_name)}_confusion_matrix.png'
    )
    fig.savefig(output_path, bbox_inches='tight')

    # Close the figure to free memory
    plt.close(fig)