<a href="https://colab.research.google.com/github/NikitaN65/HelloWorld/blob/main/mimic_flat_cbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import torch.nn.functional as F

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
mpl.rcParams.update(mpl.rcParamsDefault)
plt.rcParams['font.size'] = 20

In [25]:
# import pandas as pd
# import numpy as np
# import torch
# from torch.utils.data import Dataset

# class PatientDataset(Dataset):
#     def __init__(self, patient_info, drg_codes, filepath_features):
#         patient_df = pd.read_csv(patient_info)
#         codes_df = pd.read_csv(drg_codes)
#         features_df = pd.read_csv(filepath_features)

#         # Pivot the features dataframe
#         features_pivot = features_df.pivot_table(index='subject_id', columns='itemid', values='average_value', aggfunc=np.mean)
#         features_pivot.reset_index(inplace=True)

#         # Merge datasets - first the codes and patient info, then the features. N/A if a particular feature doesn't have a match
#         df = pd.merge(pd.merge(patient_df, codes_df, on=['subject_id', 'hadm_id']), features_pivot, on='subject_id', how='left')

#         # Convert gender to numeric
#         df['gender'] = df['gender'].map({'M': 0, 'F': 1})

#         columns_to_remove = ['intime', 'outtime', 'los', 'admission_type', 'deathtime', 'anchor_age', 'anchor_year', 'anchor_year_group', 'hadm_id', 'description']  # remove columns I don't need for now
#         df = df.drop(columns=columns_to_remove, axis=1)
#         df = df.drop_duplicates()



#         print(df.head(100))


#         # Remove non-feature columns before creating indicators
#         feature_columns = df.columns.difference(['subject_id', 'hadm_id', 'drg_code'])
#         df_features_only = df[feature_columns]

#         # Initialize a matrix for indicators (1 if present, 0 if missing) based on feature columns only
#         indicators = df_features_only.isna().astype(int).values


#         self.features = df_features_only.values
#         self.indicators = indicators
#         self.targets = df['drg_code'].values

#     def __len__(self):
#         return len(self.targets)

#     def __getitem__(self, idx):
#         features_with_indicators = np.concatenate([self.features[idx], self.indicators[idx]], axis=0)
#         features_with_indicators = features_with_indicators.astype(np.float32)
#         features_tensor = torch.tensor(features_with_indicators, dtype=torch.float32)
#         features_tensor[torch.isnan(features_tensor)] = 2500  # Placeholder value for NaNs
#         target = torch.tensor(self.targets[idx], dtype=torch.long)
#         return features_tensor, target



In [64]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import MultiLabelBinarizer

class PatientDataset(Dataset):
    def __init__(self, patient_info, drg_codes, filepath_features):
        patient_df = pd.read_csv(patient_info)
        codes_df = pd.read_csv(drg_codes)
        codes_df.drop(columns = 'description', inplace=True)
        codes_df = codes_df.drop_duplicates()
        features_df = pd.read_csv(filepath_features)

        # Group by subject_id (and hadm_id if necessary) and aggregate DRG codes into lists
        aggregated_drg = codes_df.groupby(['subject_id', 'hadm_id'])['drg_code'].apply(list).reset_index()

        # Check uniqueness in patient_df
        print(patient_df[['subject_id', 'hadm_id']].duplicated().any())


        # Assuming 'itemid' and 'average_value' are columns in your features_df
        features_pivot = features_df.pivot_table(index='subject_id', columns='itemid', values='average_value', aggfunc=np.mean)
        features_pivot.reset_index(inplace=True)

        print(features_pivot)
        print('patient_df', patient_df)
        print('aggregated_drg', aggregated_drg)

        # Merge patient info with DRG codes, then with features
        initial_df = pd.merge(patient_df, aggregated_drg, on=['subject_id', 'hadm_id'])
        print('initial df, after merging patient and codes', initial_df)
        # Check uniqueness in codes_df
        print(initial_df[['subject_id', 'hadm_id']].duplicated().any())
        df = pd.merge(initial_df, features_pivot, on='subject_id', how='left')
        print('merged df before removing unnecessary columns', df)

        # removing unnecessary columns and removing duplicates (from when I grouped together similar conditions)
        columns_to_remove = ['intime', 'outtime', 'los', 'admission_type', 'deathtime', 'anchor_age', 'anchor_year', 'anchor_year_group', 'hadm_id']
        df.drop(columns=columns_to_remove, inplace=True, errors='ignore')


        # Convert gender to numeric
        df['gender'] = df['gender'].map({'M': 0, 'F': 1}).astype(int)

        print(df)

        # # Group by subject_id and aggregate DRG codes into lists
        # df_grouped = df.groupby('subject_id').agg({'drg_code': lambda x: list(x)}).reset_index()

        # print(df_grouped.head(100))

        # Initialize MultiLabelBinarizer for DRG codes
        mlb = MultiLabelBinarizer()
        self.targets = mlb.fit_transform(aggregated_drg['drg_code'])

        # # Merge grouped DRG codes with features (ensuring one row per patient)
        # df_final = pd.merge(df_grouped[['subject_id']], features_pivot, on='subject_id', how='right')

        # Prepare features and indicators
        feature_columns = df.columns.difference(['subject_id', 'drg_code'])
        self.features = df[feature_columns].values
        indicators = np.isnan(self.features).astype(int)
        self.features[np.isnan(self.features)] = 2500  # Replace NaN with 0 or another placeholder

        # Concatenate features and indicators
        self.features_with_indicators = np.concatenate([self.features, indicators], axis=1).astype(np.float32)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        features_tensor = torch.tensor(self.features_with_indicators[idx], dtype=torch.float32)
        target_tensor = torch.tensor(self.targets[idx], dtype=torch.float32)  # Use float for probability targets
        return features_tensor, target_tensor


In [10]:
# import pandas as pd
# import torch
# from torch.utils.data import Dataset, DataLoader

# class PatientDataset(Dataset):
#     def __init__(self, filepath_info, filepath_codes):
#         # Load and preprocess data
#         info_df = pd.read_csv(filepath_info)
#         codes_df = pd.read_csv(filepath_codes)

#         # Join datasets on 'subject_id'
#         df = pd.merge(info_df, codes_df, on=['subject_id', 'hadm_id'])

#         # Convert gender to numeric (e.g., M=0, F=1)
#         df['gender'] = df['gender'].map({'M': 0, 'F': 1})

#         self.features = df[['age_at_admission', 'gender']].values
#         self.targets = df['drg_code'].values

#     def __len__(self):
#         return len(self.targets)

#     def __getitem__(self, idx):
#         features = torch.tensor(self.features[idx], dtype=torch.float)
#         target = torch.tensor(self.targets[idx], dtype=torch.long)
#         return features, target

In [65]:
dataset = PatientDataset('/content/drive/My Drive/Colab Notebooks/CBMS_animal_dataset/patients_with_chosen_conditions.csv', '/content/drive/My Drive/Colab Notebooks/CBMS_animal_dataset/patient_drg_codes_specific_conditions_only.csv', '/content/drive/My Drive/Colab Notebooks/CBMS_animal_dataset/average_values.csv')
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

for features, target in dataloader:
    print(features, target)
    print(features.shape)
    print(target.shape)
    break

False
itemid  subject_id      220045      220050     220051     220052     220059  \
0         10002013   94.214286  110.619048  61.476190  76.700000  26.812500   
1         10002430   79.185714         NaN        NaN        NaN        NaN   
2         10002760   72.653846  108.434783  57.521739  74.217391  39.869565   
3         10003046  106.924528  112.150943  70.490566  81.962264        NaN   
4         10003502   59.960000         NaN        NaN        NaN        NaN   
...            ...         ...         ...        ...        ...        ...   
15079     19995258  108.760870         NaN        NaN        NaN        NaN   
15080     19995790   85.366667  113.130435  59.521739  76.478261  36.800000   
15081     19996783   83.896552         NaN        NaN        NaN        NaN   
15082     19997448   90.083333  128.000000  64.781250  86.000000        NaN   
15083     19997843   76.176471         NaN        NaN        NaN        NaN   

itemid     220060     220061     220074  2200

In [None]:
class model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim):
        super(model, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        probabilities = F.softmax(x, dim=1)

        return probabilities


# try 30 latent dims for predicting concepts
# between 14-50 for c to y


In [None]:
def train(x_sz,c_sz, y_sz, c_dim, y_dim, lr, lr_2, epochs):
    # define the input and output sizes
    torch.manual_seed(25)
    x_to_c = model(x_sz, c_sz, c_dim)
    c_to_y = model(c_sz, y_sz, y_dim)


    # define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(x_to_c.parameters(), lr=lr)
    yoptimizer = torch.optim.SGD(c_to_y.parameters(), lr=lr_2)


    epochs_cnt = []
    xclosses = []
    cylosses = []
    pred_c_list = []
    correct_c = []
    pred_y_list = []
    correct_y = []

    num_epochs = epochs
    for epoch in range(num_epochs):
        running_xc_loss = 0.0
        running_cy_loss = 0.0
        epochs_cnt.append(epoch)

        for i, batch in enumerate(data_loader):

            x, y, c = batch

            correct_c.append(c)
            correct_y.append(y)

            x = x.to(x_to_c.linear1.weight.dtype)

            # forward pass

            # model.train()


            pred_c = x_to_c(x)
            pred_c_list.append(pred_c.detach().numpy())
            lossc = criterion(pred_c, c)


            # backward pass
            optimizer.zero_grad()
            lossc.backward()
            optimizer.step()

            pred_y = c_to_y(pred_c.detach())
            pred_y_list.append(pred_y.detach().numpy())
            lossy = criterion(pred_y, y)

            # backward pass
            yoptimizer.zero_grad()
            lossy.backward()
            yoptimizer.step()

            running_xc_loss += lossc.item()
            running_cy_loss += lossy.item()

        xclosses.append(running_xc_loss/len(data_loader))
        cylosses.append(running_cy_loss/len(data_loader))


    return xclosses, cylosses, pred_c_list, pred_y_list, epochs_cnt, correct_c, correct_y

In [None]:
def calculate_and_save_accuracy(c_classes, epoch_cnt, l, l_2, pred_c_list, c, name):
    # Number of classes
    c_classes = c_classes

    # Create tensors to store class accuracies, false positives, and false negatives
    class_accuracies = torch.zeros(c_classes)
    false_positives = torch.zeros(c_classes)
    false_negatives = torch.zeros(c_classes)

    # Create variables to store total correct and total comparisons for overall accuracy
    total_correct = 0
    total_comparisons = 0

    # Define file path for saving results
    file_path = f"{name}_accuracy_flat_seq_{epoch_cnt}_{l}_{l_2}.txt"

    # Correct shapes
    pred_c_tensor = torch.tensor(pred_c_list)
    pred_c = pred_c_tensor.view(-1, pred_c_tensor.size(-1))
    pred_c = (pred_c > 1 / c_classes).float()
    c_flat = torch.cat(c, dim=0)

    # Restrict the data to the last 500 samples
    pred_c = pred_c[-500:]
    c_flat = c_flat[-500:]

    # Calculate class accuracies, false positives, false negatives, and overall accuracy
    for class_idx in range(c_classes):
        correct = torch.sum(((pred_c[:, class_idx] == 1) & (c_flat[:, class_idx] == 1)) | ((pred_c[:, class_idx] == 0) & (c_flat[:, class_idx] == 0)))
        total = 500
        false_positive = torch.sum((pred_c[:, class_idx] == 1) & (c_flat[:, class_idx] == 0))
        false_negative = torch.sum((pred_c[:, class_idx] == 0) & (c_flat[:, class_idx] == 1))

        # Calculate class accuracy as a percentage
        class_accuracy = (correct.float() / total) * 100
        class_accuracies[class_idx] = class_accuracy

        # Calculate false positives and false negatives as percentages
        false_positives[class_idx] = (false_positive.float() / torch.sum(c_flat[:, class_idx] == 0)) * 100
        false_negatives[class_idx] = (false_negative.float() / torch.sum(c_flat[:, class_idx] == 1)) * 100

        # Update total correct and total comparisons for overall accuracy
        total_correct += correct
        total_comparisons += total

    # Calculate overall accuracy
    overall_accuracy = (total_correct.float() / total_comparisons) * 100

    # Open the file in write mode
    with open(file_path, "w") as file:
        # Write class-wise accuracies, false positives, and false negatives to the file
        for class_idx, accuracy in enumerate(class_accuracies):
            false_positive = false_positives[class_idx]
            false_negative = false_negatives[class_idx]
            file.write(f'Class {class_idx}: Accuracy = {accuracy.item():.2f}%, False Positives = {false_positive.item():.2f}%, False Negatives = {false_negative.item():.2f}%\n')

        # Write overall accuracy to the file
        file.write(f'Overall Accuracy: {overall_accuracy.item():.2f}%\n')

In [None]:

lr = [0.8, 1.5]
lr_2 = [0.8, 5, 10]
# 85 -> 16 -> 50
epoch_cnt = 2000
c_dim = [24, 40]
y_dim = [35, 50]
num_concepts = 7

for l in lr:
  for l_2 in lr_2:
    for cd in c_dim:
      for yd in y_dim:
        xclosses, cylosses, pred_c_list, pred_y_list, epochs, c, y = train(85,num_concepts,50,cd,yd,l, l_2, epoch_cnt)
        plt.figure(figsize=(8, 6))  # create a new figure for each plot
        plt.title('Cross Entropy Loss', pad = 20)
        plt.xlabel('Epochs', labelpad = 20)
        plt.ylabel('Loss AU')
        plt.plot(epochs, xclosses, label = "x_to_c", linewidth=2)
        plt.tight_layout()
        plt.legend()
        plt.savefig(f"flat_seq_x_to_c_{cd}_{yd}_{epoch_cnt}_{l}_{l_2}_{num_concepts}.png", bbox_inches='tight') # save the plot to disk
        plt.close() # close the figure to free up memory

        plt.figure(figsize=(8, 6))  # create a new figure for each plot
        plt.title('Cross Entropy Loss', pad = 20)
        plt.xlabel('Epochs', labelpad = 20)
        plt.ylabel('Loss AU')
        plt.plot(epochs, cylosses, label = "c_to_y", linewidth=2)
        plt.tight_layout()
        plt.legend()
        plt.savefig(f"flat_seq_c_to_y_{cd}_{yd}_{epoch_cnt}_{l}_{l_2}_{num_concepts}.png", bbox_inches='tight') # save the plot to disk
        plt.close() # close the figure to free up memory

        calculate_and_save_accuracy(num_concepts, epoch_cnt, l, l_2, pred_c_list, c, f"c_{cd}_{yd}")

        # Number of classes
        y_classes = 50

        # Create tensors to store class accuracies, false positives, and false negatives
        y_accuracies = torch.zeros(y_classes)
        y_false_positives = torch.zeros(y_classes)
        y_false_negatives = torch.zeros(y_classes)

        # Create variables to store total correct and total comparisons for overall accuracy
        y_total_correct = 0
        y_total_comparisons = 0

        # Define file path for saving results
        file_path2 = f"label_accuracy_flat_seq_{cd}_{yd}_{epoch_cnt}_{l}_{l_2}_{num_concepts}.txt"

        # Correct shapes
        pred_y_tensor = torch.tensor(pred_y_list)
        pred_y = pred_y_tensor.view(-1, pred_y_tensor.size(-1))
        y_flat = torch.cat(y, dim=0)

        # Restrict the data to the last 500 samples
        pred_y = pred_y[-500:]
        y_flat = y_flat[-500:]

        # Get the predicted class indices
        predicted_classes = torch.argmax(pred_y, dim=1)

        # Calculate class accuracies, false positives, and false negatives
        for class_idx in range(y_classes):
            correct = torch.sum((predicted_classes == class_idx) & (y_flat[:, class_idx] == 1))
            total = torch.sum(y_flat[:, class_idx] == 1)
            false_positive = torch.sum((predicted_classes == class_idx) & (y_flat[:, class_idx] == 0))
            false_negative = torch.sum((predicted_classes != class_idx) & (y_flat[:, class_idx] == 1))

            # Calculate class accuracy as a percentage
            class_accuracy = (correct.float() / total) * 100
            y_accuracies[class_idx] = class_accuracy

            # Calculate false positives and false negatives as percentages
            y_false_positives[class_idx] = (false_positive.float() /  torch.sum(y_flat[:, class_idx] == 0)) * 100
            y_false_negatives[class_idx] = (false_negative.float() / total) * 100

            # Update total correct and total comparisons for overall accuracy
            y_total_correct += correct
            y_total_comparisons += total

        # Calculate overall accuracy
        overall_accuracy = (y_total_correct.float() / y_total_comparisons) * 100

        # Open the file in write mode
        with open(file_path2, "w") as file:
            # Write class-wise accuracies, false positives, and false negatives to the file
            for class_idx, accuracy in enumerate(y_accuracies):
                false_positive = y_false_positives[class_idx]
                false_negative = y_false_negatives[class_idx]
                file.write(f'Class {class_idx}: Accuracy = {accuracy.item():.2f}%, False Positives = {false_positive.item():.2f}%, False Negatives = {false_negative.item():.2f}%\n')

            # Write overall accuracy to the file
            file.write(f'Overall Accuracy: {overall_accuracy.item():.2f}%\n')




  pred_c_tensor = torch.tensor(pred_c_list)


In [None]:
import glob
import zipfile
from google.colab import files

# Retrieve a list of file paths that match a pattern
file_pattern = '*.png'  # Replace with your desired pattern
file_paths = glob.glob(file_pattern)

# Create a zip file
zip_filename = 'plots.zip'
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    # Add all the files to the zip
    for file_path in file_paths:
        zipf.write(file_path)

# Download the zip file
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Retrieve a list of file paths that match a pattern
file_pattern = '*.txt'  # Replace with your desired pattern
file_paths = glob.glob(file_pattern)

# Create a zip file
zip_filename = 'accuracies.zip'
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    # Add all the files to the zip
    for file_path in file_paths:
        zipf.write(file_path)

# Download the zip file
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>