# Imports

In [None]:
# %load_ext autoreload
# %autoreload 2

In [2]:
# %reload_ext autoreload

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
import time
from itertools import islice
from dataclasses import dataclass
import torchvision
from torchvision.models import densenet161, DenseNet161_Weights, vit_b_16, ViT_B_16_Weights, densenet121, DenseNet121_Weights
import os
import sys
from pathlib import Path

In [4]:
sys.path.append(str(Path.cwd().parent.parent))
from CheXpert.disease_prediction.dataset import CheXpertDiseaseDataset
from shared_utils import vprint, to_gpu
import shared_utils
from CheXpert.disease_prediction.utils import Configs
from CheXpert.race_prediction.utils import Configs as RaceConfigs
from CheXpert.race_prediction.dataset import CheXpertRaceDataset

# Configs 

In [5]:
@dataclass
class TrainingConfigs(Configs):
    DATA_DIR = os.path.join("..", "..", "data", "CheXpert", "CheXpert-v1.0-small")
    TRAIN_LABELS_ORIGINAL_FILENAME = "train.csv"
    VALID_LABELS_ORIGINAL_FILENAME = "valid.csv"
    TRAIN_LABELS_FILENAME = "train_demo40_no_u_stratifiedV3.csv"
    VALID_LABELS_FILENAME = "valid_demo40_no_u_stratifiedV3.csv"
    DEMO_FILENAME = "CHEXPERT DEMO.csv"
    RACE_DICT = RaceConfigs.RACE_DICT
    SAMPLE_NUM_STUDIES_PER_GROUP = 40
    CHECKPOINT_DIR = r"checkpoints"
    BATCH_SIZE = 16
    EPOCHS = 10
    SAMPLE_WEIGHT_FACTOR = 0.6
    LEARNING_RATE = 1e-4
    LEARNING_RATE_REDUCE_PATIENCE = 3 # number of epochs with no improvement before reducing LR
    LEARNING_RATE_REDUCING_FACTOR = 0.5
    LEARNING_RATE_MIN_VAL = 1e-5
    CHECKPOINT_TIME_INTERVAL = 60*60 # seconds
    MODEL_VERSION = "densenet121_disease_demo40_no_u_stratifiedV3"
    TRAINED_MODEL_PATH = None
    TRAIN_LOADER_SIZE = None
    VALID_LOADER_SIZE = None

In [6]:
shared_utils.set_seed(TrainingConfigs.SEED)

In [7]:
if torch.cuda.is_available():
    vprint(f"Memory info: {torch.cuda.mem_get_info()[0]/10e8:.1f} GB free GPU.", TrainingConfigs)
else: 
    vprint(f"No GPU Memory.", TrainingConfigs)

2022-08-21 21:01: Memory info: 8.5 GB free GPU.


In [8]:
# split train label file to train/valid files, where the valid file is consists of SAMPLE_NUM_PATIENTS_PER_GORUP patients
# for each race, gender, and age groups
# saves train/valid new created file to TRAIN_LABELS_FILENAME and VALID_LABELS_FILENAME, respectively

if TrainingConfigs.SAMPLE_NUM_STUDIES_PER_GROUP is not None:
    
    # loading demo file
    df_demo = CheXpertRaceDataset.generate_race_dummies(pd.read_csv(os.path.join(TrainingConfigs.DATA_DIR,
                                                                             TrainingConfigs.DEMO_FILENAME)),
                                                    'PRIMARY_RACE', TrainingConfigs.RACE_DICT)
    
    # loading train label file and joining demo attributes
    df_train = pd.read_csv(os.path.join(TrainingConfigs.DATA_DIR, TrainingConfigs.TRAIN_LABELS_ORIGINAL_FILENAME))
    df_train['patient_id'] = df_train.Path.apply(lambda p: p.split("/")[2])
    df_train = df_train.merge(df_demo, how='left', left_on='patient_id', right_on='PATIENT')
    df_train['age'] = df_train.Age.apply(shared_utils.age_to_age_group)
    df_train['gender'] = df_train.Sex
    
    # filter train to only patients with no uncertainty labels 
    is_patient_all_confident_labels = df_train.groupby(['patient_id'])[Configs.ANNOTATIONS_COLUMNS].apply(lambda s: (s==-1).sum(axis=1).sum(axis=0)==0)
    confidence_patients = np.array(is_patient_all_confident_labels[is_patient_all_confident_labels].index)
    df_c_train = df_train[df_train.patient_id.isin(confidence_patients)]
    df_c_train[Configs.ANNOTATIONS_COLUMNS] = df_c_train[Configs.ANNOTATIONS_COLUMNS].fillna(0)
    
    # multi disease samples - prior in the training data
    num_ones_per_study_proba = (df_c_train[Configs.ANNOTATIONS_COLUMNS]).sum(axis=1).value_counts()/len(df_c_train)
    
    # sample SAMPLE_NUM_PATIENTS_PER_GORUP per race, gender, age group
    # stratified sampling according to ANNOTATIONS_COLUMNS
    
    # fractions of 0 and 1 in training
    studies_per_protected_group_ones = df_c_train.groupby(['race', 'gender', 'age'])[Configs.ANNOTATIONS_COLUMNS].sum()
    studies_per_protected_group_zeros = df_c_train.groupby(['race', 'gender', 'age'])[Configs.ANNOTATIONS_COLUMNS].agg(lambda s: sum(s==0))
    studies_per_protected_group_ones = studies_per_protected_group_ones.merge(df_c_train.groupby(['race', 'gender', 'age']).size().to_frame(name="population_size"), left_index=True, right_index=True)
    studies_per_protected_group_zeros = studies_per_protected_group_zeros.merge(df_c_train.groupby(['race', 'gender', 'age']).size().to_frame(name="population_size"), left_index=True, right_index=True)
    
    # from each race, age, gender and label in one of the label columns - sample the same fraction as in training.
    df_list = []
    chosen = set()
    for attrs1, df_group1 in df_c_train.groupby(['race', 'gender', 'age']):
        for col in Configs.ANNOTATIONS_COLUMNS:
            for attrs2, df_group2 in df_group1.groupby(col, sort=True):
                if attrs2==0:
                    frac = studies_per_protected_group_zeros.loc[attrs1][col] / studies_per_protected_group_zeros.loc[attrs1]['population_size']
                    n =  int(np.floor(TrainingConfigs.SAMPLE_NUM_STUDIES_PER_GROUP * frac))
                else: 
                    frac = studies_per_protected_group_ones.loc[attrs1][col] / studies_per_protected_group_ones.loc[attrs1]['population_size']
                    n =  int(np.ceil(TrainingConfigs.SAMPLE_NUM_STUDIES_PER_GROUP * frac))
                chosen_filter = df_group2.index.isin(chosen)
                df_group2 = df_group2[~chosen_filter]
                n = max(0, n-sum(chosen_filter))
                weights = num_ones_per_study_proba[(df_group2[TrainingConfigs.ANNOTATIONS_COLUMNS]==1).sum(axis=1)].values
                df_list.append(df_group2.sample(n, weights=weights, replace=False, random_state=Configs.SEED))
                chosen.update(set(df_list[-1].index))
        sampled_studies = pd.concat(df_list)
        sampled_studies.groupby(['race', 'gender', 'age']).size()
    
    # filter valid patients from training
    df_valid_patients = sampled_studies.patient_id.unique()
    
    # saving files
    df_train_group_split = df_train[~df_train.patient_id.isin(df_valid_patients)]
    df_valid_group_split = sampled_studies
    df_train_group_split.to_csv(os.path.join(TrainingConfigs.DATA_DIR, TrainingConfigs.TRAIN_LABELS_FILENAME), index=False)
    df_valid_group_split.to_csv(os.path.join(TrainingConfigs.DATA_DIR, TrainingConfigs.VALID_LABELS_FILENAME), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_c_train[Configs.ANNOTATIONS_COLUMNS] = df_c_train[Configs.ANNOTATIONS_COLUMNS].fillna(0)


# Training

## Training Setup

In [9]:
train_transform = transforms.Compose([
    transforms.Resize((320,320)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    # augmentation
    transforms.RandomHorizontalFlip(p=0.25),
    transforms.RandomApply([transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.01)], p=0.1),
    transforms.RandomApply([torchvision.transforms.GaussianBlur(kernel_size=(3,3) ,sigma=(0.25, 0.75))], p=0.1),
    torchvision.transforms.RandomAdjustSharpness(sharpness_factor=0.75, p=0.1),
    torchvision.transforms.RandomAdjustSharpness(sharpness_factor=1.25, p=0.1),
])

valid_transform = transforms.Compose([
    transforms.Resize((320,320)),
    transforms.ToTensor(), 
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [10]:
# Create data loaders.
train_dataset = CheXpertDiseaseDataset(data_dir=TrainingConfigs.DATA_DIR, 
                                       labels_filename=TrainingConfigs.TRAIN_LABELS_FILENAME,
                                       transform=train_transform,
                                       sample_weight_factor=TrainingConfigs.SAMPLE_WEIGHT_FACTOR)
train_dataloader = DataLoader(train_dataset, batch_size=TrainingConfigs.BATCH_SIZE, shuffle=False)
TrainingConfigs.TRAIN_LOADER_SIZE = len(train_dataloader)
len(train_dataset)

220026

In [11]:
valid_dataset = CheXpertDiseaseDataset(data_dir=TrainingConfigs.DATA_DIR, 
                                       labels_filename=TrainingConfigs.VALID_LABELS_FILENAME,
                                       transform=valid_transform)
valid_dataloader = DataLoader(valid_dataset, batch_size=TrainingConfigs.BATCH_SIZE, shuffle=False)
TrainingConfigs.VALID_LOADER_SIZE = len(valid_dataloader)
len(valid_dataset)

1196

In [12]:
valid_dataset.df_labels.groupby(['race', 'gender', 'age']).size()

race      gender  age  
Asian     Female  20-40    48
                  40-70    52
                  70-90    54
          Male    20-40    44
                  40-70    51
                  70-90    52
Black     Female  20-40    45
                  40-70    47
                  70-90    50
          Male    20-40    53
                  40-70    49
                  70-90    49
Hispanic  Female  20-40    48
                  40-70    54
                  70-90    54
          Male    20-40    48
                  40-70    52
                  70-90    52
White     Female  20-40    49
                  40-70    45
                  70-90    54
          Male    20-40    49
                  40-70    47
                  70-90    50
dtype: int64

In [13]:
org_valid_dataset = CheXpertDiseaseDataset(data_dir=TrainingConfigs.DATA_DIR, 
                                           labels_filename=TrainingConfigs.VALID_LABELS_ORIGINAL_FILENAME,
                                           transform=valid_transform)
org_valid_dataloader = DataLoader(org_valid_dataset, batch_size=TrainingConfigs.BATCH_SIZE, shuffle=False)
len(org_valid_dataset)

234

In [14]:
model = densenet121(weights=DenseNet121_Weights.DEFAULT)
num_features = model.classifier.in_features
model.classifier = nn.Sequential(
    nn.Linear(num_features, num_features, bias=True),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(in_features=num_features, out_features=TrainingConfigs.NUM_CLASSES, bias=True)
)

In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=TrainingConfigs.LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=TrainingConfigs.LEARNING_RATE_REDUCING_FACTOR,
                                                       patience=TrainingConfigs.LEARNING_RATE_REDUCE_PATIENCE, mode='min',
                                                       min_lr=TrainingConfigs.LEARNING_RATE_MIN_VAL)
criterion = nn.BCEWithLogitsLoss(reduction='mean') # combines BCEntropy and sigmoid
# final nn labels: torch.round(torch.sigmoid(pred))
# simple solution to handle the multi label problem (probabilities don't have to sum to 1)

In [16]:
criterion = nn.BCEWithLogitsLoss(reduction='none') # combines BCEntropy and sigmoid

## Training Loop 

In [17]:
checkpoint_obj = shared_utils.get_previous_training_place(model, optimizer, scheduler, criterion, TrainingConfigs)
model, optimizer, scheduler, criterion, results, last_epoch, last_iter = checkpoint_obj
score_dict = {
    "auc": "valid_auc",
    "original_valid_auc": "org_valid_auc",
    "loss": "valid_loss"
}
model.train()
model = to_gpu(model)
start_time = time.time()
shared_utils.start_training_msg(TrainingConfigs)
train_loss_list = []
apply_on_outputs = lambda x: torch.sigmoid(x)
for epoch in range(last_epoch, TrainingConfigs.EPOCHS):
    train_dataloader_iter = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    if last_iter > -1:
        # fast foward dataloader
        train_dataloader_iter = islice(train_dataloader_iter, last_iter+1, len(train_dataloader))
        last_iter = -1
    for i, (images, labels, weights) in train_dataloader_iter:
        images = to_gpu(images)
        labels = to_gpu(labels)
        weights = to_gpu(weights)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        weights = weights[:, None]
        loss = (loss * (weights/weights.sum())).mean()
        loss.backward()
        optimizer.step()
        train_loss_list.append(loss.item())
        if time.time()-start_time > TrainingConfigs.CHECKPOINT_TIME_INTERVAL:
            results['train_loss'].append(sum(train_loss_list)/len(train_loss_list))
            train_loss_list = []
            shared_utils.create_checkpoint(model, optimizer, scheduler, criterion, epoch, i, valid_dataloader,
                                           results, TrainingConfigs, score_dict, apply_on_outputs=apply_on_outputs,
                                           by_study=None, challenge_ann_only=None, org_valid_dataloader=org_valid_dataloader)
            model.train()
            start_time = time.time()
    shared_utils.create_checkpoint(model, optimizer, scheduler, criterion, epoch, len(train_dataloader), valid_dataloader,
                                   results, TrainingConfigs, score_dict, apply_on_outputs= apply_on_outputs, 
                                   by_study=None, challenge_ann_only=None, org_valid_dataloader=org_valid_dataloader)
    scheduler.step(results["valid_loss"][-1])

2022-08-21 21:02: 
2022-08-21 21:02: ----------------------------------------------------------------------------------------------------
2022-08-21 21:02: ----------------------------------------------------------------------------------------------------
2022-08-21 21:02: 
2022-08-21 21:02: Start Training


  0%|          | 0/13752 [00:00<?, ?it/s]

2022-08-21 21:02: 2022_08_21-21_02: Checkpoint Created For densenet121_disease_demo40_no_u_stratifiedV3.
2022-08-21 21:02: Epoch [0/10],   Iter [33/13751],   Train Loss: -0.4843,   Valid Loss: 0.7389,   Valid AUC: 0.6498



KeyboardInterrupt: 