# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
# %reload_ext autoreload
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)
# torch.cuda.empty_cache()



In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
import time
from itertools import islice
from dataclasses import dataclass
from torchvision.models import densenet161, DenseNet161_Weights
import os

# pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
# import warnings
# warnings.filterwarnings('ignore')
# C:/Users/sshar/AppData/Roaming/jupyter/nbextensions/snippets /snippets.json (jupyter --data-dir)

In [6]:
from dataset import CheXpertDataset
import utils
from utils import vprint
from utils import to_gpu

# Configs 

In [7]:
@dataclass
class TrainingConfigs:
    DATA_DIR = os.path.join("..", "data", "CheXpert")
    CHECKPOINT_DIR = r"checkpoints"
    BATCH_SIZE = 8
    EPOCHS = 10
    LEARNING_RATE = 0.0001
    CHECKPOINT_TIME_INTERVAL = 30*60 # seconds
    MODEL_VERSION = "densenet161"
    TRAINED_MODEL_PATH = None
    TRAIN_LOADER_SIZE = None
    VALID_LOADER_SIZE = None
    VALID_SIZE = -1 # for debugging purposes

In [8]:
utils.set_seed()

# Training

## Training Setup

In [9]:
train_transform = transforms.Compose([
    transforms.Resize((320,320)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
valid_transform = transforms.Compose([
    transforms.Resize((320,320)),
    transforms.ToTensor(), 
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

In [10]:
# Create data loaders.
train_dataset = CheXpertDataset(mode='train', data_dir=TrainingConfigs.DATA_DIR, transform=train_transform)
train_dataloader = DataLoader(train_dataset, batch_size=TrainingConfigs.BATCH_SIZE, shuffle=True)
TrainingConfigs.TRAIN_LOADER_SIZE = len(train_dataloader)
len(train_dataset)

223414

In [11]:
valid_dataset = CheXpertDataset(mode='valid', data_dir=TrainingConfigs.DATA_DIR, transform=valid_transform)
valid_dataset.labels = valid_dataset.labels[:TrainingConfigs.VALID_SIZE] # hack for speed debugging
valid_dataloader = DataLoader(valid_dataset, batch_size=TrainingConfigs.BATCH_SIZE, shuffle=False)
TrainingConfigs.VALID_LOADER_SIZE = len(valid_dataloader)
len(valid_dataset)

233

In [12]:
# torch.hub._validate_not_a_forked_repo = lambda a,b,c: True # workaround for torch.hub
# model = torch.hub.load('pytorch/vision:v0.10.0', 'densenet121', pretrained=True)
model = densenet161(weights=DenseNet161_Weights.DEFAULT)

In [13]:
num_features = model.classifier.in_features
model.classifier = nn.Sequential(
    nn.Linear(num_features, num_features, bias=True),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(in_features=num_features, out_features=utils.Configs.NUM_CLASSES, bias=True)
)

In [14]:
optimizer = torch.optim.Adam(model.parameters(), lr=TrainingConfigs.LEARNING_RATE, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5, mode='min')
criterion = nn.BCEWithLogitsLoss(reduction='mean') # combines BCEntropy and sigmoid
# final nn labels: torch.round(torch.sigmoid(pred))
# simple solution to handle the multi label problem (probabilities don't have to sum to 1)

## Training Loop 

In [None]:
model, results, last_epoch, last_iter = utils.get_previos_training_place(model, TrainingConfigs)
model.train()
model = to_gpu(model)
start_time = time.time()
for epoch in range(last_epoch, TrainingConfigs.EPOCHS):
    train_dataloader_iter = islice(tqdm(enumerate(train_dataloader), total=len(train_dataloader)), 
                                   last_iter+1, len(train_dataloader)) # fast foward dataloader
    for i, (images, labels) in train_dataloader_iter:
        images = to_gpu(images)
        labels = to_gpu(labels)        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        results['train_loss'].append(loss.item())
        if time.time()-start_time > TrainingConfigs.CHECKPOINT_TIME_INTERVAL:
            utils.create_checkpoint(model, epoch, i, valid_dataloader, criterion, results, TrainingConfigs)
            start_time = time.time()
    scheduler.step(np.mean(results["valid_loss"][-len(train_dataloader):]))

  0%|          | 0/27927 [00:00<?, ?it/s]

Bad pipe message: %s [b'\x80\xbct\x98\xbb\x03D\xff-v;\xfa:=\xeaI\x16\xf8 \xbe,fs\xdc1L\x9b\x1a\x98\x81n\xa2g\xb4\xe5\xc1\x9e\r\xda\x08P\x8b\x10\x97\x17\xc1J\xfcX\xce\x87\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&']
Bad pipe message: %s [b"\xd4\xaa\xdcl\xf7\x0b\x15\xd0\xa3\xe1\xb9~O\xad\xcc\xb7\xdd\xca\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0", b'9\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q']
Bad pipe mes

2022-07-08 11:32: 2022_07_08-11_32: Checkpoint Created.
2022-07-08 11:32: Epoch [1/10],   Iter [1792/27926],   Train Loss: 0.4298,   Valid Loss: 1.0759,   Valid AUC: 0.8557

2022-07-08 12:02: 2022_07_08-12_02: Checkpoint Created.
2022-07-08 12:02: Epoch [1/10],   Iter [3880/27926],   Train Loss: 0.4036,   Valid Loss: 1.1356,   Valid AUC: 0.8301

2022-07-08 12:32: 2022_07_08-12_32: Checkpoint Created.
2022-07-08 12:32: Epoch [1/10],   Iter [6192/27926],   Train Loss: 0.4153,   Valid Loss: 1.0323,   Valid AUC: 0.8736

