In [1]:
import torch
import torch.nn as nn

import json
from tqdm import tqdm

import src.models as mdls
import src.utils as utils

# Read the run options once; every later cell pulls its settings from `config`.
config = utils.get_options()

# Choose the environment setup step from the `use_colab` option and run it.
# NOTE(review): what each helper actually does lives in src.utils — presumably
# Colab-specific setup vs. a check that the local datasets exist; confirm there.
setup_step = utils.colab if config['use_colab'] else utils.check_if_datasets_are_downloaded
setup_step()

import src.dataloaders as dataloaders
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

Moving kaggle file

all datasets are in place


In [2]:
# Image size taken from the run configuration.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
img_size = config['img_size']

In [3]:
# The dataloader is a ready-made object exposed by the project's dataloaders module.
dataloader = dataloaders.dataloader

# Instantiate the network with its default constructor arguments.
model = mdls.InspectorGadjet()

# Display the dataloader length as the cell output.
len(dataloader)

4856

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Echo the selected device as the cell output.
device

device(type='cpu')

In [5]:
import os

# Training checkpoints are written under this directory.
checkpoint_dir = "checkpoints/"

# Create it up front; exist_ok makes re-running this cell a no-op.
os.makedirs(checkpoint_dir, exist_ok=True)

In [6]:
# Train against the project's combined loss; a plain MSE is kept as a commented alternative.
# loss_fn = nn.MSELoss()
loss_fn = mdls.combined_loss

# Adam over all model parameters with a fixed learning rate.
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Move the model to the selected device; the trailing semicolon keeps the
# notebook from echoing the call's return value.
# (loss_fn is a plain callable here, so the former `loss_fn.to(device)` stays disabled.)
model.to(device);

In [7]:
# Display len() of the dataloader again before training starts.
# NOTE(review): presumably the number of batches per epoch — confirm in src.dataloaders.
len(dataloader)

4856

In [8]:
# Training cell: optionally opens a Comet ML experiment, trains `model` for
# `epochs` passes over `dataloader`, writes a checkpoint after every epoch and
# (when logging is enabled) uploads metrics and model weights to Comet.
epochs = config['epochs']
logging = config['logging']  # truthy -> report the run to Comet ML
log_interval = config["log_wieghts_interval"]  # key spelling ("wieghts") must match the project config

# Fail fast on an empty dataloader: the per-epoch mean below divides by this
# value, and failing here avoids opening an experiment for a run that cannot train.
steps_per_epoch = len(dataloader)
if steps_per_epoch == 0:
    raise ValueError("dataloader is empty; nothing to train on")

# Stays None when logging is disabled (or if creating the experiment fails), so
# the `finally` clause below knows whether there is anything to close.
experiment = None

try:
    if logging:
        # read secrets for Comet ML logging
        with open('secrets.json') as secrets_file:
            secrets = json.load(secrets_file)

        # init experiment
        experiment = Experiment(
            api_key=secrets["api_key"],
            project_name=secrets["project_name"],
            workspace="reu-ds-club",
        )

        hyper_params = {
            "model_name": config["model"],
            "use_colab": config['use_colab'],
            "epochs": epochs,
            "batch_size": config['batch_size'],
            "image_size": config['img_size'],
        }

        experiment.log_parameters(hyper_params)

    for epoch in range(epochs):
        # Running sum of the per-step loss values for this epoch.
        epoch_loss = 0.0
        for sample in (pbar := tqdm(dataloader)):

            # sample[0] is fed to the model, sample[1] is the target passed to the loss;
            # both are moved to the training device and cast to float32.
            img = sample[0].to(device).to(torch.float32)
            box = sample[1].to(device).to(torch.float32)

            optimizer.zero_grad()
            pred = model(img)

            # The model returns (at least) two outputs; the combined loss takes both plus the target.
            loss = loss_fn(pred[0], pred[1], box)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Per-step mean: this is the value shown to the user.
        mean_loss = epoch_loss / steps_per_epoch
        print(f"Epoch: {epoch}\tLoss: {mean_loss}")

        # Checkpoint after every epoch so an interrupted run can be resumed.
        # 'loss' keeps its original meaning (summed loss); 'mean_loss' is added alongside it.
        checkpoint_filename = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pth")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss,
            'mean_loss': mean_loss,
        }, checkpoint_filename)

        if logging:
            # "loss" is the summed epoch loss (unchanged); "mean_loss" matches the printed value.
            experiment.log_metric("loss", epoch_loss, step=epoch)
            experiment.log_metric("mean_loss", mean_loss, step=epoch)

            # Upload model weights every `log_interval` epochs and always on the last epoch.
            # A zero/empty interval disables the periodic upload instead of raising
            # ZeroDivisionError after a full epoch of training.
            is_interval_epoch = bool(log_interval) and epoch % log_interval == 0
            if is_interval_epoch or epoch == epochs - 1:
                # Serialises the whole model object (not just its state_dict).
                torch.save(model, 'model.pth')
                # NOTE(review): the same file is uploaded twice, once as a model and once as an asset.
                experiment.log_model(name = f"model-epoch-{epoch}", file_or_folder = 'model.pth', file_name = f"model-epoch-{epoch}")
                experiment.log_asset(file_data = 'model.pth', file_name = f"model-epoch-{epoch}")
                print("save model")
finally:
    # Always close the Comet experiment, including when training is interrupted
    # (e.g. KeyboardInterrupt from stopping the cell) or fails with an error.
    if experiment is not None:
        experiment.end()



  2%|▏         | 91/4856 [00:14<12:49,  6.19it/s]


KeyboardInterrupt: 