In [63]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torchvision import models
class Net(pl.LightningModule):
    def __init__(
        self,
        model,
        batch_size,
        epochs,
        data_path="data/debug",
        lr=1e-4
    ):
        super(Net,self).__init__()
        self.batch_size = batch_size
        self.epochs = epochs
        self.data_path=data_path[:-1] if data_path[-1]=="/" else data_path
        self.lr = lr
        self.model=model
        self.loss_fn=nn.CrossEntropyLoss()
    def forward(
        self,x
    ):
        o=self.model(x)
        return o
    def train_dataloader(self) :
        return DataLoader(
            ImageFolder(
                self.data_path+"/train",
                transform=transforms.Compose(
                    [
                        transforms.ToTensor(),
                        transforms.RandomResizedCrop([224,224]),
                    ]
                )
            ),
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=4,
        )
    def val_dataloader(self) :
        return DataLoader(
            ImageFolder(
                self.data_path+"/valid",
                transform=transforms.Compose(
                    [transforms.ToTensor(),transforms.Resize([224,224])]
                )
            ),
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=4,
        )    
    def configure_optimizers(self):
        optimizer = SGD(self.parameters(), lr=self.lr, weight_decay=0.001)
        return optimizer
    def training_step(self, batch, batch_nb):
        pred = self.forward(batch[0])
        loss=self.loss_fn(pred,batch[1])
        self.log(
            "train_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return loss
    def validation_step(self, batch, batch_nb):
        pred = self.forward(batch[0])
        loss=self.loss_fn(pred,batch[1])
        return loss
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack(outputs).mean()
        self.log(
            "val_loss",
            avg_loss,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return {"val_loss": avg_loss}



In [64]:
if __name__=='__main__':
    epochs=10
    output_path="runs/exp1"
    batch_size=2
    data_path="data/debug"
    lr=1e-3
    checkpoint_callback = ModelCheckpoint(
        dirpath=output_path,
        verbose=True,
        every_n_epochs=1,
        save_top_k=1,
        monitor="val_loss",
        mode="min",
    )
    trainer = pl.Trainer(
        default_root_dir=output_path,
        gradient_clip_val=1,
        max_epochs=epochs,
        gpus=1,
        callbacks=[checkpoint_callback],
        precision=32,
        progress_bar_refresh_rate=50
    )
    net = Net(
        models.resnet18(pretrained=True),
        batch_size,
        epochs,
        data_path=data_path,
        lr=lr,
    )
    trainer.fit(net)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | model   | ResNet           | 11.7 M
1 | loss_fn | CrossEntropyLoss | 0     
---------------------------------------------
11.7 M    Trainable params
0         Non-trainable params
11.7 M    Total params
46.758    Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Epoch 0:   0%|          | 0/136 [00:00<00:00, 500.93it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████| 136/136 [00:10<00:00, 12.50it/s, loss=8.81, v_num=7, train_loss_step=8.950, val_loss=11.20]

Epoch 0, global step 67: val_loss reached 11.17545 (best 11.17545), saving model to "C:\VScode\deep_learning\ifly_pic\runs\exp1\epoch=0-step=67.ckpt" as top 1


Epoch 1: 100%|██████████| 136/136 [00:10<00:00, 13.56it/s, loss=8.36, v_num=7, train_loss_step=8.430, val_loss=11.00, train_loss_epoch=8.720]

Epoch 1, global step 135: val_loss reached 11.02602 (best 11.02602), saving model to "C:\VScode\deep_learning\ifly_pic\runs\exp1\epoch=1-step=135.ckpt" as top 1


Epoch 2: 100%|██████████| 136/136 [00:11<00:00, 11.66it/s, loss=8.46, v_num=7, train_loss_step=9.320, val_loss=10.80, train_loss_epoch=8.510]

Epoch 2, global step 203: val_loss reached 10.83526 (best 10.83526), saving model to "C:\VScode\deep_learning\ifly_pic\runs\exp1\epoch=2-step=203.ckpt" as top 1


Epoch 3: 100%|██████████| 136/136 [00:10<00:00, 12.48it/s, loss=8.75, v_num=7, train_loss_step=9.490, val_loss=11.20, train_loss_epoch=8.520]

Epoch 3, global step 271: val_loss was not in top 1


Epoch 4: 100%|██████████| 136/136 [00:10<00:00, 13.65it/s, loss=8.15, v_num=7, train_loss_step=8.690, val_loss=11.00, train_loss_epoch=8.600]

Epoch 4, global step 339: val_loss was not in top 1


Epoch 5: 100%|██████████| 136/136 [00:09<00:00, 14.03it/s, loss=8.64, v_num=7, train_loss_step=8.570, val_loss=10.90, train_loss_epoch=8.530]

Epoch 5, global step 407: val_loss was not in top 1


Epoch 6: 100%|██████████| 136/136 [00:10<00:00, 12.78it/s, loss=8.75, v_num=7, train_loss_step=9.740, val_loss=11.20, train_loss_epoch=8.670]

Epoch 6, global step 475: val_loss was not in top 1


Epoch 7: 100%|██████████| 136/136 [00:10<00:00, 12.64it/s, loss=8.66, v_num=7, train_loss_step=7.450, val_loss=11.10, train_loss_epoch=8.870]

Epoch 7, global step 543: val_loss was not in top 1


Epoch 8: 100%|██████████| 136/136 [00:09<00:00, 13.85it/s, loss=8.48, v_num=7, train_loss_step=6.870, val_loss=11.20, train_loss_epoch=8.470]

Epoch 8, global step 611: val_loss was not in top 1


Epoch 9: 100%|██████████| 136/136 [00:09<00:00, 14.01it/s, loss=8.77, v_num=7, train_loss_step=8.560, val_loss=11.00, train_loss_epoch=8.470]

Epoch 9, global step 679: val_loss was not in top 1


Epoch 9: 100%|██████████| 136/136 [00:09<00:00, 13.99it/s, loss=8.77, v_num=7, train_loss_step=8.560, val_loss=11.00, train_loss_epoch=8.470]
