In [1]:
import torch, argparse, gzip, os, pickle
import numpy as np, pytorch_lightning as pl
from pytorch_lightning.loggers import MLFlowLogger
from mlflow.tracking.artifact_utils import get_artifact_uri
from pytorch_lightning.callbacks import ModelCheckpoint
from pathlib import Path
import mlflow

In [2]:
TRAIN_SAMPLES = 60000
TRAIN_PATH = "s2_mnist_train_dwr_" + str(TRAIN_SAMPLES) + ".gz"
TEST_PATH = "s2_mnist.gz"
# MAX_EPOCHS = 20
MAX_EPOCHS = 3

In [3]:
class ConvNet(pl.LightningModule):
#     def __init__(self, hparams, train_data, val_data, test_data):
    def __init__(self, hparams, train_data, test_data):
        super().__init__()
        
        self.hparams = hparams
        self.train_data = train_data
#         self.val_data = val_data
        self.test_data = test_data
        
        self.loss_function = torch.nn.CrossEntropyLoss()

        f1 = 32
        f2 = 64
        
        self.feature_layer = torch.nn.Sequential(
            torch.nn.Conv2d(1, f1, kernel_size=5, stride=3),
            torch.nn.ReLU(),
            torch.nn.Conv2d(f1, f2, kernel_size=5, stride=3),
            torch.nn.ReLU()
        )
        self.out_layer = torch.nn.Linear(f2 * 5**2, 10)
        
    def forward(self, x):
        x = self.feature_layer(x)
        x = x.view(x.shape[0], -1)
        x = self.out_layer(x)
        return x
    
    def loss(self, x, y_true):
        y_pred = self(x)
        loss = self.loss_function(y_pred, y_true)
        return loss
    
    def correct_predictions(self, x, y_true):
        outputs = self(x)
        _, y_pred = torch.max(outputs, 1)
        correct = (y_pred == y_true).long().sum()
        return correct
    
    def prepare_data(self):
        pass

    def train_dataloader(self):
        return torch.utils.data.DataLoader(dataset=self.train_data,
                                           batch_size=self.hparams.train_batch_size,
                                           shuffle=True, num_workers=self.hparams.num_workers)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(dataset=self.test_data,
                                           batch_size=self.hparams.test_batch_size,
                                           shuffle=False, num_workers=self.hparams.num_workers)
    
    def configure_optimizers(self):
        self._optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr,
                                            weight_decay=self.hparams.weight_decay, amsgrad=False)
        
        return {'optimizer': self._optimizer}

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = self.loss(x, y)
#         mlflow.log_metric('train_loss', loss.item())
#         self.log({'train_loss': loss.item()})
        # add logs
        return {'loss': loss}
    
    def training_epoch_end(self, outputs):
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean().cpu().item()
        return {'avg_loss': avg_loss}

    def test_step(self, batch, batch_idx):
        x, y = batch
        loss = self.loss(x, y)
        correct = self.correct_predictions(x, y)
        return {'test_loss': loss, 'test_correct': correct}
    
    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean().cpu().item()
        test_correct = torch.stack([x['test_correct'] for x in outputs]).sum().cpu()
        test_acc = test_correct / len(self.test_data)

        logs = {'test_loss': avg_loss, 'test_acc': test_acc}        
        return {'avg_test_loss': avg_loss, 'test_acc': test_acc, 'log': logs}

    def get_progress_bar_dict(self):
        # call .item() only once but store elements without graphs
        running_train_loss = self.trainer.running_loss.mean()
        avg_training_loss = running_train_loss.cpu().item() if running_train_loss is not None else float('NaN')
        lr = self.hparams.lr

        tqdm_dict = {
            'loss': '{:.2E}'.format(avg_training_loss),
            'lr': '{:.2E}'.format(lr),
        }

        if self.trainer.truncated_bptt_steps is not None:
            tqdm_dict['split_idx'] = self.trainer.split_idx

        if self.trainer.logger is not None and self.trainer.logger.version is not None:
            tqdm_dict['v_num'] = self.trainer.logger.version

        return tqdm_dict

    
    def count_trainable_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
    
    def count_parameters(self):
        return sum(p.numel() for p in self.parameters())

In [4]:
def load_train_data(path):
    
    with gzip.open(path, 'rb') as f:
        dataset = pickle.load(f)
        
    train_data = torch.from_numpy(dataset["images"][:, None, :, :].astype(np.float32))
    train_labels = torch.from_numpy(dataset["labels"].astype(np.int64))

    train_dataset = torch.utils.data.TensorDataset(train_data, train_labels)
    
    return train_dataset
    
def load_test_data(path):
    
    with gzip.open(path, 'rb') as f:
        dataset = pickle.load(f)
        
    test_data = torch.from_numpy(dataset["test"]["images"][:, None, :, :].astype(np.float32))
    test_labels = torch.from_numpy(dataset["test"]["labels"].astype(np.int64))

    test_dataset = torch.utils.data.TensorDataset(test_data, test_labels)
    
    return test_dataset

In [5]:
if torch.cuda.is_available():
    print('GPU available: ' + torch.cuda.get_device_name())
else:
    raise RuntimeError('No GPU found.')

GPU available: NVIDIA GeForce RTX 2070 SUPER


In [6]:
# loading the data
train_data, test_data = load_train_data(TRAIN_PATH), load_test_data(TEST_PATH)

print("Total training examples: {}".format(len(train_data)))
print("Total test examples: {}".format(len(test_data)))

hparams = argparse.Namespace()

hparams.train_batch_size = 32
hparams.test_batch_size = 32
hparams.num_workers = 0
hparams.lr = 5e-4
hparams.weight_decay = 0.

Total training examples: 60000
Total test examples: 10000


In [7]:
tracking_uri='sqlite:///ml-runs/database.db'
# mlf_logger = MLFlowLogger(experiment_name='test_log', tracking_uri='file:./ml-runs')
mlf_logger = MLFlowLogger(experiment_name='test_log', tracking_uri='sqlite:///ml-runs/database.db')

print(mlf_logger.run_id)

artifact_dir = get_artifact_uri(
    run_id=mlf_logger.run_id, tracking_uri=tracking_uri
)
"""checkpoint_callback = ModelCheckpoint(
    dirpath=Path(artifact_dir) / 'checkpoints'
)""";

11c605d58e264454a27e2df6c979463d


In [8]:
model = ConvNet(hparams, train_data, test_data)

print(f"Number of trainable / total parameters: {model.count_trainable_parameters(), model.count_trainable_parameters()}")

trainer = pl.Trainer(gpus=1, max_epochs=MAX_EPOCHS, logger=mlf_logger)

trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


Number of trainable / total parameters: (68106, 68106)



  | Name          | Type             | Params
---------------------------------------------------
0 | loss_function | CrossEntropyLoss | 0     
1 | feature_layer | Sequential       | 52 K  
2 | out_layer     | Linear           | 16 K  


Training: 0it [00:00, ?it/s]

1

In [9]:
trainer.test(model)



Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_loss': 1.6359615325927734,
 'test_acc': tensor(0.4125),
 'test_loss': 1.6359615325927734}
--------------------------------------------------------------------------------


{'avg_test_loss': 1.6359615325927734,
 'test_acc': 0.4124999940395355,
 'test_loss': 1.6359615325927734}

In [10]:
"""DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
NUM_EPOCHS = 3
BATCH_SIZE = model.hparams.train_batch_size

model = ConvNet(hparams, train_data, test_data)
train_loader = model.train_dataloader()
test_loader = model.test_dataloader()

model.to(DEVICE)


criterion = torch.nn.CrossEntropyLoss()
criterion = criterion.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=model.hparams.lr)

for epoch in range(NUM_EPOCHS):
    for i, (images, labels) in enumerate(train_loader):
        model.train()

        images = images.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()

        optimizer.step()

        print('\rEpoch [{0}/{1}], Iter [{2}/{3}] Loss: {4:.4f}'.format(
            epoch+1, NUM_EPOCHS, i+1, len(model.train_data)//BATCH_SIZE,
            loss.item()), end="")
    print("")
    correct = 0
    total = 0
    for i, (images, labels) in enumerate(test_loader):
        model.eval()

        with torch.no_grad():
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).long().sum().item()

    print('Test Accuracy: {0}'.format(100 * correct / total))"""

Epoch [1/3], Iter [604/1875] Loss: 2.2270

KeyboardInterrupt: 

In [None]:
# the_model = ConvNet(hparams, train_data, test_data)
# the_model.load_state_dict(torch.load("baseline.pt"))

In [None]:
"""criterion = torch.nn.CrossEntropyLoss()
for i, (images, labels) in enumerate(the_model.train_dataloader()):
    the_model.eval()

    images = images
    labels = labels

    outputs = the_model(images)
    loss = criterion(outputs, labels)
    print(outputs.shape)
    print(labels.shape)
    print(loss)
    _, predicted = torch.max(outputs, 1)
    total = labels.size(0)
    correct = (predicted == labels).long().sum().item()
    print(100 * correct / total)
    raise RuntimeError()"""

In [None]:
# trainer.test(the_model)

In [None]:
# The testing seems to be correct, since I can load and test a trained model and the test results agree.
# The loss function also seems to be correct.
# I guess, there is something fishy going on in the training.