# Load dataset from Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir -p ./data/VNOnDB

In [3]:
!unzip './drive/My Drive/VNOnDB/word_train.zip' -d ./data/VNOnDB >> log_extract.txt
print('Extracted word_train.zip')

replace ./data/VNOnDB/word_train/20140927_0017_6046_1_tg_4_4_1.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
N
N
Extracted word_train.zip


In [4]:
!unzip './drive/My Drive/VNOnDB/word_val.zip' -d ./data/VNOnDB >> log_extract.txt
print('Extracted word_val.zip')

replace ./data/VNOnDB/word_val/20151224_0141_7818_1_tg_0_0_0.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Extracted word_val.zip


In [5]:
!unzip './drive/My Drive/VNOnDB/word_test.zip' -d ./data/VNOnDB >> log_extract.txt
print('Extracted word_test.zip')

replace ./data/VNOnDB/word_test/20151208_0146_7105_1_tg_0_0_0.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Extracted word_test.zip


In [6]:
!cp './drive/My Drive/VNOnDB/train_word.csv' ./data/VNOnDB/
!cp './drive/My Drive/VNOnDB/test_word.csv' ./data/VNOnDB/
!cp './drive/My Drive/VNOnDB/validation_word.csv' ./data/VNOnDB/
print('Copied csv files')

Copied csv files


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import AxesGrid

from utils import encode, decode, eos_char, alphabets, to_one_hot
from dataset import VNOnDB
from model import Model

import pandas as pd

import pdb

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from torchvision import transforms

In [3]:
image_transform = transforms.Compose([
    transforms.Resize((320, 480)),
    transforms.Grayscale(3),
    transforms.ToTensor(),
])

label_transform = transforms.Compose([
    transforms.Lambda(lambda label: list(label) + [eos_char]),
])

# Train

In [25]:
def to_batch(samples):
    batch_size = len(samples)
    image_samples, label_samples = list(zip(*samples))
    # image_samples: list of [C, H, W]
    # label_samples: list of [1, T, V]

    # batch_image: [B, 3, H, W]
    # image: [3, H, W] - grayscale
    max_image_row = max([image.size(1) for image in image_samples])
    max_image_col = max([image.size(2) for image in image_samples])
    batch_image = torch.ones(batch_size, 3, max_image_row, max_image_col)
    for i, image in enumerate(image_samples):
        image_row = image.shape[1]
        image_col = image.shape[2]
        batch_image[i, :, :image_row, :image_col] = image

    # batch_label: [T, B, 1]
    label_lengths = np.array([len(label) for label in label_samples])
    max_length = label_lengths.max()
    
    label_lengths = torch.from_numpy(label_lengths).unsqueeze(-1) # [B, 1]

    batch_label = np.zeros((batch_size, max_length, 1)) # [B, T, 1]
    for i, label in enumerate(label_samples): # label: list
        batch_label[i, :len(label)] = encode(label)
#     pdb.set_trace()
    batch_label = torch.from_numpy(batch_label).long() # [B, T, 1]
    batch_label_one_hot = torch.stack([torch.from_numpy(to_one_hot(label.numpy())) for label in batch_label]) # [B, T, V]

    # sort by decreasing lengths
    label_lengths, sorted_idx = label_lengths.squeeze(-1).sort(descending=True)
    batch_image = batch_image[sorted_idx]
    batch_label = batch_label[sorted_idx].transpose(0, 1) # [T, B, 1]
    batch_label_one_hot = batch_label_one_hot[sorted_idx].transpose(0, 1) # [T, B, V]

    return batch_image, batch_label, batch_label_one_hot, label_lengths

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [27]:
batch_size = 8
hidden_size = 256
vocab_size = len(alphabets)
learning_rate = 1e-8

In [28]:
model = Model(
    batch_size,
    hidden_size,
    vocab_size,
    device
)

In [29]:
train_folder = './data/VNOnDB/word_train/'
val_folder = './data/VNOnDB/word_val/'
test_folder = './data/VNOnDB/word_test/'

train_df = pd.read_csv(f'./data/VNOnDB/train_word.csv', sep='\t', index_col=0)
val_df = pd.read_csv(f'./data/VNOnDB/validation_word.csv', sep='\t', index_col=0)
test_df = pd.read_csv(f'./data/VNOnDB/test_word.csv', sep='\t', index_col=0)

train_dataset = VNOnDB(f'./data/VNOnDB/word_train', train_df, image_transform, label_transform)
val_dataset = VNOnDB(f'./data/VNOnDB/word_val', val_df, image_transform, label_transform)
test_dataset = VNOnDB(f'./data/VNOnDB/word_test', test_df, image_transform, label_transform)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=to_batch)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True, collate_fn=to_batch)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=to_batch)

In [30]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [35]:
n_epochs = 10
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

model.to(device)
model.train()

for epoch in range(n_epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, (inputs, labels, labels_one_hot, label_lengths) in enumerate(train_loader):
        pdb.set_trace()
        inputs = inputs.type(dtype).to(device)
        labels_one_hot = labels_one_hot.type(dtype).to(device)
        labels = labels.type(torch.cuda.LongTensor if device == 'cuda' else torch.LongTensor).to(device)
        label_lengths = label_lengths.type(torch.cuda.LongTensor if device == 'cuda' else torch.LongTensor).to(device)
        
#         labels = nn.utils.rnn.pack_padded_sequence(labels, label_lengths)[0]
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs, labels_one_hot, label_lengths) # [T, B, V]
        outputs = outputs.reshape(-1, vocab_size) # [T*B, V]
        labels = labels.reshape(-1) # [T*B*1]
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

> <ipython-input-35-0643843ee9a8>(12)<module>()
-> inputs = inputs.type(dtype).to(device)


(Pdb)  n


> <ipython-input-35-0643843ee9a8>(13)<module>()
-> labels_one_hot = labels_one_hot.type(dtype).to(device)


(Pdb)  


> <ipython-input-35-0643843ee9a8>(14)<module>()
-> labels = labels.type(torch.cuda.LongTensor if device == 'cuda' else torch.LongTensor).to(device)


(Pdb)  


> <ipython-input-35-0643843ee9a8>(15)<module>()
-> label_lengths = label_lengths.type(torch.cuda.LongTensor if device == 'cuda' else torch.LongTensor).to(device)


(Pdb)  


> <ipython-input-35-0643843ee9a8>(19)<module>()
-> optimizer.zero_grad()


(Pdb)  


> <ipython-input-35-0643843ee9a8>(22)<module>()
-> outputs = model(inputs, labels_one_hot, label_lengths) # [T, B, V]


(Pdb)  


> <ipython-input-35-0643843ee9a8>(23)<module>()
-> outputs = outputs.reshape(-1, vocab_size) # [T*B, V]


(Pdb)  


> <ipython-input-35-0643843ee9a8>(24)<module>()
-> labels = labels.reshape(-1) # [T*B*1]


(Pdb)  outputs.shape


torch.Size([40, 216])


(Pdb)  


torch.Size([40, 216])


(Pdb)  n


> <ipython-input-35-0643843ee9a8>(26)<module>()
-> loss = criterion(outputs, labels)


(Pdb)  


> <ipython-input-35-0643843ee9a8>(27)<module>()
-> loss.backward()


(Pdb)  


> <ipython-input-35-0643843ee9a8>(28)<module>()
-> optimizer.step()


(Pdb)  


> <ipython-input-35-0643843ee9a8>(31)<module>()
-> running_loss += loss.item()


(Pdb)  


> <ipython-input-35-0643843ee9a8>(32)<module>()
-> if i % 2000 == 1999:    # print every 2000 mini-batches


(Pdb)  


> <ipython-input-35-0643843ee9a8>(10)<module>()
-> for i, (inputs, labels, labels_one_hot, label_lengths) in enumerate(train_loader):


(Pdb)  q


BdbQuit: 

In [None]:
904 - 689

In [None]:
eos_char