In [28]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
import os
import datetime
import pandas as pd
import torch
import torch.nn as nn
from lr_scheduler import CyclicLR
from training_utils import training_loop, test_loop
from model import RNNLM
from data_utils import (IndexVectorizer, 
                        TextDataset, 
                        SpacyTokenizer,
                        LMDataLoader)
import pickle

In [40]:
####################################################
# Config
#
# Tunable settings for the language-model run. Later cells read these names
# directly, so renaming one here requires updating its consumers.
####################################################

## Input / output
# Base directory: the model cache directory and the train/valid CSV paths are
# all built from it in the IO setup cell.
data_dir = '../data/imdb'

## Tokenization
# Bound `tokenize` method of a SpacyTokenizer instance; handed to
# IndexVectorizer as its `tokenize` argument.
TOKENIZE = SpacyTokenizer().tokenize

## Vectorization
# Passed to IndexVectorizer as `min_frequency`.
MIN_WORD_FREQ = 2
# Passed to IndexVectorizer as `max_words`.
MAX_VOCAB_SIZE = 20000
# Passed to IndexVectorizer as `start_end_tokens`.
# NOTE(review): the name looks like a typo for START_END_TOK -- rename together
# with its use in the vectorization cell if desired.
STAT_END_TOK = True

## Model Architecture
# All five values below are forwarded to the RNNLM constructor
# (`lstm_layers` as `num_layers`, `lstm_bidirection` as `bidirectional`).
hidden_dim = 300
embedding_dim = 100
dropout = 0.5
lstm_layers = 3
lstm_bidirection = False

## Training Language Model
# Used by both LMDataLoader instances, the RNNLM constructor and training_loop.
batch_size = 80
# NOTE(review): confirm the optimizer cell actually reads this value rather
# than a hard-coded copy of it.
learning_rate = 1e-3
# Passed to CyclicLR (`n_epochs`) and to training_loop.
num_epochs = 100
# NOTE(review): not referenced by any cell visible in this notebook section --
# confirm where (or whether) it is consumed.
display_epoch_freq = 10
# Sequence-length settings forwarded to LMDataLoader for both loaders.
target_seq_len = 100
max_seq_len = 130
min_seq_len = 20

In [41]:
# Device selection: target CUDA device `device_num` when CUDA is available,
# otherwise fall back to the CPU. The trailing expression displays the choice.
device_num = 0
use_gpu = torch.cuda.is_available()
if use_gpu:
    device = torch.device(f"cuda:{device_num}")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [42]:
# IO setup: derive every input/output path from `data_dir`.

# Input CSVs for the training and validation splits.
train_file, valid_file = (
    os.path.join(data_dir, csv_name) for csv_name in ('unsup.csv', 'valid.csv')
)

# Directory holding saved models; created on first use.
model_cache_dir = os.path.join(data_dir, 'models')
os.makedirs(model_cache_dir, exist_ok=True)

# Model file names carry today's date (YYYY-MM-DD), so runs on the same day
# share the same file names.
today = datetime.datetime.now().strftime('%Y-%m-%d')
model_file_lm = os.path.join(model_cache_dir, f'LM__{today}.json')
model_file_class = os.path.join(model_cache_dir, f'CLASS__{today}.json')

In [43]:
# Build the datasets and vectorizer, or restore them from the on-disk cache.
#
# The cache is keyed on file existence only: after changing any tokenization or
# vectorization setting, set RE_VECTORIZE = True once to rebuild it.
RE_VECTORIZE = False
DATA_CACHE_FILE = 'data_cache.pkl'
VECTORIZER_CACHE_FILE = 'lm_vectorizer.pkl'

# Both files are required to restore state, so a half-written cache (e.g. the
# vectorizer pickle is missing) triggers a rebuild instead of a crash on load.
cache_is_complete = (os.path.isfile(DATA_CACHE_FILE)
                     and os.path.isfile(VECTORIZER_CACHE_FILE))

if RE_VECTORIZE or not cache_is_complete:
    train = pd.read_csv(train_file)
    valid = pd.read_csv(valid_file)
    vectorizer = IndexVectorizer(max_words=MAX_VOCAB_SIZE,
                                 min_frequency=MIN_WORD_FREQ,
                                 start_end_tokens=STAT_END_TOK,
                                 tokenize=TOKENIZE)
    train_ds = TextDataset(data=train, vectorizer=vectorizer, text_col='text')
    valid_ds = TextDataset(data=valid, vectorizer=vectorizer, text_col='text')
    # The vectorizer is pickled only after both datasets have been built from
    # it (presumably building a dataset updates the vectorizer -- TODO confirm).
    with open(DATA_CACHE_FILE, 'wb') as cache_fh:
        pickle.dump([train_ds, valid_ds], cache_fh)
    with open(VECTORIZER_CACHE_FILE, 'wb') as cache_fh:
        pickle.dump(vectorizer, cache_fh)
else:
    # SECURITY: pickle.load can execute arbitrary code. These files are
    # produced by this notebook; never point the paths at untrusted pickles.
    with open(DATA_CACHE_FILE, 'rb') as cache_fh:
        train_ds, valid_ds = pickle.load(cache_fh)
    with open(VECTORIZER_CACHE_FILE, 'rb') as cache_fh:
        vectorizer = pickle.load(cache_fh)

In [44]:
# Sanity check: report how many documents each split holds and how large the
# vectorizer's vocabulary is.
n_train = len(train_ds)
n_valid = len(valid_ds)
n_vocab = len(vectorizer.vocabulary)
print(f'Train size: {n_train}\nvalid size: {n_valid}')
print(f"Vocab size: {n_vocab}")

Train size: 50000
valid size: 25000
Vocab size: 20000


In [45]:
# The training and validation loaders are configured identically; only the
# dataset differs, so the shared settings live in one place.
# NOTE(review): the validation loader is shuffled too -- confirm that is intended.
lm_loader_kwargs = dict(
    target_seq_len=target_seq_len,
    shuffle=True,
    max_seq_len=max_seq_len,
    min_seq_len=min_seq_len,
    p_half_seq_len=0.05,
    batch_size=batch_size,
)
train_dl = LMDataLoader(dataset=train_ds, **lm_loader_kwargs)
valid_dl = LMDataLoader(dataset=valid_ds, **lm_loader_kwargs)

In [46]:
# Seed PyTorch's random number generators for reproducibility.
# torch.manual_seed covers the CPU generator and every CUDA device, so it is
# called unconditionally; seeding only the CUDA generator on GPU machines would
# leave CPU-side sampling unseeded.
# NOTE(review): if the data loaders shuffle via Python's `random` or NumPy,
# those generators need seeding as well -- confirm in data_utils.
SEED = 303
torch.manual_seed(SEED);  # trailing ';' hides the returned Generator repr

In [None]:
# Timestamp identifying this run (second resolution).
# NOTE(review): `runtime` is not consumed by the cells visible here; the model
# is saved to `model_file_lm`, which is defined in the IO setup cell.
runtime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Build the language model from the architecture settings in the config cell
# and the vocabulary held by the vectorizer.
lm = RNNLM(device, vectorizer.vocabulary_size, embedding_dim, hidden_dim, batch_size,
           dropout=dropout,
           tie_weights=False,
           num_layers=lstm_layers,
           bidirectional=lstm_bidirection,
           word2idx=vectorizer.word2idx,
           log_softmax=False)

In [None]:
# Place the model on the selected device when a GPU is in use; on CPU the
# model object is left as constructed.
lm = lm.to(device) if use_gpu else lm
# Explicit weight initialisation is currently disabled:
#lm.init_weights()

In [None]:
# Loss, optimizer, learning-rate schedule and the training run itself.
loss = nn.CrossEntropyLoss()

# One optimizer parameter group per recurrent layer, followed by the encoder
# and the decoder. Every group starts at the `learning_rate` from the config
# cell rather than a hard-coded copy of it.
param_list = [{'params': rnn.parameters(), 'lr': learning_rate} for rnn in lm.rnns]
param_list.extend([
    {'params': lm.encoder.parameters(), 'lr': learning_rate},
    {'params': lm.decoder.parameters(), 'lr': learning_rate},
])

# Each group sets its own 'lr', so the default given here applies to none of them.
optimizer = torch.optim.Adam(param_list, lr=0.01)

# NOTE(review): five entries, which lines up with 3 recurrent layers + encoder
# + decoder in `param_list` order -- presumably one max LR per group; update
# this list whenever `lstm_layers` changes.
max_lrs = [0.1, 0.01, 0.01, 0.01, 0.1]

# Number of training documents (50000 for the dataset this notebook was run on).
n_train_docs = len(train_ds)
# Batches in one pass over `train_dl`, measured by iterating the loader once.
# NOTE(review): re-measure if the batch size or sequence-length settings change.
n_train_batches = 1171

scheduler = CyclicLR(optimizer, max_lrs=max_lrs,
                     mode='ulmfit', ratio=1.5, cut_frac=0.4,
                     n_epochs=num_epochs,
                     batchsize=n_train_docs / n_train_batches,
                     verbose=False, epoch_length=n_train_docs)

# NOTE(review): the meaning of the third positional argument (1) is defined by
# training_utils.training_loop -- confirm and name it.
history = training_loop(batch_size, num_epochs, 1,
                        lm, loss, optimizer, scheduler, device,
                        train_dl, valid_dl,
                        best_model_path=model_file_lm)

HBox(children=(IntProgress(value=0), HTML(value='')))

Epoch: 0000; Loss: 5.9130; Val-Loss 5.2037; Perplexity 369.8065; Val-Perplexity 181.9518
Sample: this is bad piece . we can stumble out an hour in <UNK> and i saw any horror guitar ending look . <END> <START> i saw it and the film just did n't looks fans of hopes . how going just , they sit on in the death , as the honest scenes


In [67]:
# Count how many (X, y) batches one full pass over the training loader yields.
n_batches = 0
for n_batches, (X, y) in enumerate(train_dl, start=1):
    pass
print(n_batches)

1171
