In [48]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [66]:
import os
import datetime
import pandas as pd
import torch
import torch.nn as nn
from lr_scheduler import CyclicLR
from training_utils import training_loop, test_loop
from model import RNNLM
from data_utils import (IndexVectorizer, 
                        TextDataset, 
                        SpacyTokenizer,
                        LMDataLoader)
import pickle

In [50]:
####################################################
# Config
#
# Every tunable value of the notebook lives in this cell. The comments below
# say which later cell consumes each value.
####################################################

## Input / output
# Root directory: the training/validation CSVs are read from here and the
# model cache directory ('models') is created underneath it.
data_dir = '../data/imdb'

## Tokenization
# Bound `tokenize` method of a SpacyTokenizer instance; handed to
# IndexVectorizer through its `tokenize=` argument.
TOKENIZE = SpacyTokenizer().tokenize

## Vectorization
# Passed to IndexVectorizer as min_frequency / max_words / start_end_tokens.
MIN_WORD_FREQ = 2
MAX_VOCAB_SIZE = 20000
# NOTE(review): the name looks like a typo for START_END_TOK; it is kept
# because the vectorizer cell refers to it by this spelling.
STAT_END_TOK = True

## Model Architecture
# Passed to RNNLM when the language model is built.
hidden_dim = 100
embedding_dim = 200
dropout = 0.5
# Passed to RNNLM as num_layers.
# NOTE(review): the original author marks this as having no effect at the
# moment, and the optimizer cell addresses lm.lstm1 .. lm.lstm3 explicitly -
# confirm in model.RNNLM whether the layer count is actually configurable.
lstm_layers = 1 # flagged by the author as currently having no effect
# Passed to RNNLM as bidirectional.
lstm_bidirection = False

## Training Language Model
# batch_size is passed to both data loaders, to RNNLM and to training_loop.
batch_size = 64
# NOTE(review): not referenced by any visible cell; the optimizer cell
# hard-codes its learning rates instead.
learning_rate = 1e-3
# Passed to training_loop and to CyclicLR (n_epochs).
num_epochs = 100
# NOTE(review): not referenced by any visible cell; training_loop is called
# with a literal 2 in the position after num_epochs - confirm whether this
# constant was meant to be passed there.
display_epoch_freq = 10
# Sequence-length settings forwarded to both LMDataLoader instances.
target_seq_len = 50
max_seq_len = 70
min_seq_len = 5

In [51]:
# Device selection: use CUDA device `device_num` when CUDA is available,
# otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device_num = 0
if use_gpu:
    device = torch.device(f"cuda:{device_num}")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the chosen device.
device

device(type='cuda', index=0)

In [55]:
# Output locations: date-stamped model files inside <data_dir>/models.
# The directory is created if it does not exist yet.
today = datetime.datetime.now().strftime('%Y-%m-%d')
model_cache_dir = os.path.join(data_dir, 'models')
os.makedirs(model_cache_dir, exist_ok=True)
model_file_lm, model_file_class = (
    os.path.join(model_cache_dir, file_name)
    for file_name in (f'LM__{today}.json', f'CLASS__{today}.json')
)

# Input locations: the two CSV files read by the next cell.
train_file, valid_file = (
    os.path.join(data_dir, file_name)
    for file_name in ('unsup.csv', 'valid.csv')
)

In [56]:
# Read the training and validation CSVs into DataFrames (train first).
train, valid = (pd.read_csv(csv_path) for csv_path in (train_file, valid_file))

In [57]:
# One vectorizer instance, configured from the constants in the config cell,
# is shared by both datasets.
vectorizer = IndexVectorizer(
    max_words=MAX_VOCAB_SIZE,
    min_frequency=MIN_WORD_FREQ,
    start_end_tokens=STAT_END_TOK,
    tokenize=TOKENIZE,
)

# The training dataset is constructed before the validation dataset, exactly
# as in the original cell; both read the 'text' column.
train_ds, valid_ds = (
    TextDataset(data=frame, vectorizer=vectorizer, text_col='text')
    for frame in (train, valid)
)

In [58]:
# Report the dataset sizes and the vocabulary size, one figure per line.
print(
    f'Train size: {len(train_ds)}\nvalid size: {len(valid_ds)}',
    f"Vocab size: {len(vectorizer.vocabulary)}",
    sep='\n',
)

Train size: 50000
valid size: 25000
Vocab size: 20000


In [59]:
# Both loaders take identical settings and differ only in the dataset they wrap.
# NOTE(review): shuffle=True is applied to the validation loader as well -
# confirm that this is intended.
_loader_kwargs = dict(
    target_seq_len=target_seq_len,
    shuffle=True,
    max_seq_len=max_seq_len,
    min_seq_len=min_seq_len,
    p_half_seq_len=0.05,
    batch_size=batch_size,
)
train_dl = LMDataLoader(dataset=train_ds, **_loader_kwargs)
valid_dl = LMDataLoader(dataset=valid_ds, **_loader_kwargs)

In [60]:
# Seed torch's random number generators so that runs are repeatable.
# torch.manual_seed seeds the CPU generator and the generators of all CUDA
# devices. The previous version called only torch.cuda.manual_seed when a
# GPU was present, which left the CPU generator unseeded on GPU machines.
SEED = 303
torch.manual_seed(SEED)

In [67]:
# Timestamp of this run, down to the second.
# NOTE(review): `runtime` is not referenced by any other visible cell.
runtime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Self-assignment carried over unchanged; it does not alter the path that was
# chosen in the IO setup cell.
model_file_lm = model_file_lm

# Build the language model from the architecture settings in the config cell.
lm = RNNLM(
    device,
    vectorizer.vocabulary_size,
    embedding_dim,
    hidden_dim,
    batch_size,
    dropout=dropout,
    tie_weights=False,
    num_layers=lstm_layers,
    bidirectional=lstm_bidirection,
    word2idx=vectorizer.word2idx,
    log_softmax=False,
)

In [68]:
# Move the model to the selected device only when a GPU is in use, then run
# the model's own weight initialisation.
lm = lm.to(device) if use_gpu else lm
lm.init_weights()

In [69]:
# Loss and optimizer
loss = nn.CrossEntropyLoss()

# One Adam parameter group per sub-module, in the same order as the
# scheduler's max_lrs list below. Every group sets its own 'lr', so the
# optimizer-level default lr=0.01 is overridden for all of them.
optimizer = torch.optim.Adam(
    [
        {'params': layer.parameters(), 'lr': 1e-3}
        for layer in (lm.encoder, lm.lstm1, lm.lstm2, lm.lstm3, lm.decoder)
    ],
    lr=0.01,
)

# Cyclic learning-rate schedule with one maximum learning rate per group.
# NOTE(review): batchsize=5000/196 and epoch_length=5000 are unexplained
# literals - confirm how they relate to batch_size and the dataset size.
scheduler = CyclicLR(
    optimizer,
    max_lrs=[0.1, 0.01, 0.01, 0.01, 0.1],
    mode='ulmfit',
    ratio=1.5,
    cut_frac=0.4,
    n_epochs=num_epochs,
    batchsize=5000/196,
    verbose=False,
    epoch_length=5000,
)

# NOTE(review): `scheduler` is built above but is not passed here; a literal
# None sits between `optimizer` and `device`, presumably in the scheduler
# slot - confirm against training_utils.training_loop.
# NOTE(review): the literal 2 is presumably the display frequency, while
# display_epoch_freq from the config cell goes unused - confirm.
history = training_loop(
    batch_size, num_epochs, 2,
    lm, loss, optimizer, None, device,
    train_dl, valid_dl,
    best_model_path=model_file_lm,
)

HBox(children=(IntProgress(value=0), HTML(value='')))

Epoch: 0000; Loss: 5.9200; Val-Loss 5.2891; Perplexity 372.4107; Val-Perplexity 198.1553
Sample: this , the whole performance.<br /><br />rating in their habit in
Epoch: 0002; Loss: 5.2945; Val-Loss 4.9613; Perplexity 199.2290; Val-Perplexity 142.7846
Sample: this trash . he turned place to kill her , as
Epoch: 0004; Loss: 5.1695; Val-Loss 4.8518; Perplexity 175.8209; Val-Perplexity 127.9710
Sample: this " cast " during you was while with the only
Epoch: 0006; Loss: 5.1015; Val-Loss 4.7839; Perplexity 164.2659; Val-Perplexity 119.5670
Sample: this series , takes the many time as the lifting '
Epoch: 0008; Loss: 5.0475; Val-Loss 4.7252; Perplexity 155.6309; Val-Perplexity 112.7560
Sample: this film . his scares . the thought is fairly lackluster
Epoch: 0010; Loss: 5.0070; Val-Loss 4.6815; Perplexity 149.4529; Val-Perplexity 107.9310
Sample: this . the story does hardly study , and that 's
-----------------------------------------------------------------------------------------
Exiting f

In [70]:
# Sample from the model starting at a seed phrase, then map every returned
# index back to its word through the model's idx2word lookup.
sampled_ids = lm.sample(seed='I see a silly ', length=10)
[lm.idx2word[token_id] for token_id in sampled_ids]

['i',
 'see',
 'a',
 'silly',
 'understatement',
 ',',
 'when',
 'i',
 '<UNK>',
 'in',
 'sounds',
 '13',
 '/><br',
 '/>i']

In [71]:
# Save the vectorizer next to the notebook. The `with` block closes the file
# handle as soon as the dump finishes; the previous version passed a bare
# open(...) call and never closed it.
with open('lm_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [17]:
# Count how many batches the training loader yields in one full pass.
# `i` starts at 0 so that an empty loader still prints 0.
i = 0
for i, (X, y) in enumerate(train_dl, start=1):
    pass
print(i)

197
