In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell executes so that edits to the project's own
# .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import datetime
import pandas as pd
import torch
import torch.nn as nn
sys.path.append('../')
from lr_scheduler import CyclicLR
from training_utils import training_loop, test_loop
from model import RNNLM
from data_utils import (IndexVectorizer, 
                        TextDataset, 
                        SpacyTokenizer,
                        LMDataLoader)
import pickle

In [3]:
####################################################
# Config
####################################################

## Input / output
# Root folder of the experiment; later cells join the CSV file names and a
# 'models' cache sub-folder onto this path.
data_dir = '../../data/imdb'

## Tokenization
# Bound `tokenize` method that is handed to IndexVectorizer as `tokenize=`.
TOKENIZE = SpacyTokenizer().tokenize

## Vectorization
# Forwarded to IndexVectorizer as min_frequency / max_words / start_end_tokens.
MIN_WORD_FREQ = 2
MAX_VOCAB_SIZE = 20000
# NOTE(review): the name looks like a typo for START_END_TOK; the vectorizer
# cell references it under this exact spelling, so rename both together.
STAT_END_TOK = True

## Model Architecture
# hidden_dim, embedding_dim and dropout are passed to RNNLM when it is built.
hidden_dim = 100
embedding_dim = 100
dropout = 0.3
# NOTE(review): the model-building cell defines its own NUM_LAYERS and
# BIDRIECTIONAL constants and passes those to RNNLM; the two settings below
# are not read there (and lstm_bidirection disagrees with BIDRIECTIONAL = True).
lstm_layers = 3
lstm_bidirection = False

## Training Language Model
# batch_size is shared by the data loaders, the RNNLM constructor and
# training_loop; num_epochs goes to training_loop and to CyclicLR (n_epochs).
batch_size = 80
learning_rate = 1e-3
num_epochs = 100
# NOTE(review): display_epoch_freq is not referenced by the cells visible in
# this notebook; training_loop is called with a literal 1 instead -- confirm.
display_epoch_freq = 10
# Sequence-length settings forwarded unchanged to both LMDataLoader instances.
target_seq_len = 90
max_seq_len = 120
min_seq_len = 20

In [4]:
# GPU setup: use CUDA device `device_num` when available, otherwise the CPU.
device_num = 0
use_gpu = torch.cuda.is_available()
if use_gpu:
    device = torch.device(f"cuda:{device_num}")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the chosen device.
device

device(type='cuda', index=0)

In [5]:
# IO setup
# Everything this notebook caches or saves lives under <data_dir>/models;
# create that folder up front so later writes cannot fail on a missing dir.
model_cache_dir = os.path.join(data_dir, 'models')
os.makedirs(model_cache_dir, exist_ok=True)

# Pickle caches for the vectorized datasets and the fitted vectorizer.
data_cache, vectorizer_cache = [
    os.path.join(model_cache_dir, cache_name)
    for cache_name in ('data_cache.pkl', 'lm_vectorizer.pkl')
]

# Model output files carry today's date (YYYY-MM-DD) in their name.
today = datetime.datetime.now().strftime('%Y-%m-%d')
model_file_lm = os.path.join(model_cache_dir, f'LM__{today}.json')
model_file_class = os.path.join(model_cache_dir, f'CLASS__{today}.json')

# Input CSVs: training text and validation text.
train_file, valid_file = [
    os.path.join(data_dir, csv_name)
    for csv_name in ('unsup.csv', 'valid.csv')
]

In [6]:
# Set to True to force re-tokenizing / re-vectorizing even when caches exist.
RE_VECTORIZE = False

# Resuming needs BOTH pickles; if either one is missing, rebuild everything
# instead of failing later while loading the absent file.
caches_present = os.path.isfile(data_cache) and os.path.isfile(vectorizer_cache)

if RE_VECTORIZE or not caches_present:
    train = pd.read_csv(train_file)
    valid = pd.read_csv(valid_file)
    vectorizer = IndexVectorizer(max_words=MAX_VOCAB_SIZE,
                                 min_frequency=MIN_WORD_FREQ,
                                 start_end_tokens=STAT_END_TOK,
                                 tokenize=TOKENIZE)
    # Both datasets share the same vectorizer instance.
    train_ds = TextDataset(data=train, vectorizer=vectorizer, text_col='text')
    valid_ds = TextDataset(data=valid, vectorizer=vectorizer, text_col='text')
    # Context managers guarantee the files are flushed and closed even if
    # pickling raises part-way through.
    with open(data_cache, 'wb') as cache_fh:
        pickle.dump([train_ds, valid_ds], cache_fh)
    with open(vectorizer_cache, 'wb') as cache_fh:
        pickle.dump(vectorizer, cache_fh)
else:
    # SECURITY: pickle.load can execute arbitrary code. Only load cache files
    # that this notebook wrote itself; never point these paths at untrusted data.
    with open(data_cache, 'rb') as cache_fh:
        train_ds, valid_ds = pickle.load(cache_fh)
    with open(vectorizer_cache, 'rb') as cache_fh:
        vectorizer = pickle.load(cache_fh)

In [7]:
# Sanity check: report how many documents and vocabulary entries were loaded.
summary_lines = (
    f'Train size: {len(train_ds)}\nvalid size: {len(valid_ds)}',
    f"Vocab size: {len(vectorizer.vocabulary)}",
)
print(*summary_lines, sep='\n')

Train size: 50000
valid size: 25000
Vocab size: 20000


In [8]:
# Train and validation loaders are configured identically apart from the
# dataset they wrap, so the shared settings are spelled out only once.
loader_kwargs = dict(
    target_seq_len=target_seq_len,
    shuffle=True,
    max_seq_len=max_seq_len,
    min_seq_len=min_seq_len,
    p_half_seq_len=0.05,
    batch_size=batch_size,
)
train_dl = LMDataLoader(dataset=train_ds, **loader_kwargs)
# NOTE(review): the validation loader is shuffled as well -- confirm that is
# intended, since it may make validation metrics vary between epochs.
valid_dl = LMDataLoader(dataset=valid_ds, **loader_kwargs)

In [9]:
# Seed torch's random number generators for reproducibility.
# torch.manual_seed seeds the CPU generator *and* every CUDA device, so it is
# called unconditionally. Seeding only the CUDA generator when a GPU is
# present would leave CPU-side randomness unseeded.
SEED = 303
torch.manual_seed(SEED)

In [17]:
# Timestamp for this run, to second resolution.
# NOTE(review): `runtime` is not referenced by any cell visible in this
# notebook; the model is saved to the date-stamped `model_file_lm` path from
# the IO-setup cell (the former no-op `model_file_lm = model_file_lm` is gone).
runtime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Architecture switches for this run. TIE_WEIGHTS is also read by the
# optimizer cell to decide which parameter groups to build.
TIE_WEIGHTS = False
NUM_LAYERS = 3
# NOTE(review): the config cell sets lstm_layers = 3 and
# lstm_bidirection = False, but this cell uses its own constants, and
# BIDRIECTIONAL = True contradicts that config value. A bidirectional RNN lets
# each position see later tokens, which is usually unwanted for next-token
# language modelling -- confirm how RNNLM handles it. (The constant name is
# misspelled; it is kept so other cells that may reference it keep working.)
BIDRIECTIONAL = True

# Build and initialize the model
lm = RNNLM(
    device,
    vectorizer.vocabulary_size,
    embedding_dim,
    hidden_dim,
    batch_size,
    dropout=dropout,
    tie_weights=TIE_WEIGHTS,
    num_layers=NUM_LAYERS,
    bidirectional=BIDRIECTIONAL,
    word2idx=vectorizer.word2idx,
    # The model is asked not to apply log-softmax; the training cell uses
    # nn.CrossEntropyLoss, which expects unnormalized scores.
    log_softmax=False,
)

In [18]:
# Move the model onto the selected CUDA device; without a GPU it is left as is.
lm = lm.to(device) if use_gpu else lm
# lm.init_weights()  # explicit weight initialisation is currently disabled

In [20]:
# Loss and Optimizer
loss = nn.CrossEntropyLoss()

# One optimizer parameter group per sub-module (each LSTM layer, the encoder
# and, unless tied, the decoder) so every group carries its own learning rate.
# With tied weights only the encoder is added, so the shared parameters are
# not handed to the optimizer twice.
trainable_modules = list(lm.rnns) + [lm.encoder]
if not TIE_WEIGHTS:
    trainable_modules.append(lm.decoder)

# Every group starts at the configured `learning_rate` (1e-3 in the config
# cell) rather than a literal repeated per group.
param_list = [
    {'params': module.parameters(), 'lr': learning_rate}
    for module in trainable_modules
]

# Each group sets its own 'lr', so the optimizer-level default is never used;
# it is set to the same value to avoid a misleading second number.
optimizer = torch.optim.Adam(param_list, lr=learning_rate)

# One max learning rate per parameter group. Deriving the length from
# param_list keeps it correct when TIE_WEIGHTS or the layer count changes
# (it was previously hard-coded to 5 entries).
# NOTE(review): 50000 matches the train-set size printed earlier; 1171 looks
# like a batch count from an earlier configuration, making `batchsize` a
# non-integer -- confirm what CyclicLR expects for batchsize / epoch_length.
scheduler = CyclicLR(optimizer, max_lrs=[0.1] * len(param_list),
                     mode='ulmfit', ratio=1.5, cut_frac=0.4,
                     n_epochs=num_epochs, batchsize=50000/1171,
                     verbose=False, epoch_length=50000)

# NOTE(review): `scheduler` is built above, but None is passed in the
# positional slot after `optimizer`. If that slot is training_loop's scheduler
# argument, the cyclic schedule is never applied -- confirm against
# training_utils.training_loop before relying on it.
history = training_loop(batch_size, num_epochs, 1,
                        lm, loss, optimizer, None, device,
                        train_dl, valid_dl,
                        best_model_path=model_file_lm)

HBox(children=(IntProgress(value=0), HTML(value='')))

Epoch: 0000; Train-Loss: 6.5450; Val-Loss 6.5002; Train-accuracy 0.0000; Val-Accuracy 0.0000
Sample: this is bad " until never a to villains . , superiors a this show 's in matt surrounded a have this of scenes a made , no full the but quaid sea in entire fun watch of are one around life baby of , believable desires some />a /><br it schell around
Epoch: 0001; Train-Loss: 6.5376; Val-Loss 6.4971; Train-accuracy 0.0000; Val-Accuracy 0.0000
Sample: this is good a />it way big fine whole <UNK> gun is light how and fun confusing twist worse thing overall . for supporting up 8 the place that a forms to putting lights acting all perhaps /><br own show , with years five . is in get and thought " david school
Epoch: 0002; Train-Loss: 6.5352; Val-Loss 6.4957; Train-accuracy 0.0000; Val-Accuracy 0.0000
Sample: this is good police an -- even constitute karate job of to about there , million in epidemic the luis rock with teenage was spoiled " also one who ? shining election literally the is enjoy

In [14]:
# Count how many batches one full pass over the training loader yields.
# `i` starts at 0 so an empty loader still prints 0; X and y keep the last
# batch, exactly as a plain for-loop would leave them.
i = 0
for i, (X, y) in enumerate(train_dl, start=1):
    pass
print(i)

1459
