In [4]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [18]:
import os
import datetime
import pandas as pd
import torch
import torch.nn as nn
from lr_scheduler import CyclicLR
from training_utils import training_loop, test_loop
from model import RNNLM
from data_utils import (IndexVectorizer, 
                        TextDataset, 
                        SpacyTokenizer,
                        LMDataLoader)

In [22]:
####################################################
# Config
####################################################

## Input / output
# Root folder for the dataset; the CSV splits and the model cache directory
# are resolved relative to it in later cells.
data_dir = '../data/yelp'

## Tokenization
# Build one tokenizer instance and expose its bound `tokenize` method as the
# callable handed to the vectorizer.
spacy_tokenizer = SpacyTokenizer()
TOKENIZE = spacy_tokenizer.tokenize

## Vectorization
MIN_WORD_FREQ = 5        # passed to IndexVectorizer as `min_frequency`
MAX_VOCAB_SIZE = 20000   # passed to IndexVectorizer as `max_words`
# (sic) misspelt name kept because the vectorizer cell references it;
# passed to IndexVectorizer as `start_end_tokens`.
STAT_END_TOK = True

## Model Architecture
hidden_dim = 100
embedding_dim = 200
dropout = 0.2
lstm_layers = 1 # this is useless atm
lstm_bidirection = True

## Training Language Model
# Single source of truth for the batch size. An earlier `batch_size = 50`
# under "Model Architecture" was silently overridden by this assignment, so
# it has been removed; the effective value (64) is unchanged.
batch_size = 64
learning_rate = 1e-3
num_epochs = 300
display_epoch_freq = 10
# Sequence-length settings forwarded to LMDataLoader.
target_seq_len = 50
max_seq_len = 70
min_seq_len = 5

In [7]:
# GPU setup
# Use CUDA device `device_num` when CUDA is available, otherwise the CPU.
device_num = 0
use_gpu = torch.cuda.is_available()
if use_gpu:
    device = torch.device("cuda", device_num)
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [8]:
# IO setup
# Date stamp (YYYY-MM-DD) embedded in the model file names.
today = datetime.datetime.now().strftime('%Y-%m-%d')

# Model files live in a `models` sub-folder of the data directory; create it
# if it does not exist yet.
model_cache_dir = os.path.join(data_dir, 'models')
os.makedirs(model_cache_dir, exist_ok=True)
model_file_lm, model_file_class = (
    os.path.join(model_cache_dir, file_name)
    for file_name in (f'LM__{today}.json', f'CLASS__{today}.json')
)

# Paths of the three CSV splits inside the data directory.
train_file, valid_file, test_file = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('train.csv', 'valid.csv', 'test.csv')
)

In [9]:
# Read the three CSV splits into DataFrames (read order: train, test, valid).
train, test, valid = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file, valid_file)
)

In [10]:
# One vectorizer instance, configured from the Config cell, is shared by all
# three datasets.
vectorizer_options = {
    'max_words': MAX_VOCAB_SIZE,
    'min_frequency': MIN_WORD_FREQ,
    'start_end_tokens': STAT_END_TOK,
    'tokenize': TOKENIZE,
}
vectorizer = IndexVectorizer(**vectorizer_options)

# Wrap each split's `text` column in a TextDataset (built in the order
# train, test, valid).
train_ds, test_ds, valid_ds = (
    TextDataset(data=frame, vectorizer=vectorizer, text_col='text')
    for frame in (train, test, valid)
)

In [11]:
# Manual sanity check: print the dataset sizes and the vocabulary size next
# to the values they are expected to have.
split_sizes = [len(ds) for ds in (train_ds, valid_ds, test_ds)]
print("Next line should be 5k, 1k, 2k")
print(*split_sizes)

vocab_len = len(vectorizer.vocabulary)
print("Next line should be 5988")
print(f"Vocab size: {vocab_len}")

Next line should be 5k, 1k, 2k
5000 1000 2000
Next line should be 5988
Vocab size: 5988


In [12]:
# The train and validation loaders are configured identically; only the
# wrapped dataset differs, so the common options are defined once.
lm_loader_options = dict(
    target_seq_len=target_seq_len,
    shuffle=True,
    max_seq_len=max_seq_len,
    min_seq_len=min_seq_len,
    p_half_seq_len=0.05,
    batch_size=batch_size,
)
train_dl = LMDataLoader(dataset=train_ds, **lm_loader_options)
valid_dl = LMDataLoader(dataset=valid_ds, **lm_loader_options)

In [13]:
# Seed PyTorch's random number generators for reproducibility.
# torch.manual_seed seeds the CPU generator as well as the CUDA generators.
# The previous GPU branch called only torch.cuda.manual_seed, which left the
# CPU generator unseeded whenever a GPU was present.
SEED = 303
torch.manual_seed(SEED);  # trailing `;` hides the returned Generator repr

In [19]:
# set up Files to save stuff in
# Timestamp of this run, to the second. The language-model checkpoint path
# `model_file_lm` comes from the IO setup cell unchanged; the former no-op
# self-assignment `model_file_lm = model_file_lm` has been dropped.
runtime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Build the model from the hyperparameters in the Config cell
lm = RNNLM(device, vectorizer.vocabulary_size, embedding_dim, hidden_dim, batch_size,
           dropout = dropout,
           tie_weights = False,
           num_layers = lstm_layers,
           bidirectional = lstm_bidirection,
           word2idx = vectorizer.word2idx,
           log_softmax = True)

In [20]:
# Move the model to the selected device only when a GPU is in use, then
# initialise its weights.
lm = lm.to(device) if use_gpu else lm
lm.init_weights()

In [None]:
# Loss and Optimizer
loss = nn.NLLLoss()

# Each parameter group carries its own learning rate.
# NOTE(review): both groups set "lr" explicitly, so the `lr=learning_rate`
# default passed to Adam does not apply to either of them - confirm intended.
lstm_param_groups = [
    {"params": lm.lstm1.parameters(), "lr": 0.002},
    {"params": lm.lstm2.parameters(), "lr": 0.003},
]
optimizer = torch.optim.Adam(lstm_param_groups, lr=learning_rate)

# Learning-rate schedule: one max LR per optimizer parameter group.
# NOTE(review): epoch_length=5000 equals the training-set size printed
# earlier in the notebook - presumably that is what it stands for; confirm.
scheduler = CyclicLR(
    optimizer,
    max_lrs=[0.01, 0.008],
    mode='ulmfit',
    ratio=3,
    cut_frac=0.4,
    n_epochs=num_epochs,
    batchsize=batch_size,
    verbose=False,
    epoch_length=5000,
)

# Run training; `model_file_lm` is passed as the best-model path.
history = training_loop(
    batch_size, num_epochs, display_epoch_freq,
    lm, loss, optimizer, scheduler, device,
    train_dl, valid_dl,
    best_model_path=model_file_lm,
)

HBox(children=(IntProgress(value=0, max=300), HTML(value='')))

Epoch: 0000; Loss: 4.0624; Val-Loss 4.0746; Perplexity 58.1112; Val-Perplexity 58.8267
Sample: <START> just service $ 've go had would on which a was you place <END> he 

 in our our a more with food you would out your n't to one up which my up you by a out from food other when our 

 what , some really can back my be out are you place ... would had our good just you " <START> service to you ( at
