In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import datetime
import pandas as pd
import torch
import torch.nn as nn
from lr_scheduler import CyclicLR
from training_utils import training_loop, test_loop
from model import RNNLM
from data_utils import (IndexVectorizer, 
                        TextDataset, 
                        SpacyTokenizer,
                        LMDataLoader)
import pickle

In [3]:
####################################################
# Config
####################################################

## Input / output
# Root data directory: the train/valid CSVs are read from here and the model
# cache directory is created underneath it.
data_dir = '../data/yelp'

## Tokenization
# Tokenizer callable handed to IndexVectorizer via its `tokenize` argument.
TOKENIZE = SpacyTokenizer().tokenize

## Vectorization
# Passed to IndexVectorizer as min_frequency / max_words / start_end_tokens.
MIN_WORD_FREQ = 2
MAX_VOCAB_SIZE = 20000
# NOTE(review): name is presumably a typo for START_END_TOK; it is referenced
# under this spelling when the vectorizer is built, so rename both together.
STAT_END_TOK = True

## Model Architecture
# Passed to the RNNLM constructor.
hidden_dim = 100
embedding_dim = 200
dropout = 0.2
lstm_layers = 1 # NOTE (original author): currently has no effect
lstm_bidirection = False

## Training Language Model
# batch_size is shared by the model, the data loaders and the training loop.
batch_size = 64
# NOTE(review): not referenced by the cells visible here; the optimizer cell
# hard-codes its own learning rates.
learning_rate = 1e-3
# Passed to training_loop (num_epochs is also passed to the LR scheduler).
num_epochs = 100
display_epoch_freq = 10
# Sequence-length settings passed to LMDataLoader.
target_seq_len = 50
max_seq_len = 70
min_seq_len = 5

In [4]:
# Device selection: use CUDA device `device_num` when CUDA is available,
# otherwise fall back to the CPU.
device_num = 0
use_gpu = torch.cuda.is_available()
if use_gpu:
    device = torch.device(f"cuda:{device_num}")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [5]:
# IO setup: date-stamped model file paths under <data_dir>/models (the directory
# is created if missing) and the paths of the input CSVs.
today = datetime.datetime.now().strftime('%Y-%m-%d')

model_cache_dir = os.path.join(data_dir, 'models')
os.makedirs(model_cache_dir, exist_ok=True)

model_file_lm, model_file_class = (
    os.path.join(model_cache_dir, file_name)
    for file_name in (f'LM__{today}.json', f'CLASS__{today}.json')
)

train_file, valid_file = (
    os.path.join(data_dir, file_name)
    for file_name in ('train.csv', 'valid.csv')
)

In [6]:
# Load the training and validation splits as DataFrames (train first, then valid).
train, valid = (pd.read_csv(csv_path) for csv_path in (train_file, valid_file))

In [7]:
# A single vectorizer, configured from the config constants, is shared by both splits.
vectorizer = IndexVectorizer(
    max_words=MAX_VOCAB_SIZE,
    min_frequency=MIN_WORD_FREQ,
    start_end_tokens=STAT_END_TOK,
    tokenize=TOKENIZE,
)

# Wrap the 'text' column of each DataFrame; the train dataset is constructed
# before the valid one, as in the original cell.
train_ds, valid_ds = (
    TextDataset(data=frame, vectorizer=vectorizer, text_col='text')
    for frame in (train, valid)
)

In [8]:
# Sanity check: dataset sizes, then the vocabulary size next to the value it
# is expected to have.
print(
    f"{len(train_ds)} {len(valid_ds)}",
    "Next line should be 11665",
    f"Vocab size: {len(vectorizer.vocabulary)}",
    sep="\n",
)

5000 1000
Next line should be 11665
Vocab size: 11665


In [9]:
# Both loaders are configured identically; only the wrapped dataset differs.
lm_loader_kwargs = dict(
    target_seq_len=target_seq_len,
    shuffle=True,
    max_seq_len=max_seq_len,
    min_seq_len=min_seq_len,
    p_half_seq_len=0.05,
    batch_size=batch_size,
)

train_dl = LMDataLoader(dataset=train_ds, **lm_loader_kwargs)
valid_dl = LMDataLoader(dataset=valid_ds, **lm_loader_kwargs)

In [10]:
# Seed PyTorch's random number generators for reproducibility.
# torch.manual_seed seeds the CPU generator and the CUDA generators, so a
# single unconditional call covers both the GPU and the CPU-only case. The
# previous GPU branch seeded only the current CUDA device and left the CPU
# generator unseeded.
# The trailing ';' suppresses the returned Generator's repr in the cell output.
torch.manual_seed(303);

In [11]:
# Timestamp (second resolution) identifying this run.
runtime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# The checkpoint path `model_file_lm` is already defined in the IO setup cell;
# the former no-op self-assignment `model_file_lm = model_file_lm` was removed.

# Build the language model from the config constants. Its weights are
# initialised separately through lm.init_weights().
lm = RNNLM(device, vectorizer.vocabulary_size, embedding_dim, hidden_dim, batch_size, 
           dropout = dropout, 
           tie_weights = False, 
           num_layers = lstm_layers, 
           bidirectional = lstm_bidirection, 
           word2idx = vectorizer.word2idx,
           log_softmax = False)

In [12]:
# Move the model onto the selected device when a GPU is in use, then call its
# weight initialiser.
lm = lm.to(device) if use_gpu else lm
lm.init_weights()

In [13]:
# Loss: cross-entropy.
loss = nn.CrossEntropyLoss()

# Optimizer: one Adam parameter group per sub-module, each with an explicit
# learning rate of 1e-3. The optimizer-level lr=0.01 is only the default for
# groups that do not set 'lr', so it is overridden for every group here.
optimizer = torch.optim.Adam(
    [
        {'params': layer.parameters(), 'lr': 1e-3}
        for layer in (lm.encoder, lm.lstm1, lm.lstm2, lm.lstm3, lm.decoder)
    ],
    lr=0.01,
)

# Learning-rate schedule: five max_lrs entries, matching the five parameter groups.
# NOTE(review): 5000 and 196 are hard-coded; presumably the training-set size
# and the number of batches per epoch -- confirm against CyclicLR.
scheduler = CyclicLR(
    optimizer,
    max_lrs=[0.01] * 5,
    mode='ulmfit',
    ratio=1.5,
    cut_frac=0.4,
    n_epochs=num_epochs,
    batchsize=5000/196,
    verbose=False,
    epoch_length=5000,
)

# Run training; training_loop receives model_file_lm as best_model_path.
history = training_loop(
    batch_size, num_epochs, display_epoch_freq,
    lm, loss, optimizer, scheduler, device,
    train_dl, valid_dl,
    best_model_path=model_file_lm,
)

Epoch: 0000; Loss: 6.5737; Val-Loss 6.2891; Perplexity 716.0379; Val-Perplexity 538.6859
Sample: this is a bad a amazing i after the or ' . but pad
Epoch: 0010; Loss: 4.7324; Val-Loss 4.7321; Perplexity 113.5681; Val-Perplexity 113.5374
Sample: this is a bad surprise and let us feel so much that were stay
Epoch: 0020; Loss: 4.4855; Val-Loss 4.6682; Perplexity 88.7174; Val-Perplexity 106.5041
Sample: this is a bad value . the walls looked nice but i did buck
Epoch: 0030; Loss: 4.3742; Val-Loss 4.6715; Perplexity 79.3780; Val-Perplexity 106.8568
Sample: this is a bad spot , when they found more special . did n't
Epoch: 0040; Loss: 4.2964; Val-Loss 4.6882; Perplexity 73.4375; Val-Perplexity 108.6614
Sample: this is a bad view of the road ( beautiful bars on old no
Epoch: 0050; Loss: 4.2495; Val-Loss 4.7151; Perplexity 70.0734; Val-Perplexity 111.6178
Sample: this is a bad experience on my 2nd of our meal .    i
Epoch: 0060; Loss: 4.2140; Val-Loss 4.7464; Perplexity 67.6289; Val-Perplexity

In [14]:
# Sample from the model with a seed text and length=10, then look each sampled
# index up in lm.idx2word to display the tokens.
list(map(lm.idx2word.__getitem__, lm.sample(seed='I see a silly ', length=10)))

['i',
 'see',
 'a',
 'silly',
 'vet',
 '"',
 'western',
 '"',
 'sauce',
 'that',
 'had',
 'me',
 'overwhelmed',
 'someone']

In [16]:
# Persist the fitted vectorizer next to the notebook. The context manager
# guarantees the file is flushed and closed even if pickling raises; the
# previous bare open() call left the handle open.
with open('lm_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [17]:
# Count how many (X, y) batches one pass over the training loader yields.
# `i` stays 0 when the loader is empty; X and y keep the last batch afterwards.
i = 0
for i, (X, y) in enumerate(train_dl, start=1):
    pass
print(i)

197
