# CE7455 Assignment 1
Peng Hongyi (G2105029E)

## Run the provided code at first

### Load data

In [13]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [14]:
import data 

In [15]:
data_dir = './data/wikitext-2'
corpus = data.Corpus(data_dir)
print(f'Train: {corpus.train.shape}, Val: {corpus.valid.shape}, Test: {corpus.test.shape}')

Train: torch.Size([2088628]), Val: torch.Size([217646]), Test: torch.Size([245569])


In [16]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

In [17]:
BATCH_SIZE = 20
EVAL_BATCH_SIZE = 10
train_data = batchify(corpus.train, BATCH_SIZE)
val_data = batchify(corpus.valid, EVAL_BATCH_SIZE)
test_data = batchify(corpus.test, EVAL_BATCH_SIZE)

In [18]:
N_TOKENS = len(corpus.dictionary)
import model
MODEL = "LSTM"
EMSIZE = 200
N_HID = 200
N_LAYERS = 2
DROPOUT = 0.2
TIED = "store_true"
LR = 20
CLIP_TH = 0.25
model = model.RNNModel(MODEL, N_TOKENS, EMSIZE, N_HID, N_LAYERS, DROPOUT, TIED).to(device)

In [19]:
BPTT = 35
def get_batch(source, i):
    seq_len = min(BPTT, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [20]:
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [21]:
from torch import nn
import time
import math
criterion = nn.NLLLoss()
model.train()
total_loss = 0
start_time = time.time()
hidden = model.init_hidden(BATCH_SIZE)
for epoch in range(1):
    for batch, i in enumerate(range(0, train_data.size(0)-1, BPTT)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), CLIP_TH)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-LR)
        total_loss += loss.item()

        if batch % 200 == 0 and batch > 0:
            cur_loss = total_loss / 200
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // BPTT, LR,
                elapsed * 1000 / 200, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


| epoch   0 |   200/ 2983 batches | lr 20.00 | ms/batch 19.42 | loss  7.62 | ppl  2034.00
| epoch   0 |   400/ 2983 batches | lr 20.00 | ms/batch 16.20 | loss  6.80 | ppl   896.94
| epoch   0 |   600/ 2983 batches | lr 20.00 | ms/batch 16.00 | loss  6.37 | ppl   586.04
| epoch   0 |   800/ 2983 batches | lr 20.00 | ms/batch 15.92 | loss  6.20 | ppl   493.94
| epoch   0 |  1000/ 2983 batches | lr 20.00 | ms/batch 15.85 | loss  6.05 | ppl   426.07
| epoch   0 |  1200/ 2983 batches | lr 20.00 | ms/batch 15.88 | loss  5.96 | ppl   387.96
| epoch   0 |  1400/ 2983 batches | lr 20.00 | ms/batch 15.95 | loss  5.85 | ppl   346.41
| epoch   0 |  1600/ 2983 batches | lr 20.00 | ms/batch 15.82 | loss  5.85 | ppl   348.40
| epoch   0 |  1800/ 2983 batches | lr 20.00 | ms/batch 15.52 | loss  5.70 | ppl   297.87
| epoch   0 |  2000/ 2983 batches | lr 20.00 | ms/batch 15.51 | loss  5.66 | ppl   288.17
| epoch   0 |  2200/ 2983 batches | lr 20.00 | ms/batch 15.88 | loss  5.56 | ppl   258.54
| epoch   

### Write my own FNN model

In [22]:
class FNNModel(nn.Module):
    def __init__(self, n_token, n_emb, n_hidden, seq_len):
        super().__init__()
        self.n_token = n_token
        self.n_emb = n_emb 
        self.n_hidden = n_hidden
        self.seq_len = seq_len
        self.encoder = nn.Embedding(n_token, n_emb)
        self.hidden = nn.Linear(n_emb*seq_len, n_hidden)
        self.decoder = nn.Linear(n_hidden, n_token)
    def forward(self, input):
        emb = self.encoder(input)
        batch_size = emb.shape[0]
        emb = emb.view(batch_size, -1)
        emb = torch.tanh(emb)
        out = self.hidden(emb)
        decoded = self.decoder(out)
        decoded = nn.functional.log_softmax(decoded, dim=1)
        return decoded
        

In [81]:
# Own Dataset
class SequenceDataset(torch.utils.data.Dataset):
    def __init__(self, tensor_data, seq_len):
        self.data = tensor_data
        self.seq_len = seq_len
    def __len__(self):
        return len(self.data) - self.seq_len - 1
    def __getitem__(self, i):
        return self.data[i:i+self.seq_len], self.data[i+self.seq_len]


In [82]:
TRAIN_BATCH = 1024
EVAL_BATCH = 10000
SEQ_LEN = 10
train_loader = torch.utils.data.DataLoader(SequenceDataset(corpus.train, seq_len=SEQ_LEN), batch_size=TRAIN_BATCH)
val_loader = torch.utils.data.DataLoader(SequenceDataset(corpus.valid, seq_len=SEQ_LEN), batch_size=EVAL_BATCH)
test_loader = torch.utils.data.DataLoader(SequenceDataset(corpus.test, seq_len=SEQ_LEN), batch_size=EVAL_BATCH)

In [83]:
model = FNNModel(
    n_token=N_TOKENS,
    n_emb=200,
    n_hidden=200,
    seq_len = SEQ_LEN
)

criterion = nn.NLLLoss()
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [84]:
def train():
    model.train()
    total_loss = 0.
    start_time = time.time()
    for i, (X, y) in enumerate(train_loader):
        model.zero_grad()
        out = model(X)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if i%100 == 0 and i > 0:
            cur_loss = total_loss/100
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches |  ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, i, len(train_loader), 
                elapsed * 1000/100, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [85]:
def evaluate():
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for i, (X ,y) in enumerate(val_loader):
            out = model(X)
            loss = criterion(out, y)
            total_loss += loss.item()
        return total_loss/len(val_loader)

In [86]:
SAVE_DIR = 'model.pt'
print("Training Starts")

best_val_loss = None
count = 0
for epoch in range(1, 50+1):
    print("-"*89)
    epoch_start_time = time.time()
    train()
    val_loss = evaluate()
    print("-"*89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
    print('-' * 89)
    if not best_val_loss or val_loss < best_val_loss:
        with open(SAVE_DIR, 'wb') as f:
            torch.save(model, f)
        best_val_loss = val_loss
        count = 0
    else:
        count += 1
    if count >= 5:
        print(f'Early Stop at epoch {epoch}')
        break 


Training Starts
-----------------------------------------------------------------------------------------
| epoch   1 |   100/ 2040 batches |  ms/batch 204.57 | loss  7.91 | ppl  2727.57
| epoch   1 |   200/ 2040 batches |  ms/batch 201.60 | loss  6.96 | ppl  1051.32
| epoch   1 |   300/ 2040 batches |  ms/batch 198.83 | loss  6.88 | ppl   972.76
| epoch   1 |   400/ 2040 batches |  ms/batch 205.47 | loss  6.80 | ppl   902.08
| epoch   1 |   500/ 2040 batches |  ms/batch 196.52 | loss  6.59 | ppl   728.11
| epoch   1 |   600/ 2040 batches |  ms/batch 200.35 | loss  6.56 | ppl   709.69
| epoch   1 |   700/ 2040 batches |  ms/batch 197.50 | loss  6.53 | ppl   686.07
| epoch   1 |   800/ 2040 batches |  ms/batch 199.97 | loss  6.41 | ppl   605.70
| epoch   1 |   900/ 2040 batches |  ms/batch 200.21 | loss  6.43 | ppl   622.46
| epoch   1 |  1000/ 2040 batches |  ms/batch 205.46 | loss  6.40 | ppl   599.25
| epoch   1 |  1100/ 2040 batches |  ms/batch 202.07 | loss  6.38 | ppl   587.06
| e

### Test

In [87]:
with open(SAVE_DIR, 'rb') as f:
    model = torch.load(f)
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for i, (X ,y) in enumerate(test_loader):
            out = model(X)
            loss = criterion(out, y)
            total_loss += loss.item()
        test_loss = total_loss/len(test_loader)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))


| End of training | test loss  5.70 | test ppl   297.93


Apparently, it overfits the tranining data. Train agian with drop out layer and smaller length of sequence

In [None]:
!python TrainWithDropOut.py

In [1]:
from TrainWithDropOut import FNNModelDropout
from TrainWithDropOut import N_TOKENS, SEQ_LEN, DROPOUT_P
model = FNNModelDropout(
    n_token=N_TOKENS,
    n_emb = 200,
    n_hidden=200,
    seq_len=SEQ_LEN,
    dropout_p= DROPOUT_P
    ).to(device)

ImportError: cannot import name 'N_TOKENS' from 'TrainWithDropOut' (c:\Users\HONGYI001\Desktop\NLP Assignment1\examples\word_language_model\TrainWithDropOut.py)

In [27]:
model 
with open('model_dropout.pt', 'rb') as f:
    model = torch.load(f)
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for i, (X ,y) in enumerate(test_loader):
            out = model(X)
            loss = criterion(out, y)
            total_loss += loss.item()
        test_loss = total_loss/len(test_loader)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))

AttributeError: Can't get attribute 'FNNModelDropout' on <module '__main__'>