In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import copy
import math

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import random
import sys
import pickle
import argparse

# Compute device shared by the whole notebook: the first CUDA GPU when torch
# reports one as available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Width of the pre-computed vectors; it also selects which pickle files are read.
d_model=50

# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only point these paths at files you generated yourself or otherwise trust.
#
# Each file holds three consecutively pickled objects, read back in the order
# x0, x1, Y. The `with` blocks guarantee the handle is closed even when one of
# the loads raises.
TRAIN_DATA_PATH = '../data/train_data_'+str(d_model)+'d.pkl'
with open(TRAIN_DATA_PATH, "rb") as f:
    train_x0=pickle.load(f)
    train_x1=pickle.load(f)
    train_Y=pickle.load(f)

VAL_DATA_PATH = '../data/dev_data_'+str(d_model)+'d.pkl'
with open(VAL_DATA_PATH, "rb") as f:
    val_x0=pickle.load(f)
    val_x1=pickle.load(f)
    val_Y=pickle.load(f)

# First dimension of the training tensor: number of training examples
# (used later to lay out the batch start offsets).
d_len=train_x0.size(0)
# Second dimension: per-example length (passed to make_model as max_len).
s_len=train_x0.size(1)
print(train_x0.size())
print(train_x1.size())
print(train_Y.size())

torch.Size([11530, 18, 50])
torch.Size([11530, 18, 50])
torch.Size([11530])


In [3]:
class AttentionEnc(nn.Module):
    """Pair classifier that scores the difference of two encoded inputs.

    Both inputs go through the same ``embedding`` and ``encoder`` modules.
    Each result is flattened to ``d_model * max_len`` features per row, and a
    single linear layer maps the element-wise difference to ``output_dim``
    logits.
    """

    def __init__(self, encoder, embedding, d_model, max_len, output_dim=2):
        super().__init__()
        self.encoder = encoder
        self.embed = embedding
        # Number of features per row once an encoded input is flattened.
        self.size = d_model * max_len
        self.linear = nn.Linear(self.size, output_dim)

    def _flatten_encoded(self, x):
        """Embed and encode ``x``, then reshape the result to (-1, self.size)."""
        encoded = self.encoder(self.embed(x))
        return encoded.view(-1, self.size)

    def forward(self, x0, x1):
        """Return the logits for the pair (x0, x1)."""
        left = self._flatten_encoded(x0)
        right = self._flatten_encoded(x1)
        return self.linear(left - right)

In [4]:
def clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

In [5]:
class Encoder(nn.Module):
    """A stack of N copies of `layer` followed by one final LayerNorm."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        # The final normalisation is sized from the layer's own `size` attribute.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask=None):
        """Feed x (together with mask) through every layer in order, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [6]:
class LayerNorm(nn.Module):
    """Normalise the last dimension, then apply a learned gain and bias.

    `a_2` (gain, initialised to ones) and `b_2` (bias, initialised to zeros)
    both have `features` elements. `eps` is added to the standard deviation
    before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return a_2 * (x - mean) / (std + eps) + b_2 over the last dimension."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma = x.std(dim=-1, keepdim=True)
        # Apply the gain before dividing, so the arithmetic order is
        # multiply -> divide -> add.
        scaled = self.a_2 * (x - mu)
        return scaled / (sigma + self.eps) + self.b_2

In [7]:
class SublayerConnection(nn.Module):
    """
    Pre-norm residual wrapper around an arbitrary sublayer.

    The input is layer-normalised first, handed to the sublayer, passed
    through dropout, and finally added back onto the un-normalised input.
    """
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Return x plus the dropped-out output of `sublayer` applied to norm(x)."
        normed = self.norm(x)
        transformed = sublayer(normed)
        return x + self.dropout(transformed)

In [8]:
class EncoderLayer(nn.Module):
    "One encoder block: self-attention, then a feed-forward network, each in its own residual wrapper."
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Two residual wrappers: index 0 for attention, index 1 for feed-forward.
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Apply the self-attention sublayer, then the feed-forward sublayer."
        def attend(normed):
            # Self-attention: query, key and value are all the same tensor.
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

In [9]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are query @ key^T divided by sqrt(size of the last query dimension).
    Positions where `mask == 0` are filled with -1e9 before the softmax over
    the last dimension. If `dropout` is given, it is applied to the attention
    weights.

    Returns a tuple (weights @ value, weights).
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)
    p_attn = weights if dropout is None else dropout(weights)
    return torch.matmul(p_attn, value), p_attn

In [10]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention with `h` heads over a `d_model`-wide input.

    Four d_model x d_model linear layers are held in `linears`: the first
    three project query, key and value; the last one is applied to the
    re-joined head outputs. The attention weights of the most recent forward
    pass are kept in `attn`.
    """
    def __init__(self, h, d_model, dropout=0.1):
        "Take in the number of heads and the model size."
        super().__init__()
        assert d_model % h == 0
        # Per-head width; the value width is taken to equal the key width.
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Project the inputs, attend per head, re-join the heads and project once more."
        if mask is not None:
            # Insert a head axis so the same mask is applied to every head.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def split_heads(projection, tensor):
            # d_model -> (h, d_k), with the head axis moved in front of the
            # sequence axis.
            projected = projection(tensor)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        q_heads = split_heads(self.linears[0], query)
        k_heads = split_heads(self.linears[1], key)
        v_heads = split_heads(self.linears[2], value)

        # Attention for all heads in one batched call.
        context, self.attn = attention(q_heads, k_heads, v_heads,
                                       mask=mask, dropout=self.dropout)

        # Move the head axis back and merge (h, d_k) into a single feature axis.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](merged)

In [11]:
class PositionwiseFeedForward(nn.Module):
    "Two-layer feed-forward network: linear -> ReLU -> dropout -> linear."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        "Expand to d_ff, apply ReLU and dropout, then project back to d_model."
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [12]:
class Embeddings(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(d_model)."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the indices in `x` and scale the vectors by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

In [13]:
class PositionalEncoding(nn.Module):
    """Scale the input by sqrt(d_model), add sinusoidal position codes, apply dropout.

    The position table is computed once in the constructor and stored as the
    non-trainable buffer `pe` with shape (1, max_len, d_model). Even feature
    columns hold sines and odd columns hold cosines.

    Args:
        d_model: width of the feature (last) dimension; may be even or odd.
        dropout: dropout probability applied to the output.
        max_len: number of positions pre-computed. `forward` slices the table
            to the input's second dimension, so inputs longer than `max_len`
            cannot be added to it.
    """
    def __init__(self, d_model, dropout, max_len=50):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angle table to fit. For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        self.d_model = d_model

    def forward(self, x):
        """Return dropout(x * sqrt(d_model) + pe[:, :x.size(1)])."""
        x = x * math.sqrt(self.d_model)
        # `pe` is a registered buffer and never requires grad, so it needs no
        # (deprecated) Variable wrapper.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [14]:
def make_model(d_model, max_len, N=6, d_ff=256, h=5, dropout=0.1):
    """Helper: construct an AttentionEnc model from hyperparameters.

    Args:
        d_model: feature width of the inputs; must be divisible by `h`.
        max_len: per-example length; the classifier head expects
            d_model * max_len flattened features.
        N: number of stacked encoder layers.
        d_ff: hidden width of the position-wise feed-forward network.
        h: number of attention heads.
        dropout: dropout probability used by the attention, feed-forward,
            residual and positional-encoding modules.

    Returns:
        The assembled model, with every parameter of more than one dimension
        re-initialised with Xavier/Glorot uniform.
    """
    # Forward `dropout` to the attention module as well; previously it was
    # silently left at the module's own default of 0.1.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    # Pre-compute enough positions for `max_len`. The floor of 50 keeps the
    # `pe` buffer shape unchanged for max_len <= 50, so state dicts saved by
    # earlier runs still load.
    position = PositionalEncoding(d_model, dropout, max_len=max(max_len, 50))
    # Each module is used exactly once here, and Encoder deep-copies the layer
    # prototype itself, so no extra copies are needed.
    model = AttentionEnc(
        Encoder(EncoderLayer(d_model, attn, ff, dropout), N),
        position, d_model, max_len)

    # Initialize parameters with Glorot / fan_avg. The trailing-underscore
    # form is the in-place initialiser; the un-suffixed name is deprecated.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [15]:
def generate_batch(x0, x1, Y, batch_size, start_i, target_device=None):
    """Slice one mini-batch out of the dataset and move it to the compute device.

    Rows start_i .. start_i + batch_size - 1 are taken; the slice is clamped
    at the end of the data, so the last batch may be shorter.

    Args:
        x0, x1: tensors indexed by example along the first dimension.
        Y: labels indexed the same way (a tensor or anything
            torch.as_tensor accepts once sliced).
        batch_size: maximum number of examples in the batch.
        start_i: index of the first example of the batch.
        target_device: device to place the batch on; defaults to the
            notebook-level `device`.

    Returns:
        (x0_batch, x1_batch, Y_batch): the inputs as float tensors and the
        labels as a long tensor, all on the target device. On the CPU the
        batches may share storage with the source tensors, so do not modify
        them in place.
    """
    if target_device is None:
        target_device = device
    stop = start_i + batch_size
    # Slicing replaces the per-row tensor -> numpy -> list -> tensor round
    # trip: same values, no Python loop, and it also works for tensors that
    # cannot be converted with .numpy().
    x0_batch = torch.as_tensor(x0[start_i:stop]).float().to(target_device)
    x1_batch = torch.as_tensor(x1[start_i:stop]).float().to(target_device)
    Y_batch = torch.as_tensor(Y[start_i:stop]).long().to(target_device)
    return x0_batch, x1_batch, Y_batch

In [24]:
def compute_acc(val_x0, val_x1, val_Y, model):
    """Print and return the classification accuracy of `model` on a validation set.

    The model is switched to evaluation mode (so dropout is disabled) and run
    without gradient tracking; its previous train/eval mode is restored before
    returning, even if the forward pass raises.

    Args:
        val_x0, val_x1: the two model inputs for the whole validation set.
        val_Y: the true class indices, one per example.
        model: an nn.Module returning one row of class scores per example.

    Returns:
        Fraction of examples whose highest-scoring class equals the label.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            scores = model(val_x0, val_x1)
    finally:
        model.train(was_training)

    # argmax over raw scores picks the same class as argmax over their
    # log-softmax, so the normalisation step is unnecessary.
    predictions = torch.argmax(scores, dim=1)
    labels = torch.as_tensor(val_Y, device=predictions.device).reshape(-1)
    cor = int((predictions == labels).sum().item())
    accuracy = cor / len(predictions)
    print("current accuracy is :", accuracy)
    return accuracy

In [27]:
# Optimisation hyperparameters.
learning_rate=1e-3
reg=1e-7  # passed to Adam as weight_decay
tot_epoch=200
batch_size=500

# Model, loss and optimiser.
model = make_model(d_model, s_len).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=reg)

# Start offset of every mini-batch in the training set.
# (Use batch_arrange=[0] instead to overfit a single batch as a sanity check.)
batch_arrange = list(range(0, d_len, batch_size))

# Per-batch training losses and per-epoch validation accuracies.
loss_history=[]
val_acc_history=[]



In [28]:
torch.cuda.empty_cache()
# Move the whole validation set to the compute device once, up front.
val_x0 = val_x0.to(device)
val_x1 = val_x1.to(device)
val_Y = val_Y.long().to(device)

for epoch in range(tot_epoch):
    # Make sure dropout is active while training (no effect if the model is
    # already in training mode).
    model.train()
    # Visit the mini-batches in a new random order every epoch.
    random.shuffle(batch_arrange)
    for start_i in batch_arrange:
        x0_batch, x1_batch, Y_batch = generate_batch(train_x0, train_x1, train_Y, batch_size, start_i)
        optimizer.zero_grad()
        y_out = model(x0_batch, x1_batch)
        loss = criterion(y_out, Y_batch)
        loss.backward()
        optimizer.step()
        # Record a plain float: appending the tensor itself would keep every
        # batch's autograd graph alive for the whole run.
        loss_history.append(loss.item())
    # Reports the loss of the last mini-batch of the epoch.
    print("epoch ", epoch, ": current loss ", loss, sep="")
    val_acc_history.append(compute_acc(val_x0, val_x1, val_Y, model))
    # Checkpoint at epochs 1, 21, 41, ...
    # NOTE: the 'models/' directory must already exist; torch.save does not create it.
    if epoch%20==1:
        torch.save(model.state_dict(), 'models/attention_' + str(d_model) +'d_epoch_' + str(epoch) + '.torch')

epoch 0: current loss tensor(0.7479, device='cuda:0', grad_fn=<NllLossBackward>)
current accuracy is : 0.5654273297923709
epoch 1: current loss tensor(0.6874, device='cuda:0', grad_fn=<NllLossBackward>)
current accuracy is : 0.5485272815065186
epoch 2: current loss tensor(0.6656, device='cuda:0', grad_fn=<NllLossBackward>)
current accuracy is : 0.5692901979719942


KeyboardInterrupt: 