In [547]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.utils.data
import torch.nn.functional as F
import json
import math

In [575]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-encoded (question, answer) token-id pairs.

    The JSON file is expected to hold a list whose entries are
    ``[question_ids, answer_ids]``; each item is returned as a pair of
    ``LongTensor`` objects.

    The base class is spelled out as ``torch.utils.data.Dataset`` so that
    re-running this cell does not make the class inherit from its own
    previous definition (this class reuses the name ``Dataset``).

    Args:
        path: location of the encoded-pairs JSON file. Defaults to
            ``'pairs_encoded.json'`` in the working directory.
    """

    def __init__(self,path='pairs_encoded.json'):
        # Use a context manager so the file handle is closed right after parsing.
        with open(path) as handle:
            self.pair=json.load(handle)
        self.length=len(self.pair)

    def __getitem__(self,index):
        """Return ``(question, answer)`` for ``index`` as 64-bit integer tensors."""
        question_ids,answer_ids=self.pair[index][0],self.pair[index][1]
        # LongTensor (int64) is the index dtype nn.Embedding accepts.
        q=torch.LongTensor(question_ids)
        a=torch.LongTensor(answer_ids)
        return q,a

    def __len__(self):
        """Number of (question, answer) pairs loaded from the file."""
        return self.length




In [697]:
# Shuffled mini-batches of 100 (question, answer) pairs.
# Page-locked host memory only helps host-to-GPU copies, and asking for it on a
# CPU-only machine just emits a warning, so request it only when CUDA exists.
trainload=torch.utils.data.DataLoader(
    Dataset(),
    batch_size=100,
    shuffle=True,
    pin_memory=torch.cuda.is_available(),
)



In [698]:
# Pull a single batch from the loader to sanity-check tensor shapes.
# The loader shuffles, so a different batch is drawn on every run of this cell.
q,a=next(iter(trainload))
a.shape

torch.Size([100, 27])

In [699]:
# Shape of the question batch: (batch_size, question_length).
q.shape

torch.Size([100, 25])

In [700]:
# Shape of the answer batch: (batch_size, answer_length).
a.shape

torch.Size([100, 27])

In [720]:
#masks
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def mask(question, reply_input, reply_target):
    """Build the three masks used for one training batch.

    Token id 0 is treated as padding throughout.

    Args:
        question: encoder token ids, (batch_size, max_words).
        reply_input: decoder input token ids, (batch_size, max_words).
        reply_target: decoder target token ids, (batch_size, max_words).

    Returns:
        question_mask: (batch_size, 1, 1, max_words), True on real tokens,
            moved to ``device``.
        reply_input_mask: (batch_size, 1, max_words, max_words), padding mask
            combined with a lower-triangular "no peeking ahead" mask.
        reply_target_mask: (batch_size, max_words), True on real target tokens.
    """

    def causal(length):
        # Lower-triangular ones: position i may attend to positions <= i.
        lower = torch.tril(torch.ones(length, length)).type(dtype=torch.uint8)
        return lower.unsqueeze(0)

    # Encoder side: only padding needs hiding.
    question_mask = question.ne(0).to(device)[:, None, None]   # (batch_size, 1, 1, max_words)

    # Decoder side: hide padding AND every future position.
    not_pad = reply_input.ne(0).unsqueeze(1)                    # (batch_size, 1, max_words)
    no_future = causal(reply_input.size(-1)).type_as(not_pad.data)
    reply_input_mask = (not_pad & no_future).unsqueeze(1)       # (batch_size, 1, max_words, max_words)

    # Loss side: ignore padded target positions.
    reply_target_mask = reply_target.ne(0)                      # (batch_size, max_words)

    return question_mask, reply_input_mask, reply_target_mask

In [721]:
# Small demo of the causal mask used by the decoder.
size=5
torch.triu(torch.ones(size,size)).transpose(0,1)  # lower-triangular matrix: row i keeps positions <= i, hiding future words

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [722]:
# Padding mask for the first question of the batch: True where the token id is non-zero.
q[0]!=0

tensor([ True, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False])

In [723]:
class Embeddings(nn.Module):
    """Token embedding scaled by sqrt(dim) plus fixed sinusoidal positions.

    Args:
        size: vocabulary size (number of rows in the embedding table).
        dim: embedding width.
        maxlen: longest sequence the positional table covers (default 50).

    The positional table is stored as a non-persistent buffer named
    ``positional_encoder`` with shape (1, maxlen, dim): it follows the module
    through ``.to(...)`` but is not written to ``state_dict()``.
    """
    def __init__(self,size,dim,maxlen=50):
        super(Embeddings,self).__init__()
        self.dim=dim
        self.dropout=nn.Dropout(0.1)
        self.embed=nn.Embedding(size,dim)
        # A buffer (rather than a plain attribute pinned to a global device)
        # keeps the table on the same device as the embedding weights.
        self.register_buffer(
            'positional_encoder',
            self.create_positional_encoder(maxlen,dim),
            persistent=False,
        )

    def create_positional_encoder(self,maxlen,dim):
        """Return the (1, maxlen, dim) sinusoidal position table.

        For position ``pos`` and pair index ``k``:
            table[pos, 2k]   = sin(pos / 10000**(2k/dim))
            table[pos, 2k+1] = cos(pos / 10000**(2k/dim))

        Every position is filled. The earlier loop-based version returned
        from inside the position loop, leaving all rows after the first zero.
        """
        positions=torch.arange(maxlen,dtype=torch.float).unsqueeze(1)   # (maxlen, 1)
        even_index=torch.arange(0,dim,2,dtype=torch.float)              # values 2k
        inv_freq=torch.exp(even_index*(-math.log(10000.0)/dim))        # 10000**(-2k/dim)
        angles=positions*inv_freq                                      # (maxlen, ceil(dim/2))

        positionalencoder=torch.zeros(maxlen,dim)
        positionalencoder[:,0::2]=torch.sin(angles)
        # An odd dim has one fewer cosine column than sine columns.
        positionalencoder[:,1::2]=torch.cos(angles[:,:dim//2])
        return positionalencoder.unsqueeze(0) #(1,maxlen,dim)

    def forward(self,words):
        """Embed ``words`` (batch_size, maxwords) -> (batch_size, maxwords, dim)."""
        embedding=self.embed(words)*math.sqrt(self.dim) #(batch_size,maxwords,dim)
        # Slice the table to the actual sequence length; broadcasts over the batch.
        embedding=embedding+self.positional_encoder[:,:embedding.size(1)]
        embedding=self.dropout(embedding)
        return embedding




In [724]:
class Multiheadattention(nn.Module):
    """Scaled dot-product attention split across ``head`` parallel heads.

    Args:
        head: number of attention heads; must divide ``dim``.
        dim: model width. Each head works on ``dim // head`` features.
    """
    def __init__(self,head,dim):
        super(Multiheadattention,self).__init__()
        assert dim % head==0
        self.dk=dim//head
        self.head=head
        self.dropout=nn.Dropout(0.1)
        # Separate learned projections for queries, keys and values.
        self.query=nn.Linear(dim,dim)
        self.key=nn.Linear(dim,dim)
        self.val=nn.Linear(dim,dim)
        # Output projection applied after the heads are concatenated.
        self.concatlayer=nn.Linear(dim,dim)

    def forward(self,query,key,val,mask):
        """Attend from ``query`` over ``key``/``val``.

        Positions where ``mask == 0`` get a score of -1e9 before the softmax.
        Returns a tensor of shape (batch_size, query_len, dim).
        """
        batch=query.shape[0]

        def split_heads(projected):
            # (batch, words, dim) -> (batch, head, words, dk)
            return projected.view(projected.shape[0],-1,self.head,self.dk).transpose(1,2)

        q_heads=split_heads(self.query(query))
        k_heads=split_heads(self.key(key))
        v_heads=split_heads(self.val(val))

        # Similarity of every query position with every key position, per head.
        score=torch.matmul(q_heads,k_heads.transpose(-2,-1))/math.sqrt(self.dk)
        score=score.masked_fill(mask==0,-1e9)
        weights=self.dropout(F.softmax(score,dim=-1))

        # Weighted sum of values, then glue the heads back together.
        contxt=torch.matmul(weights,v_heads)
        merged=contxt.transpose(1,2).contiguous().view(batch,-1,self.head * self.dk)
        return self.concatlayer(merged)

In [725]:
class forward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        dim: input and output width.
        middim: width of the hidden layer (default 2048).
    """
    def __init__(self,dim,middim=2048):
        super().__init__()
        self.fullyconnected1=nn.Linear(dim,middim)
        self.fullyconnected2=nn.Linear(middim,dim)
        self.dropout=nn.Dropout(0.1)

    def forward(self,x):
        """Apply the two-layer network to the last dimension of ``x``."""
        hidden=self.fullyconnected1(x)
        hidden=self.dropout(F.relu(hidden))
        return self.fullyconnected2(hidden)


In [726]:
#encoder layer
class Encoder(nn.Module):
    """One encoder layer: self-attention then feed-forward, each with a
    residual connection followed by layer normalisation.

    Note that a single ``LayerNorm`` instance (and a single ``Dropout``) is
    reused for both sub-layers, so they share normalisation parameters.

    Args:
        dim: model width.
        head: number of attention heads.
    """
    def __init__(self,dim,head):
        super().__init__()
        self.selfmultiheadattn=Multiheadattention(head,dim)
        self.feedforward=forward(dim)
        self.layernormalization=nn.LayerNorm(dim)
        self.dropout=nn.Dropout(0.1)

    def forward(self,embedding,mask):
        """Encode ``embedding`` under ``mask``; the output shape matches the input."""
        # Sub-layer 1: self-attention + residual + norm.
        attended=self.selfmultiheadattn(embedding,embedding,embedding,mask)
        attended=self.dropout(attended)
        intlayer=self.layernormalization(attended+embedding)

        # Sub-layer 2: feed-forward + residual + norm.
        ffout=self.feedforward(intlayer)
        ffout=self.dropout(ffout)
        return self.layernormalization(ffout+intlayer)



In [727]:
class Decoder(nn.Module):
    """One decoder layer: masked self-attention, attention over the encoder
    output, then feed-forward. Each is wrapped in dropout, a residual
    connection and layer normalisation.

    The same ``LayerNorm`` instance is applied after all three sub-layers.

    Args:
        dim: model width.
        head: number of attention heads.
    """
    def __init__(self,dim,head):
        super().__init__()
        self.layernormalization=nn.LayerNorm(dim)
        self.selfmultiheadattn=Multiheadattention(head,dim)
        self.sourcemultiheadattn=Multiheadattention(head,dim)
        self.feedforward=forward(dim)
        self.dropout=nn.Dropout(0.1)

    def forward(self,embedding,encod,sourcemask,targetmask):
        """Decode one layer.

        Args:
            embedding: decoder-side input representations.
            encod: encoder output attended to by the second sub-layer.
            sourcemask: mask applied when attending over ``encod``.
            targetmask: mask applied to decoder self-attention.
        """
        # Sub-layer 1: self-attention over the reply so far.
        selfattn=self.selfmultiheadattn(embedding,embedding,embedding,targetmask)
        query=self.layernormalization(self.dropout(selfattn)+embedding)

        # Sub-layer 2: queries from the decoder, keys/values from the encoder.
        crossattn=self.sourcemultiheadattn(query,encod,encod,sourcemask)
        intlayer=self.layernormalization(self.dropout(crossattn)+query)

        # Sub-layer 3: position-wise feed-forward.
        feedforwardoutput=self.dropout(self.feedforward(intlayer))
        return self.layernormalization(feedforwardoutput+intlayer)


In [728]:
class finaltransformer(nn.Module):
    """Encoder-decoder transformer over a single shared vocabulary.

    Args:
        dim: model width.
        head: attention heads per layer.
        numlayer: number of encoder layers and, separately, decoder layers.
        wordmap: token -> id mapping; its length sets the vocabulary size.

    One ``Embeddings`` module is shared by the encoder and decoder inputs.
    """
    def __init__(self,dim,head,numlayer,wordmap):
        super().__init__()
        self.dim=dim
        self.vocabsize=len(wordmap)
        self.embed=Embeddings(self.vocabsize,dim)
        self.encod=nn.ModuleList(Encoder(dim,head) for _ in range(numlayer))
        self.decod=nn.ModuleList(Decoder(dim,head) for _ in range(numlayer))
        # Projects decoder states onto vocabulary scores.
        self.logits=nn.Linear(dim,self.vocabsize)

    def encode(self,sourcewords,sourcemask):
        """Embed the source tokens and run them through every encoder layer."""
        hidden=self.embed(sourcewords)
        for layer in self.encod:
            hidden=layer(hidden,sourcemask)
        return hidden

    def decode(self,target_word,target_mask,sourceembed,sourcemask):
        """Embed the target tokens and run every decoder layer against ``sourceembed``."""
        hidden=self.embed(target_word)
        for layer in self.decod:
            hidden=layer(hidden,sourceembed,sourcemask,target_mask)
        return hidden

    def forward(self,sourceword,sourcemask,targetword,targetmask):
        """Return log-probabilities over the vocabulary for every target position."""
        memory=self.encode(sourceword,sourcemask)
        states=self.decode(targetword,targetmask,memory,sourcemask)
        scores=self.logits(states)
        return F.log_softmax(scores,dim=2)


In [729]:
class Adamw:
    """Warm-up learning-rate schedule wrapped around an optimizer.

    Despite the name this is not the AdamW optimizer: it sets the learning
    rate of every parameter group before delegating to ``optimizer.step()``.

        lr = modelsize**-0.5 * min(cstep**-0.5, cstep * steps**-1.5)

    Args:
        modelsize: model width used to scale the rate.
        steps: number of warm-up steps.
        optimizer: object exposing ``param_groups`` and ``step()``.
    """
    def __init__(self,modelsize,steps,optimizer):
        self.modelsize=modelsize
        self.steps=steps
        self.optimizer=optimizer
        self.cstep=0   # number of optimizer steps taken so far
        self.lr=0      # learning rate applied on the most recent step

    def getlr(self):
        """Learning rate for the current step count.

        Returns 0.0 before the first step: the formula's limit there is 0, and
        evaluating ``0 ** -0.5`` directly would raise ``ZeroDivisionError``.
        """
        if self.cstep<=0:
            return 0.0
        return self.modelsize**(-0.5) *min(self.cstep**(-0.5),self.cstep*self.steps**(-1.5))


    def step(self):
        """Advance the step counter, apply the new rate, and step the optimizer."""
        self.cstep+=1
        lr=self.getlr()
        for x in self.optimizer.param_groups:
            x['lr']=lr
        self.lr=lr
        self.optimizer.step()


In [730]:
class Losssmooth(nn.Module):
    """Label-smoothed KL-divergence loss averaged over non-padded tokens.

    Args:
        size: vocabulary size (width of the prediction's last dimension).
        smoothing: probability mass taken from the true class and spread
            evenly over the other ``size - 1`` classes.
    """
    def __init__(self,size,smoothing):
        super().__init__()
        # reduction='none' is the supported spelling of the deprecated
        # size_average=False / reduce=False pair: keep the element-wise loss.
        self.criterion=nn.KLDivLoss(reduction='none')
        self.confidence=1.0-smoothing
        self.smooth=smoothing
        self.size=size

    def forward(self,pred,target,mask):
        """Compute the masked mean loss.

        Args:
            pred: log-probabilities, (..., size).
            target: gold token ids with the same leading shape as ``pred``.
            mask: truthy where a target position counts toward the loss.

        Returns:
            Scalar tensor: summed per-token loss divided by the number of
            unmasked tokens (0 when every position is masked).
        """
        mask=mask.float().reshape(-1)
        pred=pred.reshape(-1,pred.size(-1))
        target=target.reshape(-1)
        # Smoothed target distribution; built without gradient tracking.
        label=torch.full_like(pred,self.smooth/(self.size-1))
        label.scatter_(1,target.unsqueeze(1),self.confidence)
        loss=self.criterion(pred,label)
        # clamp keeps an all-padding batch from dividing by zero (NaN loss).
        loss=(loss.sum(1)*mask).sum()/mask.sum().clamp(min=1)
        return loss



In [731]:
#model creation: hyper-parameters and vocabulary
dim=512      # model / embedding width
head=8       # attention heads per layer (must divide dim)
numlayer=3   # number of encoder layers and of decoder layers
epochs=10    # full passes over the training data
# Token -> id vocabulary; its length sets the model's vocabulary size.
with open('wordmap.json','r') as read1:
    wordmap=json.load(read1)

In [732]:
# Build the model from the hyper-parameters above and move it to the chosen device.
ourtransformer=finaltransformer(dim=dim,head=head,numlayer=numlayer,wordmap=wordmap)
ourtransformer=ourtransformer.to(device)


In [733]:
# lr=0 is only a placeholder: the Adamw wrapper overwrites the learning rate
# of every parameter group on each step() using its warm-up formula.
optimizer=torch.optim.Adam(ourtransformer.parameters(),lr=0,betas=(0.9,0.98),eps=1e-9)
# steps=4000 is the warm-up length used by the schedule.
ourtransformer_optimizer=Adamw(modelsize=dim,steps=4000,optimizer=optimizer)
# Label-smoothed loss over the whole vocabulary.
criterion=Losssmooth(len(wordmap),smoothing=0.2)

In [734]:
def train(trainload,ourtransformer,criterion,epoch):
    """Run one training epoch with teacher forcing.

    Uses the module-level ``device``, ``mask`` and ``ourtransformer_optimizer``.

    Args:
        trainload: iterable of (question, answer) batches; must support ``len``.
        ourtransformer: model called as ``model(q, q_mask, a_in, a_in_mask)``.
        criterion: loss called as ``criterion(output, target, target_mask)``.
        epoch: epoch number, used only in the progress message.

    Prints the running sample-weighted mean loss every 100 batches.
    """
    ourtransformer.train()
    weighted_loss=0
    seen=0
    for step,(q,a) in enumerate(trainload):
        batch_size=q.shape[0]
        q,a=q.to(device),a.to(device)

        # Teacher forcing: feed all but the last token, predict all but the first.
        a_inp,a_targ=a[:,:-1],a[:,1:]
        question_mask,reply_input_mask,reply_target_mask=mask(q,a_inp,a_targ)

        output=ourtransformer(q,question_mask,a_inp,reply_input_mask)
        loss=criterion(output,a_targ,reply_target_mask)

        ourtransformer_optimizer.optimizer.zero_grad()
        loss.backward()
        ourtransformer_optimizer.step()

        # Weight by batch size so a short final batch does not skew the mean.
        weighted_loss+=loss.item()*batch_size
        seen+=batch_size
        if step%100==0:
            print('Epoch [{}][{}/{}]\tLoss:{:.3f}'.format(epoch,step,len(trainload),weighted_loss/seen))



In [735]:
#evaluate
def eval(finaltransformer,ques,quesmask,maxlen,wordmap):
    """Greedy-decode a reply for one encoded question.

    Note: this function shadows the builtin ``eval`` inside the notebook.

    Args:
        finaltransformer: model exposing ``eval()``, ``encode``, ``decode``
            and a ``logits`` projection.
        ques: question token ids, expected shape (1, question_len).
        quesmask: padding mask for ``ques``, passed through to the model.
        maxlen: maximum reply length including ``<start>``; at most
            ``maxlen - 1`` tokens are generated.
        wordmap: token -> id mapping containing ``'<start>'`` and ``'<end>'``.

    Returns:
        The generated tokens joined by single spaces ('' if ``<end>`` is
        predicted immediately).
    """
    revwordmap={v:k for k,v in wordmap.items()}
    start=wordmap['<start>']
    end=wordmap['<end>']
    finaltransformer.eval()
    # Keep every new tensor on the same device as the question.
    dev=ques.device
    generated=[]
    with torch.no_grad():   # inference only: no autograd graph needed
        encod=finaltransformer.encode(ques,quesmask)
        word=torch.tensor([[start]],dtype=torch.long,device=dev) #greedy search
        for step in range(maxlen-1):
            # The mask must cover the sequence length (dim 1), not the batch (dim 0).
            size=word.shape[1]
            targetmask=torch.tril(torch.ones(size,size,device=dev)).bool()
            targetmask=targetmask.unsqueeze(0).unsqueeze(0)   # (1, 1, size, size)
            decod=finaltransformer.decode(word,targetmask,encod,quesmask)
            # Score only the newest position and take the most likely token.
            pred=finaltransformer.logits(decod[:,-1])
            nextw=pred.argmax(dim=1).item()
            if nextw==end:
                break
            generated.append(nextw)
            # Grow the decoder input so the next step sees the new token.
            word=torch.cat([word,torch.tensor([[nextw]],dtype=torch.long,device=dev)],dim=1)
    # Build the sentence once decoding has finished, dropping any <start> ids.
    sen=[x for x in generated if x!=start]
    fullsen=' '.join(revwordmap[x] for x in sen)
    return fullsen


In [737]:
# Train for `epochs` passes over the data, writing one checkpoint file per epoch.
for epoch in range(epochs):

    train(trainload, ourtransformer, criterion, epoch)

    # The whole model and scheduler objects are saved (not their state_dicts),
    # so loading a checkpoint presumably needs these class definitions available.
    state = {'epoch': epoch, 'transformer': ourtransformer, 'transformer_optimizer': ourtransformer_optimizer}
    torch.save(state, 'checkpoint_' + str(epoch) + '.pth.tar')


Epoch [0][0/2217]	Loss:7.464


KeyboardInterrupt: 