In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm , tqdm_notebook, notebook
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader , Dataset

In [2]:
PAD_token = 0 
SOS_token = 1
EOS_token = 2


In [3]:
# file read path

import os
import codecs

def read_in(folder):
    files = os.listdir(folder)
    a_list = []
    for a_file in files:
        if not a_file.startswith("."):
            if a_file == "dialogues_train.txt":
                f = codecs.open(folder + a_file, "r", encoding = "ISO-8859-1", errors = "ignore")
                a_list.append(f.read())
                f.close()
    return a_list

In [4]:
#test_list = read_in("/Users/Madara/Desktop/train/")
test_list = read_in("/mnt/c/Prayash/coding/nlp/train/")


In [5]:
#text splitting
new_test_list = [word for word in test_list[0].split(" __eou__ ")]

for index in range(0, 5):
    print(new_test_list[index])



Say , Jim , how about going for a few beers after dinner ?
You know that is tempting but is really not good for our fitness .
What do you mean ? It will help us to relax .
Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?
I guess you are right.But what shall we do ? I don't feel like sitting at home .


In [6]:
#extract features
import nltk
from nltk import word_tokenize
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/prayash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:



features = {}
word_index = 0
    
word_list = [word for text in new_test_list for word in word_tokenize(text.lower()) if word.isalpha()]
for word in word_list:
    if word in features:
        pass
    else:
       word_index += 1
       features[word] = word_index 
   





In [8]:
len(features)
features["PAD"] = len(features)  +1
features["SOS"] = len(features) + 1
features["EOS"] = len(features) + 1
vocab_size = len(features)

In [9]:
class DialogueData(Dataset):
    def __init__(self, data):
        A = [ dialogue for dialogue in data if (data.index(dialogue) % 2 ==0 )]
        B = [ dialogue for dialogue in data if (data.index(dialogue) % 2 != 0 )]
        tokentextA=[]
        tokentextB = []
        for text in A:
            wordA = [word for word in word_tokenize(text.lower()) if word.isalpha() ]
            if len(wordA) <=50:

                tokentextA.append(wordA)
        
        for text in B:
            wordB = [word for word in word_tokenize(text.lower()) if word.isalpha() ]
            if len(wordB) <= 50 :

                tokentextB.append(wordB)

        # max no of token in a sequence
        self.maxlenA = len(max(tokentextA, key = len))
        self.maxlenB  =len (max(tokentextB, key=len))
        self.maxA = max(tokentextA,key = len)
        self.padA = lambda x: x +(self.maxlenA- len(x))* [features["PAD"]]
        self.padB = lambda x: x +(self.maxlenB- len(x))* [features["PAD"]]
        self.encode = lambda x: [features[token] for token in x ]

        sequencesA = [self.encode(sequence) for sequence in tokentextA ]
        self.sequencesA = [self.padA(sequence) for sequence in sequencesA]
        sequencesB = [self.encode(sequence) for sequence in tokentextB ]
        self.sequencesB = [self.padB(sequence) for sequence in sequencesB]


    def __getitem__(self, i ):
        return self.sequencesA[i], self.sequencesB[i]

    
    def __len__(self):
        return len(self.sequencesA)
        




        





        




        








    

In [10]:
xhat = ['are','you','fine']
pader = lambda x: x +(50- len(x))* [features["PAD"]]
stuffs = lambda x: [features[token] for token in x ]

seq = stuffs(xhat)
seq = pader(seq)




In [11]:
dataset = DialogueData(new_test_list)


In [12]:
def collate(batch):
    inputs = torch.LongTensor([item[0] for item in batch])
    targets = torch.LongTensor([item[1] for item in batch])

    return inputs, targets

batch_size = 1000
# for eval 
#batch_size = 1
train_loader = DataLoader(dataset, batch_size= batch_size, collate_fn = collate)
 

In [13]:
torch.tensor(dataset.sequencesA[0]).shape
torch.tensor(dataset.sequencesB[0]).shape


torch.Size([50])

In [14]:


class EncoderRNN(nn.Module):
    def __init__(self,hidden_size,vocab_size, n_layers=1,dropout = 0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout =(0 if n_layers ==1 else dropout),batch_first= True, bidirectional= True)
    
    def forward(self, input_seq, input_length , hidden=None):
        
        embedded = self.embedding(input_seq)
       # packed = pack_padded_sequence(embedded, input_length) #for faster computation
        outputs, hidden = self.gru(embedded, hidden)

        
        outputs = outputs[:,:, :self.hidden_size] + outputs[:,:,self.hidden_size:] # bidirectional gru so  adding both halves 
#for eval
      #  outputs = outputs[:, :self.hidden_size] + outputs[:,self.hidden_size:] 
        return outputs, hidden

    

In [15]:
class Attn(nn.Module):
    def __init__(self, hidden_size,sequence_length):
        super(Attn,self).__init__()
        self.hidden_size = hidden_size
        self.w1 = nn.Linear(sequence_length* hidden_size, hidden_size)
        self.w2 = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size,sequence_length)
        

    

    def forward(self, encoder_output, decoder_state,batch_size):
        encoder_output = encoder_output.reshape(batch_size,-1)
        
        FC = self.w1(encoder_output) + self.w2(decoder_state)
        tan = torch.tanh(FC)
        fc2 = self.V(tan)
        attention_weights = F.softmax(fc2)
        
        return attention_weights

    




In [16]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size,output_size, vocab_size, n_layers = 1,dropout=0.1):
        super(DecoderRNN , self).__init__()
        self.hidden_size = hidden_size
        self.output_size= output_size
        self.n_layers = n_layers
        self.dropout = dropout   #vocab_size == output_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size + dataset.maxlenA, hidden_size, n_layers, dropout = (0 if n_layers==1 else dropout),batch_first = True)
        
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_step, context,last_hidden):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        
        x= torch.cat((embedded, context),dim =1 )
     
        x= x.unsqueeze(1)
        rnn_output, hidden = self.gru(x, last_hidden)
        rnn_output = rnn_output.squeeze(0)
        output = self.out(rnn_output)
       # print("output", output.shape)
        #output = F.softmax(output, dim =2 )
        return output, hidden
        
        
        


        


In [17]:
# USE cross entropy loss
#The model returns tensor from a softmax operation The index with max value is the required ans 
# the ans is the key value for the dictonary which has 1-> hi , 2-> bye key value pairs for words and numbers
# make both index2string and string2index dictionary
# hidden_size,vocab_size,sequence_length, n_layers=1,dropout = 0

In [18]:
criterion = nn.CrossEntropyLoss()
hidden_size = 100
sequence_length = dataset.maxlenA
n_layers = 1
dropout =0.5

sequence_length_B = dataset.maxlenB
device = torch.device('cuda' if torch.cuda.is_available() else  'cpu')


In [19]:
Encode = EncoderRNN(hidden_size, vocab_size, n_layers, dropout)
Attention = Attn(hidden_size,sequence_length)
Decode = DecoderRNN(hidden_size,vocab_size, vocab_size, n_layers, dropout)

def init_hidden():
    return torch.randn(n_layers*2, batch_size, hidden_size).to(device)

def init_hidden_dec():
    return torch.randn(n_layers, batch_size, hidden_size).to(device)

encoder_optim = optim.Adam([p for p in Encode.parameters() if p.requires_grad],lr=0.01)
atten_optim = optim.Adam([p for p in Attention.parameters() if p.requires_grad],lr=0.01)
decoder_optim = optim.Adam([p for p in Decode.parameters() if p.requires_grad],lr=0.01)



In [36]:
Encode=torch.load('encoder.pth')
Decode=torch.load('decoder.pth')
Attention=torch.load('attention.pth')

In [37]:
Encode.train()
Attention.train()
Decode.train()
train_losses=[]
def train():
    for epoch in range (1):
        progress_bar = tqdm(train_loader)
        
        losses = []
        total = 0 
        
        past=0
        for inputs, target in progress_bar:
            lossn=0
           # print(inputs == past)
           # past = inputs
            inputs, target = inputs.to(device) , target.to(device)
            Encode.zero_grad()
            Attention.zero_grad()
            Decode.zero_grad()

            
            hidden = init_hidden()
            if inputs.shape[0]<1000:
                break
            encoder_output, encoder_hidden = Encode(inputs, sequence_length, hidden)
            encoder_hidden = encoder_hidden[0,:,:]+encoder_hidden[1,:,:]
            attn_weights = Attention(encoder_output,encoder_hidden,batch_size)# new addition batch_size as argument
            attn_weights = attn_weights.unsqueeze(2)
           #print("attn" , attn_weights.shape)
            
            context = torch.sum(attn_weights*encoder_output,dim =2) # context vector
            
            input_step = SOS_token* torch.ones(batch_size,dtype= torch.long,requires_grad=False)
            last_hidden = init_hidden_dec()
            sequence_b = sequence_length_B
            
            n=0
           # torch.autograd.set_detect_anomaly(True)
           # print("sequence_b",sequence_b)
            for i in range(sequence_b):
                
                check = torch.zeros(batch_size,vocab_size,dtype=torch.float,requires_grad=False)
                
                index =0
                with torch.no_grad():

                    for j in target[:,i]:
                        check[index, j] = 1.0
                        index +=1

                output_decoder, decoder_hidden = Decode(input_step, context, last_hidden)
                
                output_decoder = output_decoder.squeeze(1)
               
                loss = criterion(output_decoder, check)
                #print(loss)
                lossn += loss
               
                if n == 20:

                    print(loss)
                n+=1
                input_step = output_decoder
              
                _, input_step = input_step.topk(1)
            
              #  input_step = input_step.squeeze(1)
                input_step = input_step.squeeze(1)
            

                last_hidden = decoder_hidden
                last_hidden = last_hidden.squeeze(0)
                attn_weights = Attention(encoder_output,last_hidden,batch_size)
            
                attn_weights = attn_weights.unsqueeze(2)
                context = torch.sum(attn_weights*encoder_output,dim = 2) # context vector
                last_hidden = last_hidden.unsqueeze(0)
            lossn.backward()

               
           
            nn.utils.clip_grad_norm_(Encode.parameters(), 50)
            nn.utils.clip_grad_norm_(Attention.parameters(), 50)
            nn.utils.clip_grad_norm_(Decode.parameters(),50)

            decoder_optim.step()
            atten_optim.step()
            encoder_optim.step()
                
                

 

            

            progress_bar.set_description(f'Loss: {lossn.item():.3f}')

            losses.append(lossn.item())
            total +=1 
            

        epoch_loss = sum(losses)/total
        train_losses.append(epoch_loss)


        tqdm.write(f'Epoch #{epoch+1} \t Train Loss: {epoch_loss:3f}')




train()
            

            

    






  attention_weights = F.softmax(fc2)


tensor(1.6980, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4187, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.5070, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.7352, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.6659, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.5335, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.6955, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4595, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.6027, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.7443, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.5369, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.3175, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.6031, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.5770, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.6489, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.3313, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.3836, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.3528, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4397, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.1998, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4953, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.2718, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4108, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4978, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.2174, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4425, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.1746, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4726, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4335, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4133, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4089, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.3990, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.3671, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.2687, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.2567, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.4075, grad_fn=<DivBackward1>)







[A[A[A[A[A




[A[A[A[A[A

tensor(1.3019, grad_fn=<DivBackward1>)







[A[A[A[A[A




                                                               



[A[A[A[A                                                   





[A[A[A[A[A[A                                           


[A[A[A                                                      
[A                                                            

[A[A                                  




Loss: 89.697:  97%|█████████▋| 37/38 [4:36:26<00:49, 49.38s/it]


[A[A[A




[A[A[A[A[A

[A[A
[A



[A[A[A[A

Epoch #1 	 Train Loss: 92.477068


In [38]:
torch.save(Encode, 'encoder.pth')
torch.save(Decode, 'decoder.pth')
torch.save(Attention,'attention.pth')

In [31]:
def init_hidden_eval():
    return torch.randn(n_layers*2,1, hidden_size).to(device)

def init_hidden_dec_eval():
    return torch.randn(n_layers,1, hidden_size).to(device)

In [32]:
Encode=torch.load('encoder.pth')
Decode=torch.load('decoder.pth')
Attention=torch.load('attention.pth')

In [33]:
batch_size =1
with torch.no_grad():
    hidden = init_hidden_eval()
    seq = torch.tensor(seq)
    
    
    #seq = seq.squeeze(2)
    seq = seq.squeeze(0)

    seq = seq.unsqueeze(0)
    

    encoder_output , encoder_hidden = Encode(seq, sequence_length,hidden)
    encoder_hidden = encoder_hidden[0,:,:]+encoder_hidden[1,:,:]
    print(encoder_output.shape)
    
    attn_weights = Attention(encoder_output, encoder_hidden,batch_size)
    context = torch.sum(attn_weights*encoder_output, dim=2 )
    input_step = SOS_token * torch.ones(1,dtype=torch.long) 
    last_hidden = init_hidden_dec_eval()
    sequence_b = sequence_length_B
    print("context",context.shape)
    for i in range(sequence_b):
        output_decoder, decoder_hidden = Decode(input_step, context, last_hidden)
        
        output = F.softmax(output_decoder, dim =1)
        #print("output",output)
        output = output.squeeze(0)

        outputi = torch.argmax(output)
        output = output[outputi]
        #print(outputi)
        output_decoder = output_decoder.squeeze(1)
        input_step = output_decoder
        _,input_step = input_step.topk(1)
        input_step = input_step.squeeze(1)
        last_hidden = decoder_hidden
        last_hidden = last_hidden.squeeze(0)
        attn_weights = Attention(encoder_output, last_hidden,batch_size)
        attn_weights = attn_weights.unsqueeze(1)
        context = torch.sum(attn_weights*encoder_output, dim =2)
        last_hidden = last_hidden.unsqueeze(0)
        key = [k for k, v in  features.items() if v==outputi]
        print(key)
        
        

    

        

        


    
    

torch.Size([1, 50, 100])


  attention_weights = F.softmax(fc2)


RuntimeError: The size of tensor a (50) must match the size of tensor b (100) at non-singleton dimension 2