In [2]:
import os
import re
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sentencepiece as spm
import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.autograd import Variable

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
# (Assign torch.device("cpu") here instead to force CPU execution.)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
class Corpus(object):
    """Word-level text corpus that serves aligned train/validation minibatches.

    The file at ``path`` is lower-cased and split into word and punctuation
    tokens, with an explicit ``'\\n'`` token closing every line.  The token-id
    stream is cut into ``batch_size + val_size`` equally long rows: the first
    ``batch_size`` rows are used for training and the remaining ``val_size``
    rows for validation.

    Attributes set by ``prep_data``:
        subwords_itos: list mapping id -> token (4 reserved tokens first, then
            corpus tokens ordered by decreasing frequency).
        subwords_stoi: defaultdict mapping token -> id; unseen tokens map to 0
            (``'_unk_'``).
        full_data: LongTensor of shape ``(batch_size + val_size, tokens_per_row)``.
    """

    def __init__(self, path='data/HPBooks/resultant.txt', batch_size=64, val_size=10):
        """Store the configuration and immediately tokenise the file at ``path``."""
        self.path = path
        self.batch_size = batch_size
        self.val_size = val_size
        self.total_batch_size = batch_size + val_size
        # Column offset in full_data where the next minibatch starts.
        self.minibatch_index = 0

        self.prep_data()

    @staticmethod
    def _tokenize(line):
        """Split one line into lower-cased word/punctuation tokens plus a '\\n' token."""
        # rstrip('\n') rather than line[:-1]: the last line of a file may have no
        # trailing newline, in which case [:-1] would drop a real character.
        return re.findall(r"\w+|[^\w\s]", line.rstrip('\n').lower(), re.UNICODE) + ['\n']

    def prep_data(self):
        """Read the corpus, build the vocabulary and populate ``full_data``.

        Raises:
            ValueError: if the corpus holds fewer tokens than
                ``batch_size + val_size`` (no complete column can be formed).
        """
        # Single pass over the file: keep the token stream in memory so the
        # text does not have to be read and tokenised a second time.
        tokens = []
        with open(self.path, 'r') as f:
            for line in f:
                tokens.extend(self._tokenize(line))

        subword_counter = Counter(tokens)

        # Reserved tokens occupy ids 0-3; corpus tokens follow by decreasing count.
        self.subwords_itos = ['_unk_', '_pad_', '_eos_', '_bos_'] + sorted(
            subword_counter, key=subword_counter.get, reverse=True)
        self.subwords_stoi = defaultdict(
            lambda: 0, {k: i for i, k in enumerate(self.subwords_itos)})

        ids = torch.tensor([self.subwords_stoi[tok] for tok in tokens], dtype=torch.long)

        # Tokens per row; the tail that does not fill a whole column is dropped.
        num_batches = ids.size(0) // self.total_batch_size
        if num_batches == 0:
            raise ValueError(
                'corpus at {!r} has {} tokens, fewer than batch_size + val_size = {}'.format(
                    self.path, ids.size(0), self.total_batch_size))
        ids = ids[:num_batches * self.total_batch_size]
        self.full_data = ids.view(self.total_batch_size, num_batches)

    def get_minibatch(self, bptt=120):
        """Return the next ``bptt``-token window for training and validation.

        Targets are the inputs shifted one token to the right.  When the next
        window would run past the end of the rows, the cursor wraps to 0.

        Args:
            bptt: number of tokens per sequence in the returned window.

        Returns:
            Tuple ``(x, y, x_val, y_val)`` with shapes ``(batch_size, bptt)``
            for the first two and ``(val_size, bptt)`` for the last two.

        Raises:
            ValueError: if a row is too short to supply ``bptt + 1`` tokens, in
                which case inputs and targets could not be aligned.
        """
        row_len = self.full_data.size(1)
        if bptt + 1 > row_len:
            raise ValueError(
                'bptt={} needs {} tokens per row but only {} are available'.format(
                    bptt, bptt + 1, row_len))
        if self.minibatch_index + bptt + 1 > row_len:
            self.minibatch_index = 0

        start = self.minibatch_index
        stop = start + bptt
        self.last_mbatch_x = self.full_data[:self.batch_size, start:stop]
        self.last_mbatch_y = self.full_data[:self.batch_size, start + 1:stop + 1]
        self.last_mbatch_x_val = self.full_data[self.batch_size:, start:stop]
        self.last_mbatch_y_val = self.full_data[self.batch_size:, start + 1:stop + 1]
        self.minibatch_index += bptt
        return (self.last_mbatch_x, self.last_mbatch_y,
                self.last_mbatch_x_val, self.last_mbatch_y_val)

In [5]:
# Build the corpus with its arguments spelled out (these are the constructor
# defaults); batch_size and val_size must match the values used when training.
corpus = Corpus(path='data/HPBooks/resultant.txt', batch_size=64, val_size=10)

In [6]:
# Number of entries in the id -> token table; used as both the embedding
# input size and the output layer width of the language model.
vocab_size = len(corpus.subwords_itos)
vocab_size

14839

In [7]:
class LangModel(nn.Module):
    """Embedding -> dropout -> single-layer bidirectional GRU -> linear vocabulary scorer.

    Args:
        input_size: vocabulary size (number of embeddings and of output logits).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: GRU hidden size per direction.
        dp_prob: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embedding_size, hidden_size, dp_prob):
        super(LangModel, self).__init__()

        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.dropout_prob = dp_prob

        self.dropout = nn.Dropout(dp_prob)
        self.emb_layer = nn.Embedding(input_size, embedding_size)
        # NOTE(review): the GRU is bidirectional, so when this model is trained
        # on next-token targets the backward direction at position t has already
        # read token t+1 -- confirm this is intended.
        self.rnn = nn.GRU(embedding_size, hidden_size, bidirectional=True)
        # Forward and backward states are concatenated, hence 2 * hidden_size.
        self.Linear = nn.Linear(2 * hidden_size, input_size)

    def forward(self, input_sentence, init_hidden_state):
        """Score every vocabulary entry at every position.

        Args:
            input_sentence: LongTensor of token ids, shape (seq_len, batch_size).
            init_hidden_state: initial GRU state, shape (2, batch_size, hidden_size).

        Returns:
            Logits of shape (seq_len * batch_size, input_size).
        """
        # emb: (seq_len, batch_size, embedding_size)
        emb = self.dropout(self.emb_layer(input_sentence))

        # output: (seq_len, batch_size, 2 * hidden_size); the final state is unused.
        output, _ = self.rnn(emb, init_hidden_state)

        # Use the model's own hidden size rather than a notebook-level global,
        # so the module works wherever it is instantiated.
        return self.Linear(output.reshape(-1, 2 * self.hidden_size))

In [8]:
def train_minibatch(inputs, targets, hidden_size, mini_batch_size, model, model_optimizer, criterion, device=device):
    """Run one optimisation step on a single minibatch and return its loss.

    Args:
        inputs: token ids passed to ``model`` as its input sequence.
        targets: token ids the model should predict; flattened before scoring.
        hidden_size: per-direction hidden size used for the initial state.
        mini_batch_size: batch dimension of the initial hidden state.
        model: network called as ``model(inputs, hidden_state)``.
        model_optimizer: optimiser updating ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, flat_targets)``.
        device: device on which the initial hidden state is allocated.

    Returns:
        The minibatch loss as a Python float.
    """
    # Fresh all-zero initial state (leading 2 = one layer x two directions);
    # newly created zeros carry no autograd history.
    h0 = torch.zeros(2, mini_batch_size, hidden_size, device=device)

    model_optimizer.zero_grad()
    logits = model(inputs, h0)
    flat_targets = targets.reshape(-1)
    batch_loss = criterion(logits, flat_targets)
    batch_loss.backward()
    model_optimizer.step()
    return batch_loss.item()


In [9]:
def validate(inputs, targets, hidden_size, validation_size,criterion, device=device):
    """Compute the loss of the notebook-level ``model`` on a validation batch.

    The model is put into eval mode for the forward pass so dropout does not
    perturb the validation loss; its previous train/eval mode is restored
    afterwards.

    Args:
        inputs: token ids passed to ``model`` as its input sequence.
        targets: token ids to score against; flattened before scoring.
        hidden_size: per-direction hidden size used for the initial state.
        validation_size: batch dimension of the initial hidden state.
        criterion: loss called as ``criterion(outputs, flat_targets)``.
        device: device on which the initial hidden state is allocated.

    Returns:
        The loss as returned by ``criterion`` (a tensor, not a Python float).
    """
    # NOTE: `model` is read from the notebook's global scope, not passed in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden_state = torch.zeros(2, validation_size, hidden_size, device=device)
            outputs = model(inputs, hidden_state)
            return criterion(outputs, targets.reshape(-1))
    finally:
        # Hand the model back in whatever mode the caller had it.
        model.train(was_training)

In [10]:
# Pick the compute device, then build the network, loss and optimiser.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LangModel(
    input_size=vocab_size,
    embedding_size=300,
    hidden_size=512,
    dp_prob=0.2,
).to(device)
criterion = nn.CrossEntropyLoss()
# Adam with its default settings over every model parameter.
model_optimizer = optim.Adam(model.parameters())

In [11]:
# Sizes used when building initial hidden states in the training loop.
hidden_size = 512
mini_batch_size, val_size = 64, 10
# Per-window summed training / validation losses, appended by the training loop.
tl, vl = [], []

In [12]:
# Run configuration for this training cell.
num_steps = 10000                      # total optimisation steps
report_every = 100                     # steps per logging / checkpoint window
checkpoint_path = 'models/LangModel.pth'
# torch.save does not create missing directories, so make sure it exists
# before the first checkpoint is written.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

batch_training_loss=0
batch_validation_loss=0
# Best summed validation loss seen so far; infinity so the first window always saves.
val_loss_benchmark=float('inf')
# Make sure dropout is active even if an earlier cell left the model in eval mode.
model.train()
for i in range(num_steps):
    train_input, train_target, val_input, val_target=corpus.get_minibatch()
    # The corpus yields (batch, seq) tensors; the model is fed (seq, batch).
    # .to(device) honours the CPU fallback instead of requiring CUDA.
    train_input=train_input.to(device).permute(1,0)
    train_target=train_target.to(device).permute(1,0)
    val_input=val_input.to(device).permute(1,0)
    val_target=val_target.to(device).permute(1,0)
    train_loss=train_minibatch(train_input, train_target, hidden_size, mini_batch_size, model, model_optimizer, criterion)
    val_loss=validate(val_input, val_target, hidden_size, val_size,criterion)
    batch_training_loss+=train_loss
    # float() keeps the running sum (and the entries of `vl`) as plain Python
    # numbers rather than device tensors.
    batch_validation_loss+=float(val_loss)
    if (i+1)%report_every==0:
        tl.append(batch_training_loss)
        vl.append(batch_validation_loss)
        print ('Step: {}/{} | Training Loss: {} | Validation Loss: {}'.format((i+1)/report_every, num_steps//report_every, batch_training_loss, batch_validation_loss))
        # Checkpoint whenever the windowed validation loss does not get worse.
        if (batch_validation_loss<=val_loss_benchmark):
            print ('%---Saving the model---%')
            torch.save({
                'model_state_dict': model.state_dict(),
                'model_optimizer_state_dict': model_optimizer.state_dict(),
                },checkpoint_path)
            val_loss_benchmark=batch_validation_loss
        batch_training_loss=0
        batch_validation_loss=0
    

Step: 1.0/100 | Training Loss: 337.9098275899887 | Validation Loss: 362.18280029296875
%---Saving the model---%
Step: 2.0/100 | Training Loss: 103.01831305027008 | Validation Loss: 132.1798095703125
%---Saving the model---%
Step: 3.0/100 | Training Loss: 37.729195564985275 | Validation Loss: 71.0864486694336
%---Saving the model---%
Step: 4.0/100 | Training Loss: 15.374464884400368 | Validation Loss: 54.47996139526367
%---Saving the model---%
Step: 5.0/100 | Training Loss: 7.290750823915005 | Validation Loss: 46.559383392333984
%---Saving the model---%
Step: 6.0/100 | Training Loss: 4.80288577824831 | Validation Loss: 45.715476989746094
%---Saving the model---%
Step: 7.0/100 | Training Loss: 3.819447632879019 | Validation Loss: 44.37186813354492
%---Saving the model---%
Step: 8.0/100 | Training Loss: 3.201359013095498 | Validation Loss: 43.390625
%---Saving the model---%
Step: 9.0/100 | Training Loss: 2.7470722012221813 | Validation Loss: 41.88705825805664
%---Saving the model---%
Step

Step: 87.0/100 | Training Loss: 0.0026814182620000793 | Validation Loss: 35.79955291748047
Step: 88.0/100 | Training Loss: 0.003089882311542169 | Validation Loss: 35.13523864746094
Step: 89.0/100 | Training Loss: 0.13051036297838436 | Validation Loss: 36.702144622802734
Step: 90.0/100 | Training Loss: 0.7139925598166883 | Validation Loss: 40.51657485961914
Step: 91.0/100 | Training Loss: 0.23780387663282454 | Validation Loss: 38.261783599853516
Step: 92.0/100 | Training Loss: 0.07293860662321094 | Validation Loss: 38.20721435546875
Step: 93.0/100 | Training Loss: 0.02848907030420378 | Validation Loss: 38.04297637939453
Step: 94.0/100 | Training Loss: 0.015421571748447604 | Validation Loss: 38.574913024902344
Step: 95.0/100 | Training Loss: 0.010865671189094428 | Validation Loss: 36.71501922607422
Step: 96.0/100 | Training Loss: 0.009134237188845873 | Validation Loss: 37.85733413696289
Step: 97.0/100 | Training Loss: 0.006865419178211596 | Validation Loss: 37.049808502197266
Step: 98.0/

In [13]:
# Restore the best checkpoint written by the training loop.  map_location
# remaps the stored tensors onto the current device, so a checkpoint saved on
# a GPU can also be loaded on a CPU-only machine.
checkpoint = torch.load('models/LangModel.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model_optimizer.load_state_dict(checkpoint['model_optimizer_state_dict'])
# Switch to inference mode (disables dropout) for the generation cells below.
model.eval()

LangModel(
  (dropout): Dropout(p=0.2)
  (emb_layer): Embedding(14839, 300)
  (rnn): GRU(300, 512, bidirectional=True)
  (Linear): Linear(in_features=1024, out_features=14839, bias=True)
)

In [42]:
# Generate 50 tokens after a prompt using top-k sampling.
sentence="The END. "
# Tokenise the prompt with the same pattern the corpus was built with.
temp_begin = re.findall(r"\w+|[^\w\s]", sentence.lower(), re.UNICODE)
# Token ids as a (seq_len, 1) column on the active device; unseen tokens map to 0.
temp = torch.tensor([corpus.subwords_stoi[k] for k in temp_begin],
                    dtype=torch.long, device=device).unsqueeze(1)
# All-zero initial state for a batch of one; it never changes, so build it once.
hidden_state = torch.zeros(2, 1, hidden_size, device=device)
with torch.no_grad():
    for i in range(50):
        # Re-score the whole sequence generated so far.
        outputs = model(temp, hidden_state)
        # Probabilities over the vocabulary at the last position.
        last_probs = F.softmax(outputs, dim=1)[-1]
        # Keep the 8 most likely tokens and sample one in proportion to its probability.
        top_probs, indices = torch.topk(last_probs, 8)
        choice = torch.multinomial(top_probs, 1)
        newOutput = indices[choice]          # shape (1,): sampled vocabulary id

        temp = torch.cat((temp, newOutput.unsqueeze(1)), dim=0)

In [43]:
# Map the generated id column back to its tokens, one list entry per position.
[corpus.subwords_itos[token_id] for token_id in temp.detach().cpu().reshape(-1).tolist()]

['the',
 'end',
 '.',
 '\\',
 '\n',
 '"',
 'why',
 'couldn',
 "'",
 't',
 'want',
 'any',
 'crap',
 '-',
 '-',
 'go',
 'quick',
 'step',
 '-',
 '-',
 '"',
 '\\',
 '\n',
 '"',
 'why',
 'are',
 'you',
 'okay',
 '-',
 'a',
 'bun',
 'bun',
 'thing',
 'from',
 '"',
 'yes',
 '?',
 '"',
 '\\',
 '\n',
 '"',
 'yes',
 '?',
 '"',
 'said',
 'harry',
 'curiously',
 'curiously',
 'curiously',
 'curiously',
 'curiously',
 'curiously',
 'curiously']

In [33]:
# Collect the model's arg-max next-token prediction for a fixed prompt, 1000 times.
# The Corpus class in this notebook has no `sp` (SentencePiece) attribute, so the
# prompt is tokenised with the same regex used to build the vocabulary.
# NOTE(review): if the model is in eval mode (dropout off) all 1000 entries
# will be identical -- confirm whether repeated sampling is still wanted.
prompt = "Daha once bu yonet"
temp_begin = re.findall(r"\w+|[^\w\s]", prompt.lower(), re.UNICODE)
# Token ids as a (seq_len, 1) column on the active device; unseen tokens map to 0.
temp = torch.tensor([corpus.subwords_stoi[k] for k in temp_begin],
                    dtype=torch.long, device=device).unsqueeze(1)
# The prompt and the zero initial state do not change, so build them once.
hidden_state = torch.zeros(2, 1, hidden_size, device=device)

a=[]
with torch.no_grad():
    for i in range(1000):
        outputs = model(temp, hidden_state)
        # Highest-scoring vocabulary entry at the last position.
        a.append(corpus.subwords_itos[int(outputs.max(1)[1][-1])])
