## Търсене и извличане на информация. Приложение на дълбоко машинно обучение
> ### Стоян Михов
> #### Зимен семестър 2021/2022

### Упражнение 13

 За да работи програмата трябва корпуса от публицистични текстове за Югоизточна Европа,
 да се намира разархивиран в директорията, в която е програмата (виж упражнение 2).

 Преди да се стартира програмата е необходимо да се активира съответното обкръжение с командата: `conda activate tii`

In [1]:
import sys
import nltk
from nltk.corpus import PlaintextCorpusReader
import numpy as np
import torch
import random
import math

######  Визуализация на прогреса

In [2]:
class progressBar:
    def __init__(self ,barWidth = 50):
        self.barWidth = barWidth
        self.period = None
    def start(self, count):
        self.item=0
        self.period = int(count / self.barWidth)
        sys.stdout.write("["+(" " * self.barWidth)+"]")
        sys.stdout.flush()
        sys.stdout.write("\b" * (self.barWidth+1))
    def tick(self):
        if self.item>0 and self.item % self.period == 0:
            sys.stdout.write("-")
            sys.stdout.flush()
        self.item += 1
    def stop(self):
        sys.stdout.write("]\n")

In [3]:
def extractDictionary(corpus, limit=20000):
    pb = progressBar()
    pb.start(len(corpus))
    dictionary = {}
    for doc in corpus:
        pb.tick()
        for w in doc:
            if w not in dictionary: dictionary[w] = 0
        dictionary[w] += 1
    L = sorted([(w,dictionary[w]) for w in dictionary], key = lambda x: x[1] , reverse=True)
    if limit > len(L): limit = len(L)
    words = [ w for w,_ in L[:limit] ] + [unkToken] + [padToken]
    word2ind = { w:i for i,w in enumerate(words)}
    pb.stop()
    return words, word2ind

In [4]:
def splitSentCorpus(fullSentCorpus, testFraction = 0.1):
    random.seed(42)
    random.shuffle(fullSentCorpus)
    testCount = int(len(fullSentCorpus) * testFraction)
    testSentCorpus = fullSentCorpus[:testCount]
    trainSentCorpus = fullSentCorpus[testCount:]
    return testSentCorpus, trainSentCorpus

######   Зареждане на корпуса

In [5]:
corpus_root = 'JOURNALISM.BG/C-MassMedia'
myCorpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
startToken = '<s>'
endToken = '</s>'
unkToken = '<unk>'
padToken = '<pad>'

In [6]:
corpus = [ [startToken] + [w.lower() for w in sent] + [endToken] for sent in myCorpus.sents()]
words, word2ind = extractDictionary(corpus)

[                                                  --------------------------------------------------]


In [7]:
testCorpus, trainCorpus  = splitSentCorpus(corpus, testFraction = 0.01)

In [8]:
batchSize = 32
emb_size = 50
hid_size = 100

In [None]:
!nvidia-smi

In [10]:
#device = torch.device("cpu")
device = torch.device("cuda:0")
#device = torch.device("cuda:1")

##### LSTM с пакетиране на партида

In [11]:
class LSTMLanguageModelPack(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, word2ind, unkToken, padToken):
        super(LSTMLanguageModelPack, self).__init__()
        self.word2ind = word2ind
        self.unkTokenIdx = word2ind[unkToken]
        self.padTokenIdx = word2ind[padToken]
        self.lstm = torch.nn.LSTM(embed_size, hidden_size)
        self.embed = torch.nn.Embedding(len(word2ind), embed_size)
        self.projection = torch.nn.Linear(hidden_size,len(word2ind))
    
    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        sents = [[self.word2ind.get(w,self.unkTokenIdx) for w in s] for s in source]
        sents_padded = [ s+(m-len(s))*[self.padTokenIdx] for s in sents]
        return torch.t(torch.tensor(sents_padded, dtype=torch.long, device=device)) # (w,s)
    
    def forward(self, source):
        X = self.preparePaddedBatch(source) # (w,s)
        E = self.embed(X[:-1]) # (w,s,e) # cntg
        source_lengths = [len(s)-1 for s in source]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths,enforce_sorted=False))
        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked) # (w,s,h)

        Z = self.projection(output.flatten(0,1)) # (w*s,h)
        Y_bar = X[1:].flatten(0,1) # (w*s)
        H = torch.nn.functional.cross_entropy(Z,Y_bar,ignore_index=self.padTokenIdx)
        return H

In [12]:
lm = LSTMLanguageModelPack(emb_size, hid_size, word2ind, unkToken, padToken).to(device) # i n
optimizer = torch.optim.Adam(lm.parameters(), lr=0.01)

In [13]:
idx = np.arange(len(trainCorpus), dtype='int32')
np.random.shuffle(idx)

In [None]:
for b in range(0, len(idx), batchSize):
    batch = [ trainCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
    H = lm(batch)
    optimizer.zero_grad()
    H.backward()
    optimizer.step()
    if b % 10 == 0:
        print(b, '/', len(idx), H.item())

In [15]:
def perplexity(lm, testCorpus, batchSize):
    H = 0.
    c = 0
    for b in range(0,len(testCorpus),batchSize):
        batch = testCorpus[b:min(b+batchSize, len(testCorpus))]
        l = sum(len(s)-1 for s in batch)
        c += l
        with torch.no_grad():
            H += l * lm(batch)
    return math.exp(H/c)

In [16]:
perplexity(lm, testCorpus, batchSize)

33.608086011322

######  Двупосочен LSTM с пакетиране на партида

In [17]:
class BiLSTMLanguageModelPack(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, word2ind, unkToken, padToken, endToken):
        super(BiLSTMLanguageModelPack, self).__init__()
        self.word2ind = word2ind
        self.unkTokenIdx = word2ind[unkToken]
        self.padTokenIdx = word2ind[padToken]
        self.endTokenIdx = word2ind[endToken]
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.embed = torch.nn.Embedding(len(word2ind), embed_size)
        self.projection = torch.nn.Linear(2*hidden_size,len(word2ind))

    def preparePaddedBatch(self, source): #
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        sents = [[self.word2ind.get(w,self.unkTokenIdx) for w in s] for s in source]
        sents_padded = [ s+(m-len(s))*[self.padTokenIdx] for s in sents]
        return torch.t(torch.tensor(sents_padded, dtype=torch.long, device=device))

    def forward(self, source):
        batch_size = len(source)
        X = self.preparePaddedBatch(source) # (w,s)
        E = self.embed(X) # (w,s,e)
        
        source_lengths = [len(s) for s in source]
        m = X.shape[0]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths,enforce_sorted=False)) #bd
        
        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked) # (w,s,2h) # d
        output = output.view(m, batch_size, 2, self.hidden_size) # (w,s,2,h) # d
        t = torch.cat((output[:-2,:,0,:], output[2:,:,1,:]),2) # (w,s,2h) # d
        Z = self.projection(t.flatten(0,1)) # (w*s,2h)

        Y_bar = X[1:-1].flatten(0,1)
        Y_bar[Y_bar==self.endTokenIdx] = self.padTokenIdx
        H = torch.nn.functional.cross_entropy(Z,Y_bar,ignore_index=self.padTokenIdx)
        return H

In [18]:
blm = BiLSTMLanguageModelPack(emb_size, hid_size, word2ind, unkToken, padToken, endToken).to(device)
optimizer = torch.optim.Adam(blm.parameters(), lr=0.01)

In [19]:
idx = np.arange(len(trainCorpus), dtype='int32')
np.random.shuffle(idx)

In [None]:
for b in range(0, len(idx), batchSize):
    batch = [ trainCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
    H = blm(batch)
    optimizer.zero_grad()
    H.backward()
    optimizer.step()
    if b % 10 == 0:
        print(b, '/', len(idx), H.item())

In [21]:
def perplexity(blm, testCorpus, batchSize):
    H = 0.
    c = 0
    for b in range(0,len(testCorpus),batchSize):
        batch = testCorpus[b:min(b+batchSize, len(testCorpus))]
        l = sum(len(s)-2 for s in batch)
        c += l
        with torch.no_grad():
            H += l * blm(batch)
    return math.exp(H/c)

In [22]:
perplexity(blm, testCorpus, batchSize)

11.285752269193237

###### LSTM класификатор на документи

In [23]:
class LSTMClassifier(torch.nn.Module):
    def __init__(self, langModel, classesCount):
        super(LSTMClassifier, self).__init__()
        self.langModel = langModel
        self.classProjection = torch.nn.Linear(langModel.lstm.hidden_size,classesCount)
    
    def forward(self, source):
        X = self.langModel.preparePaddedBatch(source)
        E = self.langModel.embed(X[:-1])
        source_lengths = [len(s)-1 for s in source]
        _, (h,_) = self.langModel.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths,enforce_sorted=False))
        
        Z = self.classProjection(torch.squeeze(h,dim=0)) # (s,h) -> (s,c)
        return Z

In [24]:
fileNames = myCorpus.fileids()

In [25]:
ecoCorpus = [ [startToken] + [w.lower() for w in myCorpus.words(f)] + [endToken] for f in fileNames if f.find('E-Economy'+'/')==0 ]
milCorpus = [ [startToken] + [w.lower() for w in myCorpus.words(f)] + [endToken] for f in fileNames if f.find('S-Military'+'/')==0 ]
polCorpus = [ [startToken] + [w.lower() for w in myCorpus.words(f)] + [endToken] for f in fileNames if f.find('J-Politics'+'/')==0 ]
culCorpus = [ [startToken] + [w.lower() for w in myCorpus.words(f)] + [endToken] for f in fileNames if f.find('C-Culture'+'/')==0 ]

In [26]:
testEcoCorpus, trainEcoCorpus = splitSentCorpus(ecoCorpus)
testMilCorpus, trainMilCorpus = splitSentCorpus(milCorpus)
testPolCorpus, trainPolCorpus = splitSentCorpus(polCorpus)
testCulCorpus, trainCulCorpus = splitSentCorpus(culCorpus)

In [27]:
trainClassCorpus = trainEcoCorpus + trainMilCorpus + trainPolCorpus + trainCulCorpus

In [28]:
trainY = np.concatenate((
                         np.ones(len(trainEcoCorpus),dtype='int32')*0,
                         np.ones(len(trainMilCorpus),dtype='int32')*1,
                         np.ones(len(trainPolCorpus),dtype='int32')*2,
                         np.ones(len(trainCulCorpus),dtype='int32')*3
                         ))

In [29]:
testY = np.concatenate((
                        np.ones(len(testEcoCorpus),dtype='int32')*0,
                        np.ones(len(testMilCorpus),dtype='int32')*1,
                        np.ones(len(testPolCorpus),dtype='int32')*2,
                        np.ones(len(testCulCorpus),dtype='int32')*3
                        ))

In [30]:
idx = np.arange(len(trainClassCorpus), dtype='int32')

In [31]:
classModel = LSTMClassifier(lm,4).to(device)
optimizer = torch.optim.Adam(classModel.parameters(), lr=0.01)

In [None]:
np.random.shuffle(idx)
for b in range(0, len(idx), batchSize):
    batch = [ trainClassCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
    target = torch.tensor(trainY[idx[b:min(b+batchSize, len(idx))]], dtype = torch.long, device = device)

    Z = classModel(batch)
    H = torch.nn.functional.cross_entropy(Z,target)

    optimizer.zero_grad()
    H.backward()
    optimizer.step()
    if b % 10 == 0:
        print(b, '/', len(idx), H.item())

In [33]:
testClassCorpus = [ testEcoCorpus, testMilCorpus, testPolCorpus, testCulCorpus ]

In [34]:
def gamma(s):
    with torch.no_grad():
        Z = classModel([s])
        return torch.argmax(Z[0]).item()

In [35]:
def testClassifier(testClassCorpus, gamma):
    L = [ len(c) for c in testClassCorpus ]
    pb = progressBar(50)
    pb.start(sum(L))
    classesCount = len(testClassCorpus)
    confusionMatrix = [ [0] * classesCount for _ in range(classesCount) ]
    for c in range(classesCount):
        for text in testClassCorpus[c]:
            pb.tick()
            c_MAP = gamma(text)
            confusionMatrix[c][c_MAP] += 1
    pb.stop()
    precision = []
    recall = []
    Fscore = []
    for c in range(classesCount):
        extracted = sum(confusionMatrix[x][c] for x in range(classesCount))
        if confusionMatrix[c][c] == 0:
            precision.append(0.0)
            recall.append(0.0)
            Fscore.append(0.0)
        else:
            precision.append( confusionMatrix[c][c] / extracted )
            recall.append( confusionMatrix[c][c] / L[c] )
            Fscore.append((2.0 * precision[c] * recall[c]) / (precision[c] + recall[c]))
    P = sum( L[c] * precision[c] / sum(L) for c in range(classesCount) )
    R = sum( L[c] * recall[c] / sum(L) for c in range(classesCount) )
    F1 = (2*P*R) / (P + R)
    print('=================================================================')
    print('Матрица на обърквания: ')
    for row in confusionMatrix:
        for val in row:
            print('{:4}'.format(val), end = '')
        print()
    print('Прецизност: '+str(precision))
    print('Обхват: '+str(recall))
    print('F-оценка: '+str(Fscore))
    print('Обща презизност: '+str(P))
    print('Общ обхват: '+str(R))
    print('Обща F-оценка: '+str(F1))
    print('=================================================================')
    print()

In [36]:
testClassifier(testClassCorpus, gamma)

[                                                  --------------------------------------------------]
Матрица на обърквания: 
   1  18  46   0
   1 144  11   2
   0  29 705   1
   0   1   2  42
Прецизност: [0.5, 0.75, 0.9227748691099477, 0.9333333333333333]
Обхват: [0.015384615384615385, 0.9113924050632911, 0.9591836734693877, 0.9333333333333333]
F-оценка: [0.029850746268656723, 0.8228571428571428, 0.9406270847231488, 0.9333333333333333]
Обща презизност: 0.8686336279120754
Общ обхват: 0.8893320039880359
Обща F-оценка: 0.8788609640877645



###### Двупосочен LSTM класификатор на документи

In [37]:
class BiLSTMClassifier(torch.nn.Module):
    def __init__(self, langModel, classesCount):
        super(BiLSTMClassifier, self).__init__()
        self.langModel = langModel
        self.classProjection = torch.nn.Linear(2*langModel.hidden_size,classesCount)
    
    def forward(self, source):
        batch_size = len(source)
        X = self.langModel.preparePaddedBatch(source)
        E = self.langModel.embed(X)
        source_lengths = [len(s) for s in source]
        _, (h,c) = self.langModel.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths,enforce_sorted=False))
        h = h.view(2,batch_size,self.langModel.hidden_size)
        
        Z = self.classProjection(torch.cat([h[0],h[1]],1)) # (batch_size,2*self.langModel.hidden_size) -> (batch_size,4=|C|)
        return Z

In [38]:
classModelB = BiLSTMClassifier(blm,4).to(device)
optimizer = torch.optim.Adam(classModelB.parameters(), lr=0.01)

In [None]:
idx = np.arange(len(trainClassCorpus), dtype='int32')
np.random.shuffle(idx)
for b in range(0, len(idx), batchSize):
    batch = [ trainClassCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
    target = torch.tensor(trainY[idx[b:min(b+batchSize, len(idx))]], dtype = torch.long, device = device)
    
    Z = classModelB(batch)
    H = torch.nn.functional.cross_entropy(Z,target)
    
    optimizer.zero_grad()
    H.backward()
    optimizer.step()
    if b % 10 == 0:
        print(b, '/', len(idx), H.item())

In [40]:
def gamma(s):
    with torch.no_grad():
        Z = classModelB([s])
        return torch.argmax(Z[0]).item()

In [41]:
testClassifier(testClassCorpus, gamma)

[                                                  --------------------------------------------------]
Матрица на обърквания: 
  40   3  21   1
  15 135   8   0
   5   5 725   0
   5   0   0  40
Прецизност: [0.6153846153846154, 0.9440559440559441, 0.9615384615384616, 0.975609756097561]
Обхват: [0.6153846153846154, 0.8544303797468354, 0.9863945578231292, 0.8888888888888888]
F-оценка: [0.6153846153846154, 0.8970099667774087, 0.9738079247817326, 0.9302325581395349]
Обща презизност: 0.9369830981216337
Общ обхват: 0.9371884346959122
Обща F-оценка: 0.9370857551603071



###### Конволюционен класификатор на документи

In [42]:
class ConvolutionClassifier(torch.nn.Module):
    def __init__(self, embed, filterSize, filterCount, classesCount, word2ind, unkToken, padToken):
        super(ConvolutionClassifier, self).__init__()
        self.embed = embed
        self.word2ind = word2ind
        self.unkTokenIdx = word2ind[unkToken]
        self.padTokenIdx = word2ind[padToken]
        self.convolution = torch.nn.Conv1d(in_channels=embed.embedding_dim, out_channels=filterCount, kernel_size=filterSize)
        self.dropout = torch.nn.Dropout(0.5)
        self.classProjection = torch.nn.Linear(filterCount,classesCount)
    
    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        sents = [[self.word2ind.get(w,self.unkTokenIdx) for w in s] for s in source]
        sents_padded = [ s+(m-len(s))*[self.padTokenIdx] for s in sents]
        return torch.tensor(sents_padded, dtype=torch.long, device=device) # (batch_size,max_sent_len,embed_size)
    
    def forward(self, source):
        X = self.preparePaddedBatch(source)
        
        E = torch.transpose(self.embed(X),1,2) # (s,e,w)
        ### Очаква се Е да е тензор с размер (batch_size, embed_size, max_sent_len)

        U,_ = torch.max(torch.relu(self.convolution(E)), dim=2) # (s,oc,w) # d -> (s,oc)
        Z = self.classProjection(self.dropout(U))
        return Z

In [43]:
EMB = lm.embed

In [44]:
classModelE = ConvolutionClassifier(EMB, 7, 400, 4, word2ind, unkToken, padToken).to(device)
optimizer = torch.optim.Adam(classModelE.parameters(), lr=0.01, weight_decay=0.0002)

In [None]:
idx = np.arange(len(trainClassCorpus), dtype='int32')
classModelE.train() # d
for epoch in range(10):
    np.random.shuffle(idx)
    for b in range(0, len(idx), batchSize):
        batch = [ trainClassCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
        target = torch.tensor(trainY[idx[b:min(b+batchSize, len(idx))]], dtype = torch.long, device = device)
    
        Z = classModelE(batch)
        H = torch.nn.functional.cross_entropy(Z,target)
    
        optimizer.zero_grad()
        H.backward()
        optimizer.step()
        if b % 10 == 0:
            print(b, '/', len(idx), H.item())
classModelE.eval()
testClassifier(testClassCorpus, gamma)

In [46]:
def gamma(s):
    with torch.no_grad():
        Z = classModelE([s])
        return torch.argmax(Z[0]).item()

In [47]:
testClassifier(testClassCorpus, gamma)

[                                                  --------------------------------------------------]
Матрица на обърквания: 
  45   0  20   0
   0 146  12   0
   0   3 732   0
   0   0   2  43
Прецизност: [1.0, 0.9798657718120806, 0.9556135770234987, 1.0]
Обхват: [0.6923076923076923, 0.9240506329113924, 0.9959183673469387, 0.9555555555555556]
F-оценка: [0.8181818181818181, 0.9511400651465798, 0.9753497668221185, 0.9772727272727273]
Обща презизност: 0.9643018654621938
Общ обхват: 0.9631106679960121
Обща F-оценка: 0.9637058986316202



###### LSTM с посимволово влагане с КНН и пакетиране на партида

In [48]:
class CharEmbedding(torch.nn.Module):
    def __init__(self, word2ind, char_embed_size, word_embed_size, filter_size=5, dropoutrate=0.3, padding=1):
        super(CharEmbedding, self).__init__()
        self.word2ind = word2ind
        self.char_embed_size = char_embed_size
        self.word_embed_size = word_embed_size
        self.filter_size = filter_size
        self.dropoutrate = dropoutrate
        self.padding = padding

        alphabetSet = {c for w in word2ind for c in w}
        alphabet = ['§','`','~','№']+list(alphabetSet)
        self.char2id = {c:i for i, c in enumerate(alphabet) }
        self.char_pad = self.char2id['§']
        self.start_of_word = self.char2id['`']
        self.end_of_word = self.char2id['~']
        self.char_unk = self.char2id['№']

        self.CharEmbedding = torch.nn.Embedding(len(self.char2id),self.char_embed_size, padding_idx = self.char_pad)
        self.conv = torch.nn.Conv1d(char_embed_size, word_embed_size, filter_size, padding=padding)
        self.highway_proj = torch.nn.Linear(word_embed_size,word_embed_size)
        self.highway_gate = torch.nn.Linear(word_embed_size,word_embed_size)

        self.Dropout = torch.nn.Dropout(dropoutrate)

    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        source_ids = [[ [self.start_of_word] + [self.char2id.get(c, self.char_unk) for c in w ] + [self.end_of_word] for w in s] for s in source]

        max_word_length = max(len(w) for s in source_ids for w in s )
        max_sent_len = max(len(s) for s in source_ids)
    
        sents_padded = []
        for sentence in source_ids:
            sent_padded = [ w + [self.char_pad]*(max_word_length-len(w)) for w in sentence ] + [[self.char_pad]*max_word_length] * (max_sent_len - len(sentence))
            sents_padded.append(sent_padded)

        return torch.transpose(torch.tensor(sents_padded, dtype=torch.long, device=device),0,1).contiguous()

    def forward(self, source):
        batch_size = len(source) # (s,w,c)
        X = self.preparePaddedBatch(source) # (w,s,c)
        X_emb = self.CharEmbedding(X).transpose(2,3) # (w,s,ce,c)

        x_conv = self.conv(X_emb.flatten(0,1)) # (w*s,we,c)
        x_conv_out0,_ = torch.max(torch.nn.functional.relu(x_conv),dim=2) # (w*s,we)
        x_conv_out = x_conv_out0.view((-1,batch_size,self.word_embed_size)) # (w,s,we)

        x_proj = torch.nn.functional.relu(self.highway_proj(x_conv_out)) # (w,s,we)
        x_gate = torch.sigmoid(self.highway_gate(x_conv_out)) # (w,s,we)
        x_highway = x_gate * x_proj + (1 - x_gate) * x_conv_out # (w,s,we)

        output = self.Dropout(x_highway)
        return output

In [49]:
class CharCNNLSTMLanguageModelPack(torch.nn.Module):
    def __init__(self, word_embed_size, hidden_size, word2ind, unkToken, padToken, char_embed_size, filter_size=5, dropoutrate=0.3, padding=1):
        super(CharCNNLSTMLanguageModelPack, self).__init__()
        self.word2ind = word2ind
        self.unkTokenIdx = word2ind[unkToken]
        self.padTokenIdx = word2ind[padToken]

        self.charEmbedding = CharEmbedding(word2ind, char_embed_size, word_embed_size, filter_size, dropoutrate, padding)
        self.lstm = torch.nn.LSTM(word_embed_size, hidden_size)
        self.projection = torch.nn.Linear(hidden_size,len(word2ind))
    
    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        sents = [[self.word2ind.get(w,self.unkTokenIdx) for w in s] for s in source]
        sents_padded = [ s+(m-len(s))*[self.padTokenIdx] for s in sents]
        return torch.t(torch.tensor(sents_padded, dtype=torch.long, device=device))
    
    def forward(self, source):
        X = self.preparePaddedBatch(source)
        E = self.charEmbedding(source)
        source_lengths = [len(s)-1 for s in source]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths,enforce_sorted=False))
        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked)
        
        Z = self.projection(output.flatten(0,1))
        Y_bar = X[1:].flatten(0,1)
        H = torch.nn.functional.cross_entropy(Z,Y_bar,ignore_index=self.padTokenIdx)
        return H

In [50]:
clm = CharCNNLSTMLanguageModelPack(256, 256, word2ind, unkToken, padToken, 32).to(device)
optimizer = torch.optim.Adam(clm.parameters(), lr=0.001)

In [51]:
idx = np.arange(len(trainCorpus), dtype='int32')
np.random.shuffle(idx)

In [None]:
trainCorpus

In [None]:
clm.train()
for b in range(0, len(idx), batchSize):
    batch = [ trainCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
    H = clm(batch)
    optimizer.zero_grad()
    H.backward()
    optimizer.step()
    if b % 10 == 0:
        print(b, '/', len(idx), H.item())
clm.eval()
perplexity(clm, testCorpus, batchSize)

######  Двупосочен LSTM с пакетиране на партида

In [54]:
class CharCNNBiLSTMLanguageModelPack(torch.nn.Module):
    def __init__(self, word_embed_size, hidden_size, word2ind, unkToken, padToken, endToken, char_embed_size, filter_size=5, dropoutrate=0.3, padding=1):
        super(CharCNNBiLSTMLanguageModelPack, self).__init__()
        self.word2ind = word2ind
        self.unkTokenIdx = word2ind[unkToken]
        self.padTokenIdx = word2ind[padToken]
        self.endTokenIdx = word2ind[endToken]
        self.hidden_size = hidden_size

        self.charEmbedding = CharEmbedding(word2ind, char_embed_size, word_embed_size, filter_size, dropoutrate, padding)
        self.lstm = torch.nn.LSTM(word_embed_size, hidden_size, bidirectional=True)
        self.projection = torch.nn.Linear(2*hidden_size,len(word2ind))
    
    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        sents = [[self.word2ind.get(w,self.unkTokenIdx) for w in s] for s in source]
        sents_padded = [ s+(m-len(s))*[self.padTokenIdx] for s in sents]
        return torch.t(torch.tensor(sents_padded, dtype=torch.long, device=device))
    
    def forward(self, source):
        batch_size = len(source)
        X = self.preparePaddedBatch(source)
        E = self.charEmbedding(source)

        source_lengths = [len(s) for s in source]
        m = X.shape[0]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths,enforce_sorted=False))
        
        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked)
        output = output.view(m, batch_size, 2, self.hidden_size)
        t = torch.cat((output[:-2,:,0,:], output[2:,:,1,:]),2)
        Z = self.projection(t.flatten(0,1))
        
        Y_bar = X[1:-1].flatten(0,1)
        Y_bar[Y_bar==self.endTokenIdx] = self.padTokenIdx
        H = torch.nn.functional.cross_entropy(Z,Y_bar,ignore_index=self.padTokenIdx)
        return H

In [55]:
cblm = CharCNNBiLSTMLanguageModelPack(256, 256, word2ind, unkToken, padToken, endToken, 32).to(device)
optimizer = torch.optim.Adam(cblm.parameters(), lr=0.001)

In [56]:
idx = np.arange(len(trainCorpus), dtype='int32')
np.random.shuffle(idx)

In [None]:
cblm.train()
for b in range(0, len(idx), batchSize):
    batch = [ trainCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
    H = cblm(batch)
    optimizer.zero_grad()
    H.backward()
    optimizer.step()
    if b % 10 == 0:
        print(b, '/', len(idx), H.item())

In [58]:
perplexity(cblm, testCorpus, batchSize)

10.162559473913722