In [33]:
import torch
import torch.nn as nn
import numpy as np
import os
import gensim
from gensim.utils import simple_preprocess

In [34]:
import nltk
from nltk import sent_tokenize
# Fetch the 'punkt_tab' resource up front. Presumably this is the sentence
# model that sent_tokenize loads — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\cyborg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [35]:
# Build `story`: one list of lower-cased word tokens per sentence, across every
# file found directly inside the 'data' directory.
story = []
# os.listdir makes no promise about ordering, so sort the names to read the
# corpus in the same order on every machine.
for filename in sorted(os.listdir('data')):
    path = os.path.join('data', filename)
    # Sub-directories (for example Jupyter's .ipynb_checkpoints) cannot be
    # opened as text files, so skip anything that is not a regular file.
    if not os.path.isfile(path):
        continue
    # latin-1 maps every byte to a character, so reading never raises on odd
    # bytes. NOTE(review): confirm this is the files' real encoding.
    with open(path, encoding='latin-1') as f:
        corpus = f.read()
    raw_sent = sent_tokenize(corpus)
    story.extend(simple_preprocess(sent) for sent in raw_sent)

In [36]:
# Number of tokenised sentences collected into `story`.
len(story)

145020

In [37]:
# Word2Vec hyperparameters, gathered in one place so they are easy to tune.
# Everything not listed here is left at the gensim default.
word2vec_params = {
    "window": 10,
    "min_count": 2,
    "workers": 4,
}
model = gensim.models.Word2Vec(**word2vec_params)

In [38]:
# Build the model vocabulary from the tokenised sentences; this must happen
# before training because the model was constructed without a corpus.
model.build_vocab(story)

In [39]:
# Train on the same sentences, reusing the sentence count recorded on the model
# (corpus_count) and the model's configured number of epochs.
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(6569624, 8628190)

In [40]:
# Sanity check of the learned vectors: nearest neighbours of 'daenerys'.
model.wv.most_similar('daenerys')

[('stormborn', 0.8237999677658081),
 ('unburnt', 0.7762250900268555),
 ('targaryen', 0.7697181105613708),
 ('queen', 0.6973780393600464),
 ('elia', 0.6915681958198547),
 ('myrcella', 0.6823866963386536),
 ('princess', 0.6806121468544006),
 ('margaery', 0.6693328619003296),
 ('viserys', 0.6624032258987427),
 ('khal', 0.6531725525856018)]

In [41]:
class Head(nn.Module):
    """Single unmasked self-attention head.

    The input is projected with three bias-free linear layers (query, key,
    value). Every position is scored against every other position with a
    scaled dot product, and the output is the softmax-weighted sum of values.

    Args:
        num_features: Size of the last dimension of the input. The projections
            keep this size, so the output has the same shape as the input.
    """

    def __init__(self, num_features):
        super().__init__()

        self.wq = nn.Linear(num_features, num_features, bias=False)
        self.wk = nn.Linear(num_features, num_features, bias=False)
        self.wv = nn.Linear(num_features, num_features, bias=False)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(tokens, num_features)`` or, with any number
                of leading batch dimensions, ``(..., tokens, num_features)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        # Scale by the feature size taken from the last axis. Swap only the
        # last two axes of the keys. For 2-D input both are identical to the
        # previous `x.shape[1]` / `k.T`, but they stay correct when leading
        # batch dimensions are present.
        scale = q.shape[-1] ** -0.5
        energy = torch.matmul(q, k.transpose(-2, -1)) * scale

        # No causal mask is applied: each position attends to all positions.
        attention = torch.softmax(energy, dim=-1)

        return torch.matmul(attention, v)

In [42]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run on the same input, then mixed linearly.

    Each head produces ``num_features`` values per token. The head outputs are
    concatenated along the last axis and a bias-free linear layer maps the
    ``num_heads * num_features`` values back down to ``num_features``.

    Args:
        num_heads: How many ``Head`` modules to create.
        num_features: Feature size of the input and of the output.
    """

    def __init__(self, num_heads, num_features):
        super().__init__()

        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(num_features=num_features))
        self.heads = nn.ModuleList(head_list)

        concat_width = num_features * num_heads
        self.wo = nn.Linear(concat_width, num_features, bias=False)

    def forward(self, x):
        """Return the mixed output of all heads; same shape as ``x``."""
        per_head = [head(x) for head in self.heads]
        joined = torch.cat(per_head, dim=-1)
        return self.wo(joined)


In [43]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear.

    Args:
        num_features: Size of the last dimension of the input and output.
        hidden_features: Width of the hidden layer. Defaults to 2048, the
            value that was previously hard-coded, so existing callers are
            unaffected.
    """

    def __init__(self, num_features, hidden_features=2048):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(num_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_features),
        )

    def forward(self, x):
        """Apply the MLP along the last axis; the output shape equals ``x``'s."""
        return self.model(x)

In [44]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to a token matrix.

    Row ``pos`` of the table holds ``sin(pos / 10000 ** (2i / d_model))`` in
    column ``2i`` and the matching cosine in column ``2i + 1``, for
    ``i`` in ``range(d_model // 2)``. With an odd ``d_model`` the last column
    stays zero.

    Args:
        d_model: Number of feature columns.
        total_tokens: Number of positions (rows) in the table.
    """

    def __init__(self, d_model, total_tokens):
        super().__init__()

        self.d_model = d_model
        self.total_tokens = total_tokens

        # The table depends only on the constructor arguments, so build it
        # once here with vectorised ops instead of re-running a Python double
        # loop on every forward call.
        half = d_model // 2
        positions = torch.arange(total_tokens, dtype=torch.float64).unsqueeze(1)
        exponents = 2.0 * torch.arange(half, dtype=torch.float64) / d_model
        # Angles are computed in float64 and cast down when stored.
        theta = positions / torch.pow(10000.0, exponents)

        pe = torch.zeros((total_tokens, d_model))
        pe[:, 0:2 * half:2] = torch.sin(theta).to(pe.dtype)
        # Odd columns get the cosine. The previous index `2 * + 1` evaluated
        # to 2, so no odd column was ever filled.
        pe[:, 1:2 * half:2] = torch.cos(theta).to(pe.dtype)

        # A buffer follows the module across .to(device) calls. It is
        # non-persistent so state_dict keys stay exactly as before.
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x):
        """Return ``x`` plus the position table.

        Args:
            x: Tensor whose trailing dimensions broadcast against
                ``(total_tokens, d_model)``.

        Returns:
            ``x + pe``; the input is not modified.
        """
        return x + self.pe

In [45]:
class Block(nn.Module):
    """One transformer block: self-attention then feed-forward, each residual.

    Both sub-layers use the pre-norm arrangement: the input is layer-normalised,
    passed through the sub-layer, and the result is added to the unnormalised
    input.

    Args:
        num_heads: Number of attention heads.
        num_features: Feature size of the input and output.
        total_tokens: Passed to the positional-encoding module created here.
        x: Stored as ``token_embedding``; not read by ``forward``.
    """

    def __init__(self, num_heads, num_features, total_tokens, x):
        super().__init__()

        # `token_embedding` and `pe` are not used by forward(); they are kept
        # so the attributes existing code may reference are still present.
        self.token_embedding = x
        self.mha = MultiHeadAttention(num_heads, num_features)
        self.pe = PositionalEncoding(num_features, total_tokens)
        self.ffwd = FeedForward(num_features)
        self.ln1 = nn.LayerNorm(num_features)
        self.ln2 = nn.LayerNorm(num_features)

    def forward(self, x):
        """Run attention and feed-forward with residual connections.

        Args:
            x: Tensor whose last dimension is ``num_features``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = x + self.mha(self.ln1(x))
        # Normalise the feed-forward *input*, mirroring the attention line
        # above. Previously ln2 was applied to the feed-forward output, which
        # mixed two different normalisation conventions inside one block.
        x = x + self.ffwd(self.ln2(x))

        return x

In [46]:
# x = torch.tensor(model.wv.get_normed_vectors())

# block = Block(2, x.shape[1], x.shape[0], x)
# x = block(x)

In [47]:
class Encoder(nn.Module):
    """Positional encoding followed by a stack of two ``Block`` modules.

    Args:
        num_features: Feature size of the input and output.
        total_tokens: Number of positions in the positional-encoding table.
        num_heads: Number of attention heads in each block.
        x: Forwarded to each ``Block`` constructor.
    """

    def __init__(self, num_features, total_tokens, num_heads, x):
        super().__init__()

        self.pe = PositionalEncoding(d_model=num_features, total_tokens=total_tokens)
        # Not used by forward(). It is kept so the module's parameters, and
        # the order in which random initial weights are drawn, stay the same.
        self.ffwd = FeedForward(num_features)
        self.blocks = nn.Sequential(
            Block(num_heads, num_features, total_tokens, x=x),
            Block(num_heads, num_features, total_tokens, x=x)
        )

    def forward(self, x):
        """Encode ``x``.

        Args:
            x: Tensor of shape ``(total_tokens, num_features)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # PositionalEncoding.forward already returns `x + table`, so adding
        # `x` again here (as before) doubled the token embeddings.
        x = self.pe(x)
        x = self.blocks(x)

        return x

In [48]:
# Use the L2-normalised word vectors as the encoder input: one row per
# vocabulary entry, one column per embedding dimension.
embeddings = torch.tensor(model.wv.get_normed_vectors())
vocab_size, embed_dim = embeddings.shape

encoder = Encoder(
    num_features=embed_dim,
    total_tokens=vocab_size,
    num_heads=2,
    x=embeddings,
)

# `x` holds the encoder output from here on.
x = encoder(embeddings)

In [49]:
# NOTE(review): threshold=inf turns off summarisation for every tensor printed
# later in this kernel, not just in this cell; only the shape is printed below.
torch.set_printoptions(threshold=float('inf'))
print(x.shape)

torch.Size([17453, 100])
