In [64]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
import spacy
import pandas as pd
import re
import string
from time import time

In [65]:
df = pd.read_csv('imdb_master.csv', encoding="latin-1")
df.shape

(499, 5)

In [66]:
df = df.drop(['type', 'label', 'file', 'Unnamed: 0'], axis=1)
# df = df.drop(['raw_character_text'], axis=1)

In [67]:
df.head()

Unnamed: 0,review
0,Once again Mr. Costner has dragged out a movie...
1,This is an example of why the majority of acti...
2,"First of all I hate those moronic rappers, who..."
3,Not even the Beatles could write songs everyon...
4,Brass pictures (movies is not a fitting word f...


In [68]:
df.isnull().sum()

review    0
dtype: int64

In [69]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

review    0
dtype: int64

In [70]:
nlp = spacy.load('en_core_web_sm')

def cleaning(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

In [71]:
brief_cleaning = (str(row).lower() for row in df['review'])

In [72]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

In [73]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(499, 1)

In [74]:
all_txt = ""

for row in df_clean['clean']:
    all_txt = all_txt + row

In [75]:
class CBOW(nn.Module):
    def __init__(self, context_size=2, embedding_size=256, vocab_size=None):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, embedding_size*2)
        self.linear2 = nn.Linear(embedding_size*2, vocab_size)
        # self.linear3 = nn.Linear(embedding_size*4, embedding_size*8)
        # self.linear4 = nn.Linear(embedding_size*8, vocab_size)

    def forward(self, inputs):
        lookup_embeds = self.embeddings(inputs)
        embeds = lookup_embeds.sum(dim=0)
        out = nn.LeakyReLU()(self.linear1(embeds))
        out = nn.LeakyReLU()(self.linear2(out))
        # out = nn.LeakyReLU()(self.linear3(out))
        # out = nn.LeakyReLU()(self.linear4(out))
        out = F.log_softmax(out, dim=0)
        return out

In [76]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs).to('cuda')
    return autograd.Variable(tensor)

In [77]:
CONTEXT_SIZE = 1
EMBEDDING_SIZE = 256
# raw_text = all_txt.split()
raw_text = re.findall( r'\w+|[^\s\w]+', all_txt)

vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
print(word_to_ix.keys())

word_to_ix_ = {}

i = 0
for row in word_to_ix:
    word_to_ix_[row.translate(str.maketrans("", "", string.punctuation))] = i
    i += 1

word_to_ix = word_to_ix_

print(word_to_ix.keys())

data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 1].translate(str.maketrans("", "", string.punctuation)), raw_text[i + 1].translate(str.maketrans("", "", string.punctuation))]
    target = raw_text[i]
    data.append((context, target))

loss_func = nn.CrossEntropyLoss()
net = CBOW(CONTEXT_SIZE, embedding_size=EMBEDDING_SIZE, vocab_size=vocab_size).to('cuda')
optimizer = optim.SGD(net.parameters(), lr=0.005)

for epoch in range(30):
    total_loss = 0
    for context, target in data:
        context_ = context

        i = 0
        for val in context:
            context_[i] = val.translate(str.maketrans("", "", string.punctuation))
            i += 1

        context_var = make_context_vector(context, word_to_ix)
        net.zero_grad()
        log_probs = net(context_var)

        loss = loss_func(log_probs.view(1,-1), autograd.Variable(
            torch.LongTensor([word_to_ix[target.translate(str.maketrans("", "", string.punctuation))]]).to('cuda')
        ))

        loss.backward()
        optimizer.step()

        total_loss += loss.data
    print(total_loss)

tensor(454452.3750, device='cuda:0')


KeyboardInterrupt: 

In [None]:
list(word_to_ix.keys()).__len__()

8703

In [None]:
context = ['i', 'hungry']
print(word_to_ix)
context_var = make_context_vector(context, word_to_ix)
print(context_var)

log_probs = net(context_var)

key_list = list(word_to_ix.keys())
val_list = list(word_to_ix.values())

print(log_probs.argmax())

print(key_list[val_list.index(log_probs.argmax())])

tensor([7230, 2357], device='cuda:0')
tensor(7659, device='cuda:0')
try
