In [234]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
import spacy
import pandas as pd
import re
import string
from time import time

In [235]:
df = pd.read_csv('imdb_master.csv', encoding="latin-1")
df.shape

(499, 5)

In [236]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [237]:
df.isnull().sum()

Unnamed: 0    0
type          0
review        0
label         0
file          0
dtype: int64

In [238]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

Unnamed: 0    0
type          0
review        0
label         0
file          0
dtype: int64

In [239]:
nlp = spacy.load('en_core_web_sm')

def cleaning(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

In [240]:
brief_cleaning = (str(row).lower() for row in df['review'])

In [241]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=128)]

In [242]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(499, 1)

In [243]:
all_txt = ""

for row in df_clean['clean']:
    all_txt = all_txt + row

In [244]:
class CBOW(nn.Module):
    def __init__(self, context_size=2, embedding_size=256, vocab_size=None):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, vocab_size)

    def forward(self, inputs):
        lookup_embeds = self.embeddings(inputs)
        embeds = lookup_embeds.sum(dim=0)
        out = self.linear1(embeds)
        out = F.log_softmax(out, dim=0)
        return out

In [245]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs).to('cuda')
    return autograd.Variable(tensor)

In [246]:
CONTEXT_SIZE = 1
EMBEDDING_SIZE = 256
# raw_text = all_txt.split()
raw_text = re.findall( r'\w+|[^\s\w]+', all_txt)

vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
print(word_to_ix.keys())

word_to_ix_ = {}

i = 0
for row in word_to_ix:
    word = row.translate(str.maketrans("", "", string.punctuation))
    if word != '':
        word_to_ix_[word] = i
    else:
        i -= 1
    i += 1

word_to_ix = word_to_ix_

print(word_to_ix.keys())

data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 1].translate(str.maketrans("", "", string.punctuation)), raw_text[i + 1].translate(str.maketrans("", "", string.punctuation))]
    target = raw_text[i]
    data.append((context, target))

loss_func = nn.CrossEntropyLoss()
net = CBOW(CONTEXT_SIZE, embedding_size=EMBEDDING_SIZE, vocab_size=vocab_size).to('cuda')
optimizer = optim.SGD(net.parameters(), lr=0.001) #optimize lr

for epoch in range(30):
    total_loss = 0
    iterations = 0
    for context, target in data:

        i = 0
        prev_word = "blank"
        for val in context:
            word = val.translate(str.maketrans("", "", string.punctuation))
            if word != '':
                context[i] = word
                prev_word = word
            else:
                if i == 0:
                    if context[0] == context[1] == '':
                        context[0] = 'blank'
                        prev_word = 'blank'
                    else:
                        i -= 1
                else:
                    context[i] = prev_word
            i += 1

        word = target.translate(str.maketrans("", "", string.punctuation))
        if word != '':
            target = word
        else:
            if i == 0:
                i -= 1
            else:
                target = prev_word

        context_var = make_context_vector(context, word_to_ix)
        net.zero_grad()
        log_probs = net(context_var)

        loss = loss_func(log_probs.view(1,-1), autograd.Variable(
            torch.LongTensor([word_to_ix[target.translate(str.maketrans("", "", string.punctuation))]]).to('cuda')
        ))

        loss.backward()
        optimizer.step()

        iterations += 1
        total_loss += loss.data
    print(total_loss / iterations)

tensor([4798, 4798], device='cuda:0')
tensor([4530, 7050], device='cuda:0')
tensor([4798, 4954], device='cuda:0')
tensor([7050, 4742], device='cuda:0')
tensor([4954,  365], device='cuda:0')
tensor([4742, 4742], device='cuda:0')
tensor([ 365, 6118], device='cuda:0')
tensor([7023, 7023], device='cuda:0')
tensor([6118,   66], device='cuda:0')
tensor([7023, 6607], device='cuda:0')
tensor([  66, 4216], device='cuda:0')
tensor([6607, 6607], device='cuda:0')
tensor([4216, 4737], device='cuda:0')
tensor([7451, 7451], device='cuda:0')
tensor([4737, 4737], device='cuda:0')
tensor([7451, 6071], device='cuda:0')
tensor([3195, 3195], device='cuda:0')
tensor([6071, 6071], device='cuda:0')
tensor([3195, 4530], device='cuda:0')
tensor([7451, 7451], device='cuda:0')
tensor([4530,  574], device='cuda:0')
tensor([7451, 7892], device='cuda:0')
tensor([574, 574], device='cuda:0')
tensor([7892, 2904], device='cuda:0')
tensor([2661, 2661], device='cuda:0')
tensor([2904, 2904], device='cuda:0')
tensor([2661, 

KeyboardInterrupt: 

In [None]:
list(word_to_ix.keys()).__len__()

8702

In [None]:
context = ['why', 'apology']
print(word_to_ix)
context_var = make_context_vector(context, word_to_ix)
print(context_var)

log_probs = net(context_var)

key_list = list(word_to_ix.keys())
val_list = list(word_to_ix.values())

print(log_probs.argmax())

print(key_list[val_list.index(log_probs.argmax())])

tensor([1419,   10], device='cuda:0')
tensor(1419, device='cuda:0')
why
