# Treinando Movie Plos from Wikipedia

## Bibliotecas

In [34]:
import pandas as pd

import nltk
from nltk.corpus import stopwords

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader

from sklearn.metrics.pairwise import cosine_similarity

In [16]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Base de dados

In [2]:
df = pd.read_csv("Dados/wiki_movie_plots_deduped.csv")
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [3]:
corpus = df["Plot"]
corpus.head()

0    A bartender is working at a saloon, serving dr...
1    The moon, painted with a smiling face hangs ov...
2    The film, just over a minute long, is composed...
3    Lasting just 61 seconds and consisting of two ...
4    The earliest known adaptation of the classic f...
Name: Plot, dtype: object

### Remoção de stopwords e tokenização

In [4]:
stopwords = stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [6]:
corpus = list(corpus)

# plots = [nltk.word_tokenize(word.lower()) for word in corpus if not word.lower() in stopwords]
plots = [nltk.word_tokenize(word.lower()) for word in corpus[:5] if not word.lower() in stopwords]
plots[0]

['a',
 'bartender',
 'is',
 'working',
 'at',
 'a',
 'saloon',
 ',',
 'serving',
 'drinks',
 'to',
 'customers',
 '.',
 'after',
 'he',
 'fills',
 'a',
 'stereotypically',
 'irish',
 'man',
 "'s",
 'bucket',
 'with',
 'beer',
 ',',
 'carrie',
 'nation',
 'and',
 'her',
 'followers',
 'burst',
 'inside',
 '.',
 'they',
 'assault',
 'the',
 'irish',
 'man',
 ',',
 'pulling',
 'his',
 'hat',
 'over',
 'his',
 'eyes',
 'and',
 'then',
 'dumping',
 'the',
 'beer',
 'over',
 'his',
 'head',
 '.',
 'the',
 'group',
 'then',
 'begin',
 'wrecking',
 'the',
 'bar',
 ',',
 'smashing',
 'the',
 'fixtures',
 ',',
 'mirrors',
 ',',
 'and',
 'breaking',
 'the',
 'cash',
 'register',
 '.',
 'the',
 'bartender',
 'then',
 'sprays',
 'seltzer',
 'water',
 'in',
 'nation',
 "'s",
 'face',
 'before',
 'a',
 'group',
 'of',
 'policemen',
 'appear',
 'and',
 'order',
 'everybody',
 'to',
 'leave',
 '.',
 '[',
 '1',
 ']']

### Tamanho de treino e teste

In [7]:
size = int(len(plots) * 0.2)
treino = plots[size:]
teste = plots[:size]

len(treino), len(teste)

(4, 1)

## CBOW - Continuos Bag-of-words
Rede neural proposta para o treinamento de word embeddings livre de contexto. O modelo CBOW aprende representações vetoriais ao ser treinado na tarefa de prever um determinado token-alvo dado os tokens ao seu redor.

Vamos começar definindo o vocabulário para o qual treinaremos representações vetoriais:

In [8]:
def get_vocab(texts):

    vocab = []

    for row in texts:
        vocab.extend(row)

    vocab = list(set(vocab))
    vocab += ['<pad>', '<oov>'] # padding & out of vocabulary

    w2id = { w:i for i, w in enumerate(vocab) } # word -> id
    id2w = { i:w for i, w in enumerate(vocab) } # id -> word

    return vocab, w2id, id2w

In [9]:
vocab, w2id, id2w = get_vocab(treino)
print('Número de palavras: ', len(vocab))

Número de palavras:  247


Vamos definir então os exemplos de treinamento. Como mencionado acima, as redes neurais CBOW aprendem os word embeddings ao serem treinadas na tarefa de prever um token dado um contexto de tokens ao redor. Portanto, devemos construir instâncias compostas de um token e seus tokens vizinhos de acordo com um tamanho de janela ( window size ). Neste caso, vamos definir uma janela de tamanho 2, isto é, vamos construir um contexto compostos pelos 2 tokens anteriores e os 2 tokens posteriores de um token alvo:

In [10]:
def context_window(tokens, size = 3):

    tokens = (['<pad>'] * size) + tokens + (['<pad>'] * size)

    contexts = []

    for i in range(size, len(tokens) - size):
        context = tokens[i-size : i] + tokens[i+1 : i+size+1]
        word = tokens[i]
        contexts.append({ 'context': ' '.join(context), 'word': word })

    return contexts

In [11]:
data = []
window_size = 3

for plot in treino:
    data.extend(context_window(plot, size = window_size))

In [12]:
data[10]

{'context': 'face hangs over park at night', 'word': 'a'}

In [13]:
data[100]

{'context': '<pad> <pad> the , just over', 'word': 'film'}

Vamos definir o modelo utilizando Pytorch:

In [18]:
class CBOW(nn.Module):

    def __init__(self, inp_dim, nvocab, window_size, w2id, device):
        '''
        Inicializando uma rede neural CBOW

        Parâmetros:
        - inp_dim: dimensão dos embeddings
        - nvocab: tamanho do vocabulário de palavras para as quais treinaremos word embeddings
        - window_size: tamanho da janela de contexto
        - w2id: mapping de um token para seu índice na matriz de embeddings
        - device: dispositivo onde a rede neural será alocada (cpu ou cuda)
        '''
        super(CBOW, self).__init__()

        self.device = device
        self.w2id = w2id
        self.lookup = nn.Embedding(nvocab, inp_dim)
        self.weight = nn.Linear(2 * window_size * inp_dim, nvocab)
        self.softmax = nn.LogSoftmax(1)

    def forward(self, X):
        '''
        Forward pass

        Parâmetros:
        - X: lista de contextos de entrada

        Return:
        Probabilidade entre as palavras do vocabulário
        '''
        contexts = []

        for context in X:
            indexes = []

            for token in context.split():
                try:
                    indexes.append(w2id[token])
                except:
                    indexes.append(w2id['<oov>']) # out of vocabulary
            
            contexts.append(indexes)

        contexts = torch.tensor(contexts).to(self.device)
        embeddings = self.lookup(contexts)

        batch_size, window_size, inp_dim = embeddings.size()
        concatenation = embeddings.view(batch_size, window_size * inp_dim)

        z = self.weight(concatenation)

        return self.softmax(z)

Definindo os parâmetros da rede neural e de treinamento

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inp_dim = 300
nvocab = len(vocab)
nepochs = 1
batch_size = 1 # 256
batch_status = 1 # 256
learning_rate = 0.01
window_size = window_size

Inicializando o modelo, a função de erro e o otimizador

In [27]:
model = CBOW(
    inp_dim,
    nvocab,
    window_size = window_size,
    w2id = w2id,
    device = device
).to(device)

criterion = nn.NLLLoss()

optimizer = optim.Adam(model.parameters(), lr = learning_rate)

Separando o conjunto de treinamento em lotes ( batches )

In [28]:
train_data = DataLoader(data, batch_size = batch_size, shuffle = True)

Treinando

In [29]:
for epoch in range(nepochs):

    losses = []

    for batch_index, row in enumerate(train_data):
        X = row['context']
        y = []

        for word in row['word']:
            try:
                y.append(w2id[word])
            except:
                y.append(w2id['oov'])

        y = torch.tensor(y).to(device)

        # Forward
        outputs = model(X)

        # Loss
        loss = criterion(outputs, y)
        losses.append(float(loss))

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Display
        if(batch_index + 1) % batch_status == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tTotal Loss: {:.6f}'.format(
                epoch + 1, batch_index + 1, len(train_data),
                100. * batch_index / len(train_data), float(loss), 
                round(sum(losses) / len(losses), 5))
            )



In [33]:
embeddings = model.lookup.weight.data.cpu().numpy()

w2emb = { w: emb for (w, emb) in zip(vocab, list(embeddings)) }

In [53]:
lookup_word = 'assassin'
similarities = cosine_similarity([w2emb[lookup_word]], embeddings)[0]

candidates = sorted([(vocab[i], sim) for i, sim in enumerate(similarities)], 
    key = lambda x: x[1], reverse = True)[:10]

for candidate in candidates:
    print(candidate)

('assassin', 1.0)
('chases', 0.15653107)
('ascends', 0.1514222)
('mother', 0.13588268)
('assassination', 0.12911765)
('vice-president', 0.12196292)
('center', 0.12073536)
('railing', 0.11702977)
('young', 0.114941105)
('or', 0.114193514)


## Skip-gram
Arquitetura proposta junto com a CBOW. Contudo, a skip-gram aprende representações vetoriais ao ser treinada prevendo os tokens ao redor de um token alvo.

Como já definimos o vocabulário anteriormente, vamos criar os exemplos para o treinamento da arquitetura. Note que agora daremos o token alvo como entrada e orientaremos a rede neural a prever as palavras ao redor: