In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn
import gensim
import nltk
%matplotlib inline

In [2]:
# Read the raw movie metadata CSV into a DataFrame and preview its first rows.
data = pd.read_csv('data/movies_metadata.csv', low_memory=False)
data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
# Keep English-language movies only, then keep just the title/overview columns
# and drop rows where either is missing.
# Note: `data` is rebound in place, so this cell cannot be re-run without first
# re-running the load cell (the 'original_language' column is gone afterwards).
print('Initial number of movies: ', len(data))
data = data[data['original_language'] == 'en']
print('Number of movies in English: ', len(data))
data = data[['title', 'overview']].dropna()
print('Number of films in English with title and overview: ', len(data))

Initial number of movies:  45466
Number of movies in English:  32269
Number of films in English with title annd overview:  32198


In [4]:
data.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [None]:
from gensim.models.fasttext import FastText

# Load the wiki.en fastText binary from a path outside this repository; %time reports how long it takes.
# `ft_model` is needed by the metric cell below, so this cell has to run before it.
# NOTE(review): load_fasttext_format is deprecated in newer gensim releases in favour of
# load_facebook_model / load_facebook_vectors -- confirm against the installed gensim version.
%time ft_model = FastText.load_fasttext_format('../fastText/models/wiki.en/wiki.en.bin')

In [None]:
print(ft_model.__doc__)

### Metric
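$$\mathrm{quality}(\mathrm{title}_{true}, \mathrm{title}_{pred}) = \mathrm{cosine\_distance}\left(\overline{\mathrm{emb}}(\mathrm{title}_{true}),\ \overline{\mathrm{emb}}(\mathrm{title}_{pred})\right)$$

where $\overline{\mathrm{emb}}(t)$ is the mean fastText embedding of the words of title $t$. This is a distance, so lower is better.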

### seq2seq

In [7]:
from nltk.corpus import stopwords
from nltk.corpus import words

# Make sure the two NLTK corpora used below are available locally.
nltk.download('stopwords')
nltk.download('words')

# English stop words, kept as a set.
en_stopwords = set(stopwords.words('english'))
# Lower-cased English word list; used as the allowed vocabulary by tokenize() and metric().
en_words = set(word.lower() for word in words.words())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/femoiseev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/femoiseev/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [8]:
import string
import nltk
from scipy import spatial

nltk.download('punkt')

def get_mean_embeds(text, words, model=None, stopwords=()):
    """Return the mean embedding of the usable words of ``text``.

    The text is split with ``nltk.word_tokenize`` and lower-cased. A token
    contributes only if it is in ``words``, is not in ``stopwords`` and
    ``model.wv`` has a vector for it.

    Args:
        text: String to embed.
        words: Container of allowed (lower-case) tokens.
        model: Object exposing vectors through ``model.wv[token]``. ``None``
            (the default) means the notebook-level ``ft_model``, looked up at
            call time rather than when this cell is executed.
        stopwords: Container of tokens to ignore.

    Returns:
        The element-wise mean of the collected vectors, or ``np.nan`` when no
        token qualifies (the value ``np.mean`` gives for an empty list, minus
        the "Mean of empty slice" warning).
    """
    if model is None:
        model = ft_model
    embeds = []
    for token in nltk.word_tokenize(text):
        token = token.lower()
        if token in stopwords or token not in words:
            continue
        try:
            embeds.append(model.wv[token])
        except KeyError:
            # No vector for this token: leave it out of the mean.
            continue
    if not embeds:
        return np.nan
    return np.mean(embeds, axis=0)

def metric(title, pred, words, model=None, stopwords=()):
    """Cosine distance between mean embeddings of true and predicted titles.

    Args:
        title: A single title string, or an iterable of titles.
        pred: The prediction(s), in the same form and order as ``title``.
        words: Container of allowed tokens, forwarded to ``get_mean_embeds``.
        model: Embedding model forwarded to ``get_mean_embeds``. ``None``
            (the default) means the notebook-level ``ft_model``, looked up at
            call time.
        stopwords: Container of tokens to ignore, forwarded as well.

    Returns:
        For a single pair, ``scipy.spatial.distance.cosine`` of the two mean
        embeddings (lower is better). For iterables, the mean of the
        pairwise values.
    """
    if model is None:
        model = ft_model
    # isinstance (not type() ==) so that str subclasses such as numpy.str_
    # are scored as one title instead of being iterated character by character.
    if isinstance(title, str):
        title_embed = get_mean_embeds(title, words=words, model=model, stopwords=stopwords)
        pred_embed = get_mean_embeds(pred, words=words, model=model, stopwords=stopwords)
        return spatial.distance.cosine(title_embed, pred_embed)
    return np.mean([metric(t, p, words, model, stopwords) for t, p in zip(title, pred)])

[nltk_data] Downloading package punkt to /home/femoiseev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state=42 is passed so the split is repeatable.
# Note: `train` and `test` are rebound later in the notebook (to the torchtext
# dataset and to the re-read test TSV respectively).
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [10]:
# Overviews are the model inputs, titles are the targets.
X_train = train['overview'].values
X_test = test['overview'].values
y_train = train['title'].values
y_test = test['title'].values

In [11]:
def tokenize(text, words=en_words, stopwords=[]):
    """Split ``text`` into lower-cased tokens, keeping only allowed words.

    A token is kept when it is in ``words`` and not in ``stopwords``;
    the original token order is preserved.
    """
    lowered = (token.lower() for token in nltk.word_tokenize(text))
    return [token for token in lowered if token not in stopwords and token in words]

In [12]:
# Token count of every training overview after filtering; the cell displays the maximum.
%time tmp = [len(tokenize(x, words=en_words, stopwords=en_stopwords)) for x in X_train]
np.max(tmp)

CPU times: user 7.31 s, sys: 3.15 ms, total: 7.31 s
Wall time: 7.33 s


86

In [13]:
X_train[0]

'The federal agent Joe Dee Foster is currently investigating a serial killer, helped by doctor Animal who is isolated in a maximum security jail.'

In [14]:
tokenize(X_train[0], stopwords=en_stopwords)

['federal',
 'agent',
 'joe',
 'dee',
 'foster',
 'currently',
 'investigating',
 'serial',
 'killer',
 'doctor',
 'animal',
 'isolated',
 'maximum',
 'security',
 'jail']

In [15]:
def save_dataset(X, Y, path, words=en_words, stopwords=[]):
    """Write tokenized (X, Y) pairs to ``path``, one tab-separated pair per line.

    Both sides go through ``tokenize`` and are stored as space-joined tokens.
    A line is written for every pair, including pairs where one side ends up
    with no tokens.
    """
    with open(path, 'w') as fout:
        for source_text, target_text in zip(X, Y):
            source_line = " ".join(tokenize(source_text, words=words, stopwords=stopwords))
            target_line = " ".join(tokenize(target_text, words=words, stopwords=stopwords))
            fout.write(source_line + "\t" + target_line + '\n')

In [16]:
%time save_dataset(X_train, y_train, 'data/train_stopwords.tsv', en_words, en_stopwords)
%time save_dataset(X_test, y_test, 'data/test_stopwords.tsv', en_words, en_stopwords)

CPU times: user 8.65 s, sys: 3.12 ms, total: 8.65 s
Wall time: 8.65 s
CPU times: user 2.23 s, sys: 0 ns, total: 2.23 s
Wall time: 2.23 s


In [17]:
from seq2seq.dataset import SourceField, TargetField
import torchtext

def load_dataset(path, max_len=100, max_vocab_size=50000):
    """Load a tab-separated source/target file as a torchtext dataset.

    Args:
        path: TSV file; the first column is read as ``src``, the second as ``tgt``.
        max_len: Examples whose ``src`` or ``tgt`` has a ``len()`` above this
            are filtered out.
        max_vocab_size: ``max_size`` passed to ``build_vocab`` for both fields.

    Returns:
        ``(dataset, src, tgt)``: the ``TabularDataset`` and the two fields,
        each with a vocabulary built from that dataset.
    """
    src = SourceField()
    tgt = TargetField()

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    dataset = torchtext.data.TabularDataset(
        path=path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    src.build_vocab(dataset, max_size=max_vocab_size)
    tgt.build_vocab(dataset, max_size=max_vocab_size)
    return dataset, src, tgt

In [18]:
# Build the torchtext training set and its vocabularies.
# This rebinds `train`, which until now held the pandas training split.
train, src, tgt = load_dataset('data/train_stopwords.tsv')
input_vocab = src.vocab
output_vocab = tgt.vocab

In [19]:
len(output_vocab)

7995

In [20]:
import seq2seq
from seq2seq import *

In [24]:
from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq

# Baseline: single-layer, unidirectional GRU encoder/decoder without attention.
bidirectional = False
hidden_size = 128

encoder = EncoderRNN(vocab_size=len(src.vocab), max_len=100, hidden_size=hidden_size, rnn_cell='gru', bidirectional=bidirectional, n_layers=1)
# The decoder's hidden size is doubled when the encoder is bidirectional.
decoder = DecoderRNN(vocab_size=len(tgt.vocab), max_len=100, hidden_size=hidden_size * 2 if bidirectional else hidden_size, 
                                    rnn_cell='gru', bidirectional=bidirectional, n_layers=1, use_attention=False,
                                    eos_id=tgt.eos_id, sos_id=tgt.sos_id)
model = Seq2seq(encoder, decoder)
# Requires a CUDA device.
model.cuda()
# Re-draw every parameter uniformly from [-0.08, 0.08].
for param in model.parameters():
    param.data.uniform_(-0.08, 0.08)

In [29]:
from seq2seq.loss import NLLLoss
from seq2seq.trainer import SupervisedTrainer

# NLL loss moved to the GPU, and a trainer with batches of 64.
loss = NLLLoss()
loss.cuda()
# Every section of this notebook passes the same expt_dir ('seq2seq').
t = SupervisedTrainer(loss=loss, batch_size=64,
                          checkpoint_every=10000,
                          print_every=10000, expt_dir='seq2seq')
# Passed as-is to t.train() in the next cell; presumably the trainer then uses its own default optimizer -- confirm.
optimizer = None

In [30]:
%time model = t.train(model, train, num_epochs=50, optimizer=optimizer, teacher_forcing_ratio=0.5, resume=False)

  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)


CPU times: user 3min 40s, sys: 54.2 s, total: 4min 35s
Wall time: 4min 35s


In [35]:
predictor = seq2seq.evaluator.Predictor(model, input_vocab, output_vocab)

In [36]:
# Re-read the tokenized test split written by save_dataset (column 0: overview tokens, column 1: title tokens).
# This rebinds `test`; dropna() discards rows where either column is missing.
# NOTE(review): pandas' default NA parsing also turns literal tokens such as 'null' or 'nan'
# into NaN, so such rows would be dropped as well -- confirm whether keep_default_na=False is wanted.
test = pd.read_csv('data/test_stopwords.tsv', header=None, delimiter='\t').dropna()
test.head()

Unnamed: 0,0,1
0,little wonder bill eileen three grown lonely w...,wonderland
1,rare remarkable theatrical experience controve...,mother
3,storm forming work robert watt pioneer radar t...,sky
7,buster around blacksmith shop smithy get fight...,blacksmith
8,man away join paramilitary group teens fightin...,mercury


In [37]:
# Overviews become token lists (the form handed to predictor.predict below); titles stay space-joined strings.
X_test = [x.strip().split() for x in test[0]]
y_test = test[1].values

In [38]:
predictor.predict(X_test[550])

  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)


['piranha', '<eos>']

In [39]:
y_test[550]

'forget paris'

In [40]:
import tqdm

# Decode a title for every test overview.
preds = []
for x in tqdm.tqdm(X_test):
    tokens = predictor.predict(x)
    # Drop the end-of-sequence marker only when it is actually there: if
    # decoding stops without emitting '<eos>', a blind [:-1] would discard
    # the last real word.
    if tokens and tokens[-1] == '<eos>':
        tokens = tokens[:-1]
    preds.append(' '.join(tokens))

  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)
100%|██████████| 5588/5588 [06:20<00:00, 14.70it/s]


In [46]:
metric(y_test, preds, words=en_words)

0.7175242919855807

In [47]:
# Show the first ten test overviews next to their true and predicted titles.
print('Examples of work:')
print()

for x, y, z in zip(X_test[:10], y_test[:10], preds[:10]):
    print('Overview:')
    print(x)
    print()
    print('True title: {}'.format(y))
    print()
    print('Predicted title: {}'.format(z))
    print()

Examples of work:

Overview:
['little', 'wonder', 'bill', 'eileen', 'three', 'grown', 'lonely', 'waitress', 'personal', 'looking', 'love', 'debbie', 'single', 'men', 'hair', 'salon', 'son', 'part', 'weekend', 'ex', 'man', 'temper', 'molly', 'first', 'baby', 'father', 'acts', 'responsibility', 'much']

True title: wonderland

Predicted title: men men

Overview:
['rare', 'remarkable', 'theatrical', 'experience', 'controversial', 'provocative', 'shocking', 'two', 'academy', 'make', 'play', 'motion', 'picture', 'event', 'year', 'would', 'someone', 'sat', 'one', 'night', 'calmly', 'told', 'going', 'end', 'life', 'morning']

True title: sky

Predicted title: soul

Overview:
['storm', 'forming', 'work', 'robert', 'watt', 'pioneer', 'radar', 'team', 'eccentric', 'yet', 'brilliant', 'struggle', 'turn', 'concept', 'radar', 'workable', 'reality', 'tiny', 'budget', 'seemingly', 'insurmountable', 'technical', 'even', 'spy', 'camp', 'watt', 'also', 'deal', 'marital', 'dream', 'watt', 'team', 'world'

### With attention

In [48]:
from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq

# Attention variant: single-layer, unidirectional GRU encoder/decoder with use_attention=True.
bidirectional = False
hidden_size = 128

encoder = EncoderRNN(vocab_size=len(src.vocab), max_len=100, hidden_size=hidden_size, rnn_cell='gru', bidirectional=bidirectional, n_layers=1)
# The decoder's hidden size is doubled when the encoder is bidirectional.
decoder = DecoderRNN(vocab_size=len(tgt.vocab), max_len=100, hidden_size=hidden_size * 2 if bidirectional else hidden_size, 
                                    rnn_cell='gru', bidirectional=bidirectional, n_layers=1, use_attention=True,
                                    eos_id=tgt.eos_id, sos_id=tgt.sos_id)
# Rebinds `model`; the previously trained model object is no longer referenced by this name.
model = Seq2seq(encoder, decoder)
# Requires a CUDA device.
model.cuda()
# Re-draw every parameter uniformly from [-0.08, 0.08].
for param in model.parameters():
    param.data.uniform_(-0.08, 0.08)

In [49]:
from seq2seq.loss import NLLLoss
from seq2seq.trainer import SupervisedTrainer

# NLL loss moved to the GPU, and a trainer with batches of 64.
loss = NLLLoss()
loss.cuda()
# Every section of this notebook passes the same expt_dir ('seq2seq').
t = SupervisedTrainer(loss=loss, batch_size=64,
                          checkpoint_every=10000,
                          print_every=10000, expt_dir='seq2seq')
# Passed as-is to t.train() in the next cell; presumably the trainer then uses its own default optimizer -- confirm.
optimizer = None

In [50]:
%time model = t.train(model, train, num_epochs=50, optimizer=optimizer, teacher_forcing_ratio=0.5, resume=False)

  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)


CPU times: user 4min 13s, sys: 57.4 s, total: 5min 10s
Wall time: 5min 11s


In [51]:
predictor = seq2seq.evaluator.Predictor(model, input_vocab, output_vocab)

In [52]:
# Re-read the tokenized test split (column 0: overview tokens, column 1: title tokens); dropna() discards rows with a missing column.
# NOTE(review): pandas' default NA parsing also turns literal tokens such as 'null' or 'nan'
# into NaN, so such rows would be dropped as well -- confirm whether keep_default_na=False is wanted.
test = pd.read_csv('data/test_stopwords.tsv', header=None, delimiter='\t').dropna()
test.head()

Unnamed: 0,0,1
0,little wonder bill eileen three grown lonely w...,wonderland
1,rare remarkable theatrical experience controve...,mother
3,storm forming work robert watt pioneer radar t...,sky
7,buster around blacksmith shop smithy get fight...,blacksmith
8,man away join paramilitary group teens fightin...,mercury


In [53]:
# Overviews become token lists (the form handed to predictor.predict below); titles stay space-joined strings.
X_test = [x.strip().split() for x in test[0]]
y_test = test[1].values

In [54]:
predictor.predict(X_test[550])

  attn = F.softmax(attn.view(-1, input_size)).view(batch_size, -1, input_size)
  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)


['halloween', '<eos>']

In [55]:
y_test[550]

'forget paris'

In [56]:
import tqdm

# Decode a title for every test overview.
preds = []
for x in tqdm.tqdm(X_test):
    tokens = predictor.predict(x)
    # Drop the end-of-sequence marker only when it is actually there: if
    # decoding stops without emitting '<eos>', a blind [:-1] would discard
    # the last real word.
    if tokens and tokens[-1] == '<eos>':
        tokens = tokens[:-1]
    preds.append(' '.join(tokens))

  attn = F.softmax(attn.view(-1, input_size)).view(batch_size, -1, input_size)
  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)
100%|██████████| 5588/5588 [07:17<00:00, 12.76it/s]


In [62]:
metric(y_test, preds, words=en_words)

0.716016441544048

In [47]:
# Show the first ten test overviews next to their true and predicted titles.
# NOTE(review): this cell still carries the execution count In [47] and its saved
# output is identical to the baseline section's -- re-run it after the
# prediction loop of this section to refresh the examples.
print('Examples of work:')
print()

for x, y, z in zip(X_test[:10], y_test[:10], preds[:10]):
    print('Overview:')
    print(x)
    print()
    print('True title: {}'.format(y))
    print()
    print('Predicted title: {}'.format(z))
    print()

Examples of work:

Overview:
['little', 'wonder', 'bill', 'eileen', 'three', 'grown', 'lonely', 'waitress', 'personal', 'looking', 'love', 'debbie', 'single', 'men', 'hair', 'salon', 'son', 'part', 'weekend', 'ex', 'man', 'temper', 'molly', 'first', 'baby', 'father', 'acts', 'responsibility', 'much']

True title: wonderland

Predicted title: men men

Overview:
['rare', 'remarkable', 'theatrical', 'experience', 'controversial', 'provocative', 'shocking', 'two', 'academy', 'make', 'play', 'motion', 'picture', 'event', 'year', 'would', 'someone', 'sat', 'one', 'night', 'calmly', 'told', 'going', 'end', 'life', 'morning']

True title: sky

Predicted title: soul

Overview:
['storm', 'forming', 'work', 'robert', 'watt', 'pioneer', 'radar', 'team', 'eccentric', 'yet', 'brilliant', 'struggle', 'turn', 'concept', 'radar', 'workable', 'reality', 'tiny', 'budget', 'seemingly', 'insurmountable', 'technical', 'even', 'spy', 'camp', 'watt', 'also', 'deal', 'marital', 'dream', 'watt', 'team', 'world'

### Pretrained embeddings

In [64]:
from seq2seq.dataset import SourceField, TargetField
import torchtext

def load_dataset(path, max_len=100, max_vocab_size=50000):
    """Load a tab-separated source/target file as a torchtext dataset.

    NOTE(review): a function with this name is already defined earlier in the
    notebook; this definition shadows it.

    Args:
        path: TSV file; the first column is read as ``src``, the second as ``tgt``.
        max_len: Examples whose ``src`` or ``tgt`` has a ``len()`` above this
            are filtered out.
        max_vocab_size: ``max_size`` passed to ``build_vocab`` for both fields.

    Returns:
        ``(dataset, src, tgt)``: the ``TabularDataset`` and the two fields,
        each with a vocabulary built from that dataset.
    """
    src = SourceField()
    tgt = TargetField()

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    dataset = torchtext.data.TabularDataset(
        path=path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    src.build_vocab(dataset, max_size=max_vocab_size)
    tgt.build_vocab(dataset, max_size=max_vocab_size)
    return dataset, src, tgt

In [65]:
# Rebuild the training set and vocabularies, then attach the 'fasttext.en.300d' vectors to the source vocabulary.
# NOTE(review): nothing in the following model cell reads src.vocab.vectors -- confirm the vectors are actually used.
train, src, tgt = load_dataset('data/train_stopwords.tsv')
src.vocab.load_vectors('fasttext.en.300d')
input_vocab = src.vocab
output_vocab = tgt.vocab

.vector_cache/wiki.en.vec: 6.60GB [1:06:13, 1.66MB/s]                            
  0%|          | 0/2519371 [00:00<?, ?it/s]Skipping token 2519370 with 1-dimensional vector ['300']; likely a header
100%|██████████| 2519371/2519371 [02:43<00:00, 15402.18it/s]


In [67]:
len(input_vocab)

21460

In [68]:
from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq

# "Pretrained embeddings" variant: single-layer, unidirectional GRU encoder/decoder without attention.
# NOTE(review): the vectors loaded into src.vocab in the previous cell are not passed to
# EncoderRNN here, and the loop at the bottom re-draws every parameter uniformly, so this
# model appears to learn its embeddings from scratch like the baseline -- confirm, and check
# which embedding dimension EncoderRNN expects before wiring the vectors in.
bidirectional = False
hidden_size = 128

encoder = EncoderRNN(vocab_size=len(src.vocab), max_len=100, hidden_size=hidden_size, rnn_cell='gru', bidirectional=bidirectional, n_layers=1)
# The decoder's hidden size is doubled when the encoder is bidirectional.
decoder = DecoderRNN(vocab_size=len(tgt.vocab), max_len=100, hidden_size=hidden_size * 2 if bidirectional else hidden_size, 
                                    rnn_cell='gru', bidirectional=bidirectional, n_layers=1, use_attention=False,
                                    eos_id=tgt.eos_id, sos_id=tgt.sos_id)
model = Seq2seq(encoder, decoder)
# Requires a CUDA device.
model.cuda()
# Re-draw every parameter uniformly from [-0.08, 0.08].
for param in model.parameters():
    param.data.uniform_(-0.08, 0.08)

In [69]:
from seq2seq.loss import NLLLoss
from seq2seq.trainer import SupervisedTrainer

# NLL loss moved to the GPU, and a trainer with batches of 64.
loss = NLLLoss()
loss.cuda()
# Every section of this notebook passes the same expt_dir ('seq2seq').
t = SupervisedTrainer(loss=loss, batch_size=64,
                          checkpoint_every=10000,
                          print_every=10000, expt_dir='seq2seq')
# Passed as-is to t.train() in the next cell; presumably the trainer then uses its own default optimizer -- confirm.
optimizer = None

In [70]:
%time model = t.train(model, train, num_epochs=50, optimizer=optimizer, teacher_forcing_ratio=0.5, resume=False)

  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)


CPU times: user 5min 55s, sys: 1min 52s, total: 7min 47s
Wall time: 7min 50s


In [71]:
predictor = seq2seq.evaluator.Predictor(model, input_vocab, output_vocab)

In [72]:
# Re-read the tokenized test split (column 0: overview tokens, column 1: title tokens); dropna() discards rows with a missing column.
# NOTE(review): pandas' default NA parsing also turns literal tokens such as 'null' or 'nan'
# into NaN, so such rows would be dropped as well -- confirm whether keep_default_na=False is wanted.
test = pd.read_csv('data/test_stopwords.tsv', header=None, delimiter='\t').dropna()
test.head()

Unnamed: 0,0,1
0,little wonder bill eileen three grown lonely w...,wonderland
1,rare remarkable theatrical experience controve...,mother
3,storm forming work robert watt pioneer radar t...,sky
7,buster around blacksmith shop smithy get fight...,blacksmith
8,man away join paramilitary group teens fightin...,mercury


In [73]:
# Overviews become token lists (the form handed to predictor.predict below); titles stay space-joined strings.
X_test = [x.strip().split() for x in test[0]]
y_test = test[1].values

In [74]:
predictor.predict(X_test[550])

  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)


['thief', '<eos>']

In [75]:
y_test[550]

'forget paris'

In [76]:
import tqdm

# Decode a title for every test overview.
preds = []
for x in tqdm.tqdm(X_test):
    tokens = predictor.predict(x)
    # Drop the end-of-sequence marker only when it is actually there: if
    # decoding stops without emitting '<eos>', a blind [:-1] would discard
    # the last real word.
    if tokens and tokens[-1] == '<eos>':
        tokens = tokens[:-1]
    preds.append(' '.join(tokens))

  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)
100%|██████████| 5588/5588 [06:37<00:00, 14.07it/s]


In [82]:
metric(y_test, preds, words=en_words)

0.7196469058787607

In [83]:
X_test[0]

['little',
 'wonder',
 'bill',
 'eileen',
 'three',
 'grown',
 'lonely',
 'waitress',
 'personal',
 'looking',
 'love',
 'debbie',
 'single',
 'men',
 'hair',
 'salon',
 'son',
 'part',
 'weekend',
 'ex',
 'man',
 'temper',
 'molly',
 'first',
 'baby',
 'father',
 'acts',
 'responsibility',
 'much']

In [87]:
from collections import Counter

def dummy_predict(text):
    """Baseline "title": the most frequent token of a tokenized overview.

    Args:
        text: Iterable of hashable tokens.

    Returns:
        The token with the highest count (``Counter.most_common`` decides
        ties), or ``''`` for empty input instead of raising ``IndexError``.
    """
    top = Counter(text).most_common(1)
    return top[0][0] if top else ''

In [90]:
dummy_preds = [dummy_predict(x) for x in X_test]

In [91]:
metric(y_test, dummy_preds, words=en_words)

0.7726990462191043

In [47]:
# Show the first ten test overviews next to their true and predicted titles.
# NOTE(review): this cell still carries the execution count In [47] and its saved
# output is identical to the baseline section's -- re-run it after the
# prediction loop of this section to refresh the examples.
print('Examples of work:')
print()

for x, y, z in zip(X_test[:10], y_test[:10], preds[:10]):
    print('Overview:')
    print(x)
    print()
    print('True title: {}'.format(y))
    print()
    print('Predicted title: {}'.format(z))
    print()

Examples of work:

Overview:
['little', 'wonder', 'bill', 'eileen', 'three', 'grown', 'lonely', 'waitress', 'personal', 'looking', 'love', 'debbie', 'single', 'men', 'hair', 'salon', 'son', 'part', 'weekend', 'ex', 'man', 'temper', 'molly', 'first', 'baby', 'father', 'acts', 'responsibility', 'much']

True title: wonderland

Predicted title: men men

Overview:
['rare', 'remarkable', 'theatrical', 'experience', 'controversial', 'provocative', 'shocking', 'two', 'academy', 'make', 'play', 'motion', 'picture', 'event', 'year', 'would', 'someone', 'sat', 'one', 'night', 'calmly', 'told', 'going', 'end', 'life', 'morning']

True title: sky

Predicted title: soul

Overview:
['storm', 'forming', 'work', 'robert', 'watt', 'pioneer', 'radar', 'team', 'eccentric', 'yet', 'brilliant', 'struggle', 'turn', 'concept', 'radar', 'workable', 'reality', 'tiny', 'budget', 'seemingly', 'insurmountable', 'technical', 'even', 'spy', 'camp', 'watt', 'also', 'deal', 'marital', 'dream', 'watt', 'team', 'world'

### Bidirectional

In [107]:
from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq

# Bidirectional variant: single-layer bidirectional GRU encoder, decoder with attention.
bidirectional = True
hidden_size = 128

encoder = EncoderRNN(vocab_size=len(src.vocab), max_len=100, hidden_size=hidden_size, rnn_cell='gru', bidirectional=bidirectional, n_layers=1)
# With bidirectional=True the decoder's hidden size is hidden_size * 2 (256 here).
decoder = DecoderRNN(vocab_size=len(tgt.vocab), max_len=100, hidden_size=hidden_size * 2 if bidirectional else hidden_size, 
                                    rnn_cell='gru', bidirectional=bidirectional, n_layers=1, use_attention=True,
                                    eos_id=tgt.eos_id, sos_id=tgt.sos_id)
model = Seq2seq(encoder, decoder)
# Requires a CUDA device.
model.cuda()
# Re-draw every parameter uniformly from [-0.08, 0.08].
for param in model.parameters():
    param.data.uniform_(-0.08, 0.08)

In [108]:
from seq2seq.loss import NLLLoss
from seq2seq.trainer import SupervisedTrainer

# NLL loss moved to the GPU, and a trainer with batches of 64.
loss = NLLLoss()
loss.cuda()
# Every section of this notebook passes the same expt_dir ('seq2seq').
t = SupervisedTrainer(loss=loss, batch_size=64,
                          checkpoint_every=10000,
                          print_every=10000, expt_dir='seq2seq')
# Passed as-is to t.train() in the next cell; presumably the trainer then uses its own default optimizer -- confirm.
optimizer = None

In [109]:
%time model = t.train(model, train, num_epochs=50, optimizer=optimizer, teacher_forcing_ratio=0.5, resume=False)

  attn = F.softmax(attn.view(-1, input_size)).view(batch_size, -1, input_size)
  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)


CPU times: user 5min 36s, sys: 1min 15s, total: 6min 52s
Wall time: 6min 51s


In [110]:
predictor = seq2seq.evaluator.Predictor(model, input_vocab, output_vocab)

In [111]:
# Re-read the tokenized test split (column 0: overview tokens, column 1: title tokens); dropna() discards rows with a missing column.
# NOTE(review): pandas' default NA parsing also turns literal tokens such as 'null' or 'nan'
# into NaN, so such rows would be dropped as well -- confirm whether keep_default_na=False is wanted.
test = pd.read_csv('data/test_stopwords.tsv', header=None, delimiter='\t').dropna()
test.head()

Unnamed: 0,0,1
0,little wonder bill eileen three grown lonely w...,wonderland
1,rare remarkable theatrical experience controve...,mother
3,storm forming work robert watt pioneer radar t...,sky
7,buster around blacksmith shop smithy get fight...,blacksmith
8,man away join paramilitary group teens fightin...,mercury


In [112]:
# Overviews become token lists (the form handed to predictor.predict below); titles stay space-joined strings.
X_test = [x.strip().split() for x in test[0]]
y_test = test[1].values

In [113]:
predictor.predict(X_test[550])

  attn = F.softmax(attn.view(-1, input_size)).view(batch_size, -1, input_size)
  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)


['actress', '<eos>']

In [114]:
y_test[550]

'forget paris'

In [115]:
import tqdm

# Decode a title for every test overview.
preds = []
for x in tqdm.tqdm(X_test):
    tokens = predictor.predict(x)
    # Drop the end-of-sequence marker only when it is actually there: if
    # decoding stops without emitting '<eos>', a blind [:-1] would discard
    # the last real word.
    if tokens and tokens[-1] == '<eos>':
        tokens = tokens[:-1]
    preds.append(' '.join(tokens))

  attn = F.softmax(attn.view(-1, input_size)).view(batch_size, -1, input_size)
  predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size))).view(batch_size, output_size, -1)
100%|██████████| 5588/5588 [07:35<00:00, 12.27it/s]


In [121]:
metric(y_test, preds, words=en_words)

0.7063390948060938

In [123]:
# Per-example distance between each true title and its prediction (one metric() call per pair).
scores = [metric(y_test[i], preds[i], words=en_words) for i in range(len(y_test))]

In [127]:
inds = list(np.argsort(scores))

In [131]:
scores[inds[0]]

0.0

In [148]:
i = 130
y_test[inds[i]], preds[inds[i]], X_test[inds[i]]

('monster high city',
 'monster',
 ['sons',
  'international',
  'sent',
  'boot',
  'camp',
  'taught',
  'basic',
  'survival',
  'teach',
  'responsibility',
  'taken',
  'hostage',
  'taken',
  'ransom',
  'need',
  'utilize',
  'every',
  'skill',
  'learned',
  'survive'])

In [122]:
# Show the first ten test overviews next to their true and predicted titles.
print('Examples of work:')
print()

for x, y, z in zip(X_test[:10], y_test[:10], preds[:10]):
    print('Overview:')
    print(x)
    print()
    print('True title: {}'.format(y))
    print()
    print('Predicted title: {}'.format(z))
    print()

Examples of work:

Overview:
['little', 'wonder', 'bill', 'eileen', 'three', 'grown', 'lonely', 'waitress', 'personal', 'looking', 'love', 'debbie', 'single', 'men', 'hair', 'salon', 'son', 'part', 'weekend', 'ex', 'man', 'temper', 'molly', 'first', 'baby', 'father', 'acts', 'responsibility', 'much']

True title: wonderland

Predicted title: hello

Overview:
['rare', 'remarkable', 'theatrical', 'experience', 'controversial', 'provocative', 'shocking', 'two', 'academy', 'make', 'play', 'motion', 'picture', 'event', 'year', 'would', 'someone', 'sat', 'one', 'night', 'calmly', 'told', 'going', 'end', 'life', 'morning']

True title: mother

Predicted title: jump

Overview:
['storm', 'forming', 'work', 'robert', 'watt', 'pioneer', 'radar', 'team', 'eccentric', 'yet', 'brilliant', 'struggle', 'turn', 'concept', 'radar', 'workable', 'reality', 'tiny', 'budget', 'seemingly', 'insurmountable', 'technical', 'even', 'spy', 'camp', 'watt', 'also', 'deal', 'marital', 'dream', 'watt', 'team', 'world