In [5]:
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn
import gensim
%matplotlib inline

In [6]:
# Read the movie metadata table from disk.
# low_memory=False asks pandas to parse the file in one go rather than in chunks,
# which avoids mixed-dtype inference on columns with heterogeneous values.
data = pd.read_csv('data/movies_metadata.csv', low_memory=False)
# Preview the first rows (rendered as the cell's output).
data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [7]:
# Narrow the dataset step by step and report how many rows survive each filter.
# NOTE: this cell rebinds `data`; after it has run once, re-running it requires
# reloading the CSV first because the 'original_language' column is dropped below.
print('Initial number of movies: ', len(data))
# Keep only rows whose original language is English.
data = data[data['original_language'] == 'en']
print('Number of movies in English: ', len(data))
# Keep just the two columns used later and drop rows missing either of them.
data = data[['title', 'overview']].dropna()
print('Number of films in English with title and overview: ', len(data))

Initial number of movies:  45466
Number of movies in English:  32269
Number of films in English with title annd overview:  32198


In [8]:
# Preview the filtered frame (only the title and overview columns remain).
data.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [9]:
from gensim.models.fasttext import FastText

# Load a fastText model stored in the original fastText binary format; %time reports the load duration.
# NOTE(review): newer gensim releases reportedly deprecate load_fasttext_format in favour of
# gensim.models.fasttext.load_facebook_model — confirm against the installed gensim version.
%time ft_model = FastText.load_fasttext_format('../fastText/models/wiki.en/wiki.en.bin')

CPU times: user 3min 14s, sys: 7.63 s, total: 3min 21s
Wall time: 3min 37s


In [10]:
# Show the docstring attached to the loaded model object.
print(ft_model.__doc__)

Class for training, using and evaluating word representations learned using method
    described in [1]_ aka Fasttext.

    The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and
    :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original
    fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`.

    


### Пример embedding'ов

In [11]:
# Collect every token known to the model (iterating the vocab yields its keys)
model_words = list(ft_model.wv.vocab)

# Report how many tokens are available
print("Number of Tokens: {}".format(len(model_words)))

# Report the length of a single word vector
print("Dimension of a word vector: {}\n".format(len(ft_model.wv['car'])))

# Query word for the nearest-neighbour demo
find_similar_to = 'car'

# Print the nearest neighbours; similar_by_word is called with its default neighbour count
for neighbour, score in ft_model.wv.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(neighbour, score))

Number of Tokens: 2519370
Dimension of a word vector: 300

Word: cars, Similarity: 0.83
Word: automobile, Similarity: 0.72
Word: truck, Similarity: 0.71
Word: motorcar, Similarity: 0.70
Word: vehicle, Similarity: 0.70
Word: driver, Similarity: 0.69
Word: drivecar, Similarity: 0.69
Word: minivan, Similarity: 0.67
Word: roadster, Similarity: 0.67
Word: racecars, Similarity: 0.67


### Baseline 1

Предсказание — слово, вектор которого ближе всего в пространстве эмбеддингов к среднему вектору слов описания (без стоп-слов).

In [12]:
from nltk.corpus import stopwords
from nltk.corpus import words
import nltk

# Make sure the two NLTK corpora used below are available locally
for corpus_name in ('stopwords', 'words'):
    nltk.download(corpus_name)

# English stopwords as a set for fast membership tests
en_stopwords = set(stopwords.words('english'))
# Lower-cased English dictionary words, also as a set
en_words = {entry.lower() for entry in words.words()}

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/femoiseev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/femoiseev/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [20]:
import string
import nltk
from scipy import spatial

# Download the 'punkt' tokenizer data; presumably required by nltk.word_tokenize, which the functions in this cell call.
nltk.download('punkt')

def get_mean_embeds(text, words, model=ft_model, stopwords=[]):
    """Average the word vectors of the usable tokens in ``text``.

    The text is tokenized with ``nltk.word_tokenize`` and lower-cased. A token
    contributes to the average only if it is not in ``stopwords``, is contained
    in ``words``, and the model can produce a vector for it.

    Args:
        text: String to embed.
        words: Container of allowed (lower-case) tokens, tested with ``in``.
        model: Object exposing word vectors through ``model.wv[token]``.
        stopwords: Container of tokens to ignore, tested with ``in``. It is
            never mutated here.

    Returns:
        The element-wise mean of the selected word vectors. When no token
        qualifies, a constant vector filled with 0.1 and shaped like
        ``model.wv['and']`` is returned instead.
    """
    embeds = []
    for token in nltk.word_tokenize(text):
        token = token.lower()
        if token in stopwords or token not in words:
            continue
        try:
            embeds.append(model.wv[token])
        except KeyError:
            # The model has no vector for this token; skip it. Only the lookup
            # failure is ignored so that unrelated errors are not silently hidden.
            continue
    if not embeds:
        # Constant non-zero fallback — presumably chosen so that the cosine
        # distance computed downstream stays defined (a zero vector has no direction).
        return np.zeros_like(model.wv['and']) + 0.1
    return np.mean(embeds, axis=0)

def metric(title, pred, words, model=ft_model, stopwords=[]):
    """Cosine distance between the mean embeddings of a title and a prediction.

    Args:
        title: Either a single title string or an iterable of title strings.
        pred: The predicted text for ``title``, or an iterable of predictions
            paired element-wise with ``title``. ``None`` is treated as an
            empty prediction.
        words: Container of allowed tokens, forwarded to ``get_mean_embeds``.
        model: Embedding model, forwarded to ``get_mean_embeds``.
        stopwords: Tokens to ignore, forwarded to ``get_mean_embeds``.

    Returns:
        For a single title: ``scipy.spatial.distance.cosine`` of the two mean
        embeddings. For an iterable: the mean of the per-pair distances
        (pairs are formed with ``zip``, so extra items on either side are ignored).
    """
    # isinstance also accepts str subclasses, which an exact type comparison
    # would have sent down the "iterable of titles" branch.
    if isinstance(title, str):
        # A missing prediction is scored as empty text (which maps to the
        # fallback embedding) instead of crashing inside the tokenizer.
        pred_text = '' if pred is None else pred
        title_embed = get_mean_embeds(title, words=words, model=model, stopwords=stopwords)
        pred_embed = get_mean_embeds(pred_text, words=words, model=model, stopwords=stopwords)
        return spatial.distance.cosine(title_embed, pred_embed)
    return np.mean([
        metric(one_title, one_pred, words, model, stopwords)
        for one_title, one_pred in zip(title, pred)
    ])

[nltk_data] Downloading package punkt to /home/femoiseev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [76]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the split reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [77]:
# Inputs are the overviews, targets are the titles — taken as plain arrays per split
X_train, y_train = train['overview'].values, train['title'].values
X_test, y_test = test['overview'].values, test['title'].values

In [23]:
def predict(text, words=en_words, model=ft_model, stopwords=[],
            topn=100, restrict_vocab=50000, min_length=6):
    """Predict a one-word title from the mean embedding of an overview.

    Args:
        text: A single overview string, or an iterable of overview strings.
        words: Container of allowed tokens, forwarded to ``get_mean_embeds``.
        model: Embedding model; must expose ``model.wv.similar_by_vector``.
        stopwords: Tokens to ignore, forwarded to ``get_mean_embeds``.
        topn: How many nearest neighbours to request from the model.
        restrict_vocab: Passed through to ``similar_by_vector`` to limit the
            part of the vocabulary that is searched.
        min_length: Minimum number of characters a neighbour must have to be
            accepted (the default of 6 keeps the original "longer than 5" rule).

    Returns:
        For a single string: the first neighbour, in the order returned by the
        model, that has at least ``min_length`` characters, or ``None`` when
        none of the ``topn`` neighbours qualifies. For an iterable: a list with
        one such result per element.
    """
    # isinstance also accepts str subclasses, which an exact type comparison
    # would have treated as an iterable of characters.
    if isinstance(text, str):
        embed = get_mean_embeds(text, words, model=model, stopwords=stopwords)
        neighbours = model.wv.similar_by_vector(embed, topn=topn, restrict_vocab=restrict_vocab)
        for candidate, _similarity in neighbours:
            if len(candidate) >= min_length:
                return candidate
        return None
    return [
        predict(item, words, model, stopwords,
                topn=topn, restrict_vocab=restrict_vocab, min_length=min_length)
        for item in text
    ]

In [24]:
# Sanity check: predict a title word for one test overview, with stopwords removed.
print(predict(X_test[5], words=en_words, stopwords=en_stopwords))

chases


In [81]:
# Predict for every test overview (stopwords removed, default word list) and time the run.
%time preds = predict(X_test, stopwords=en_stopwords)

CPU times: user 1min 7s, sys: 253 ms, total: 1min 7s
Wall time: 17 s


In [82]:
# Baseline 1 score: mean cosine distance between true-title and predicted-word embeddings.
metric(y_test, preds, words=en_words, stopwords=en_stopwords)

0.7422436773219335

In [83]:
print('Examples of work:')
print()

# Show the first ten test cases: overview, true title, and the predicted word
for overview_text, true_title, predicted_title in zip(X_test[:10], y_test[:10], preds[:10]):
    # Each group is followed by one blank line (the trailing '' joined with '\n')
    print('Overview:', overview_text, '', sep='\n')
    print('True title: {}'.format(true_title), '', sep='\n')
    print('Predicted title: {}'.format(predicted_title), '', sep='\n')

Examples of work:

Overview:
There's little wonder in the working-class lives of Bill, Eileen, and their three grown daughters. They're lonely Londoners. Nadia, a cafe waitress, places personal ads, looking for love; Debbie, a single mom, entertains men at the hair salon after hours; her son spends part of the weekend with her ex, a man with a hair-trigger temper. Molly is expecting her first baby and its father acts as if the responsibility is too much for him.

True title: Wonderland

Predicted title: girlfriend

Overview:
It happened in 1983. It was a rare and remarkable theatrical experience. Controversial. Provocative. And shocking. Now, two Academy Award-winning actresses make the Pulitzer Prize-winning play the motion picture event of the year. What would you do if someone you loved sat down with you one night and calmly told you that they were going to end their life before morning?

True title: 'night, Mother

Predicted title: shocking

Overview:
After his best friend dies in 

### Baseline 2

In [96]:
# For each test overview keep only whitespace-separated tokens that are
# dictionary words and not stopwords.
# NOTE(review): unlike get_mean_embeds, tokens are not lower-cased here and punctuation
# stays attached, so capitalised tokens cannot match the lower-cased en_words set and
# punctuated tokens are unlikely to — confirm this is intended.
X_test_upd = [
    [token for token in overview.split() if token in en_words and token not in en_stopwords]
    for overview in X_test
]

In [97]:
from collections import Counter

def dummy_predict(text):
    """Return the most frequent item of ``text``.

    Args:
        text: Iterable of hashable tokens (e.g. a list of words).

    Returns:
        The item with the highest count according to ``collections.Counter``,
        or the placeholder ``'<unk>'`` when ``text`` contains no items.
    """
    top = Counter(text).most_common(1)
    # An empty input yields an empty ranking; handle it explicitly rather than
    # catching every possible exception.
    if not top:
        return '<unk>'
    return top[0][0]

In [98]:
# Sanity check on the first filtered overview.
dummy_predict(X_test_upd[0])

'little'

In [99]:
# Most-frequent-word prediction for every filtered test overview.
dummy_preds = [dummy_predict(x) for x in X_test_upd]

In [100]:
# Baseline 2 score: mean cosine distance between true-title and most-frequent-word embeddings.
metric(y_test, dummy_preds, words=en_words, stopwords=en_stopwords)

0.7663872657898056