# Импорт библиотек и чтение файлов

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline 
import numpy as np
import string
from tqdm import tqdm_notebook as tqdm
import re

#NLTK
import nltk
from nltk.corpus import stopwords
nltk.download("english")
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import Word2Vec

[nltk_data] Error loading english: Package 'english' not found in
[nltk_data]     index


In [2]:
df_movies = pd.read_csv(r'movies_metadata.csv')
df_movies.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(45466, 24)

# Preprocessing

*Проверим на наличие NaN*

In [3]:
df_movies.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [4]:
df_movies.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


*Удалим столбцы с большим количеством NaN и столбцы, которые нам не особо нужны, чтоб уменьшим размер датафрейма*

In [5]:
df_movies['original_title'] = df_movies['title']

In [7]:
df_movies.drop(df_movies.columns.difference(['original_title', 'overview', 'id']), 1, inplace=True)

  df_movies.drop(df_movies.columns.difference(['original_title', 'overview', 'id']), 1, inplace=True)


In [8]:
id_film = df_movies['id'].nunique()

In [9]:
df_movies = df_movies.fillna(0)
df_movies.isnull().sum()

id                0
original_title    0
overview          0
dtype: int64

*Функця препроцессинга*

In [10]:
def preprocessing(data, lemmatizer = WordNetLemmatizer(), 
                  stop_words = stopwords.words('english')):
    
    """
    На вход: датафрейм с описанием и названием фильмов,
    также пробрасываются стоп-слова и класс для лемматизации текстов
    
    На выходе: список с предобработанными текстовыми данными
    """
    
    texts = data.values.tolist()
    new_text = []

    for i in tqdm(range(len(texts))):
        text = texts[i]
        # Lowercase & Drop stopwords and punctuation
        punct_free = re.sub('[^a-zA-Z]', ' ', str(text).lower())
        punct_free = re.sub(r'\s+', ' ', punct_free)
        stop_free = " ".join([i for i in punct_free.split() if i not in stop_words])
        # stem_text = " ".join([stemmer.stem(word) for word in stop_free.split(' ')])
        
        lemma_text = ' '.join([lemmatizer.lemmatize(word) for word in stop_free.split(' ')])
        new_text.append(lemma_text)
    return new_text

In [11]:
text_overview = preprocessing(df_movies["overview"])
text_title = preprocessing(df_movies["original_title"])

df_movies["overview"] = text_overview
df_movies["original_title"] = text_title 

# Закидываем предобработанные описания и названия фильмов в общий список
text_prepr = text_title + text_overview

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(len(texts))):


  0%|          | 0/45466 [00:00<?, ?it/s]

  0%|          | 0/45466 [00:00<?, ?it/s]

In [33]:
text_prepr = [i.split() for i in text_overview]

In [34]:
len(text_prepr)

45466

# Word2Vec

In [35]:
model = Word2Vec(text_prepr, 
                 window=10, 
                 min_count=2)

In [36]:
# Построим словарь из последовательности предложений

model.build_vocab(text_prepr)
words = model.wv.index_to_key
vocab_size = len(words)

print("Vocab size =", vocab_size)

Vocab size = 37888


*Обучим модель W2V для поиска наиболее похожих слов*

In [22]:
model.train(text_prepr, total_examples=len(text_prepr), epochs=5)

(7225363, 7531220)

In [23]:
model.wv.most_similar("snow")

[('magically', 0.7416269779205322),
 ('hospitable', 0.7379919290542603),
 ('boatswain', 0.72590172290802),
 ('descendant', 0.7252901196479797),
 ('boar', 0.7210928201675415),
 ('devours', 0.7198227643966675),
 ('lineage', 0.7162126898765564),
 ('ananka', 0.7152280807495117),
 ('resting', 0.7143144607543945),
 ('braunger', 0.7142908573150635)]

In [24]:
model.wv.most_similar("power")

[('energy', 0.7368856072425842),
 ('demon', 0.7276333570480347),
 ('evil', 0.7214480638504028),
 ('strength', 0.7113788723945618),
 ('healing', 0.6987121105194092),
 ('powerful', 0.696567952632904),
 ('contained', 0.692743718624115),
 ('technology', 0.690617024898529),
 ('superhuman', 0.6860851049423218),
 ('superman', 0.6849820613861084)]

In [25]:
model.wv.most_similar("woman")

[('man', 0.6290788650512695),
 ('couple', 0.6136555671691895),
 ('girl', 0.5953514575958252),
 ('prostitute', 0.5938584208488464),
 ('housewife', 0.5589686036109924),
 ('stranger', 0.5094226002693176),
 ('distressed', 0.5083330273628235),
 ('lonely', 0.5040274262428284),
 ('attractive', 0.4993456304073334),
 ('franzi', 0.49914056062698364)]

## Tockenizer & embeding

In [37]:
tokenizer = Tokenizer()

In [38]:
# Обновляем внутренний словарь на основе списка текстов.
tokenizer.fit_on_texts(df_movies["overview"])
vocab_size = len(tokenizer.word_index) + 1

print("Total words:", vocab_size)

Total words: 67108


*Создание матрицы эмбедингов*

*Функция построения матрицы эмбедингов*

In [39]:
def embedding_matrix(tock, model):
    
    """
    На вход: словарь tokenizer.word_index 
    модель word2vec
    
    На выходе: матрица эмбедингов текста
    """
    
    embedding_matrix = np.zeros((id_film, 100))

    for word, i in tock.items():
        if word in model:
            embedding_matrix[i] = model[word]
    return embedding_matrix

In [40]:
embedding_matrix = embedding_matrix(tokenizer.word_index, model.wv)

In [58]:
df_emb = pd.concat([df_movies[['id']],
                    pd.DataFrame(embedding_matrix)], axis=1)

In [60]:
df_emb.head(5)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8844,-1.130172,-0.203857,-0.26567,1.005027,-0.30345,-1.180258,0.296309,1.438673,1.021423,...,1.989233,0.054273,-0.872724,-0.601492,0.401695,0.966455,0.363875,-0.444427,-0.117064,0.346752
2,15602,-0.594582,1.11217,-0.733422,0.645595,0.193194,-0.837224,0.091867,2.327102,0.459391,...,0.593009,0.270496,0.790447,-0.303808,0.579712,0.231543,0.506603,-0.293885,0.316924,1.120909
3,31357,-0.223846,1.122805,0.847333,-0.301235,0.221316,-0.771223,-0.397641,1.510231,0.053264,...,0.402126,-0.261817,0.493453,-0.949528,0.204875,-0.038256,0.064748,0.098242,-0.255637,0.325463
4,11862,0.801947,-0.497675,0.179389,0.670217,0.565143,-0.561894,0.38174,1.638131,-0.976643,...,1.685586,-0.253695,-1.297128,-0.106872,-0.188993,0.55561,-1.166854,-0.724291,-0.468282,0.22844


In [61]:
df_emb.shape

(45466, 101)