In [46]:
import pandas as pd
import ast
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import nltk
from nltk.stem.porter import PorterStemmer

In [47]:
# Load the TMDB credits and movie-metadata tables, join them on their shared
# 'title' column, and keep only the columns needed to build the tag text.
# NOTE(review): the join key is the title, so any title that appears more than
# once in either file would yield extra joined rows — confirm titles are unique
# or switch to an id-based key.
credits = pd.read_csv("tmdb_5000_credits.csv")
movies = pd.read_csv("tmdb_5000_movies.csv").merge(credits, on='title')

movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [48]:
def convert(obj, limit=3):
    """Extract the leading 'name' values from a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Text holding a Python literal list of dicts, each with a 'name' key
        (parsed with ``ast.literal_eval``).
    limit : int or None, default 3
        Maximum number of names to return. ``None`` returns every name.

    Returns
    -------
    list
        The 'name' values of the first ``limit`` entries, in original order.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``obj`` is not a valid literal.
    KeyError
        If one of the consumed entries has no 'name' key.
    """
    names = []
    for entry in ast.literal_eval(obj):
        # Stop before touching the entry that would exceed the cap.
        if limit is not None and len(names) >= limit:
            break
        names.append(entry['name'])
    return names

def director(obj):
    """Return the first crew member whose job is 'Director', as a one-item list.

    ``obj`` is text holding a Python literal list of dicts with 'job' and
    'name' keys. An empty list is returned when no entry has that job.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first match is wanted, so return immediately.
            return [member['name']]
    return []

In [49]:
# Drop rows with any missing value, parse the stringified list columns into
# lists of names (via convert / director), and rename 'crew' to 'director'
# because after parsing it only holds the director's name.
movies = (
    movies
    .dropna()
    .assign(
        genres=lambda frame: frame['genres'].apply(convert),
        keywords=lambda frame: frame['keywords'].apply(convert),
        cast=lambda frame: frame['cast'].apply(convert),
        crew=lambda frame: frame['crew'].apply(director),
    )
    .rename(columns={'crew': 'director'})
)

In [50]:
def collapse(a):
    """Remove spaces inside each name so multi-word names become single tokens.

    Parameters
    ----------
    a : iterable
        Items that are strings, lists of strings, or anything else.

    Returns
    -------
    list of str
        One flat list: strings have their spaces removed, nested lists are
        flattened (their non-string members are skipped), and any other
        top-level item contributes an empty string.
    """
    arr = []
    for item in a:
        if isinstance(item, str):
            arr.append(item.replace(" ", ""))
        elif isinstance(item, list):
            # Fixed: the original called the misspelled `insistance`, which
            # raised NameError as soon as a nested list was encountered.
            arr.extend(x.replace(" ", "") for x in item if isinstance(x, str))
        else:
            arr.append("")
    return arr

# Squash the spaces out of every name-like column so that a multi-word name
# is treated as one token, and split the overview text into words so that all
# five columns hold lists.
movies = movies.assign(
    **{
        column: movies[column].apply(collapse)
        for column in ('cast', 'genres', 'keywords', 'director')
    },
    overview=movies['overview'].map(lambda text: str(text).split()),
)

# List concatenation: overview words first, then genres, keywords, cast, director.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['director']
)

# Keep only what the recommender needs; 'tags' becomes one lower-cased string.
df2 = movies[['movie_id', 'title', 'tags']].copy()
df2['tags'] = df2['tags'].map(lambda tokens: " ".join(tokens).lower())

In [51]:
# Show the tag string of the first row as a sanity check.
print(df2['tags'].iloc[0])

in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy cultureclash future spacewar samworthington zoesaldana sigourneyweaver jamescameron


In [52]:
# TF-IDF over the tag strings: English stop words are removed and the
# vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

vector = tfidf.fit_transform(df2['tags'])
# fit_transform returns a sparse matrix; convert it to a dense ndarray.
vector = vector.toarray()

print(f"Форма матрицы векторов (фильмы * слова): {vector.shape}")

Форма матрицы векторов (фильмы * слова): (4806, 5000)


In [53]:
# Pairwise cosine similarity between every pair of rows of `vector`
# (one row per movie), giving a square movies-by-movies matrix.
similarity = cosine_similarity(X=vector)
print(f"Форма матрицы сходства (фильмы x фильмы): {similarity.shape}")

Форма матрицы сходства (фильмы x фильмы): (4806, 4806)


In [54]:
# Persist the artifacts: the movie table as a plain dict and the similarity
# matrix down-cast to float32.
similarity = similarity.astype(np.float32)

# Context managers guarantee the files are flushed and closed even if
# pickling fails; the original left both handles open.
with open('movie_dict.pkl', 'wb') as movie_file:
    pickle.dump(df2.to_dict(), movie_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

print("Модель готова")

Модель готова


In [55]:
ps = PorterStemmer()

def stem(text):
    """Return `text` with each whitespace-separated word replaced by its Porter stem.

    Words are re-joined with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

# NOTE(review): in the visible cell order this runs after the TF-IDF vectors,
# the similarity matrix and both pickle files were produced, so the stemmed
# tags are not reflected in any of them — consider moving this step ahead of
# vectorisation if stemming is meant to affect the model.
df2['tags'] = df2['tags'].apply(stem)