In [1]:
import numpy as np
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import pickle

In [2]:
def convert(text):
    """Return the 'name' value of every dict in a stringified list of dicts.

    `text` is parsed with `ast.literal_eval`, so it must be a valid Python
    literal (a list of dicts, each with a 'name' key).
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [3]:
def fetch_director(text):
    """Return the names of all crew entries whose 'job' is exactly 'Director'.

    `text` is a stringified list of dicts (parsed with `ast.literal_eval`);
    each dict must carry 'job' and 'name' keys. Order of appearance is kept.
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]

In [4]:
def stem(text):
    """Porter-stem every whitespace-separated token of `text`.

    The stemmed tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space in the result.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in text.split())

In [5]:
def weighted_vote_average(record, min_votes=None, mean_vote=None):
    """Blend a record's own rating with the catalogue-wide mean rating.

    Computes ``(v * R + m * C) / (m + v)`` where ``v`` and ``R`` are the
    record's 'vote_count' and 'vote_average', ``m`` is `min_votes` and ``C``
    is `mean_vote`. Records with few votes are therefore pulled towards the
    overall mean, while heavily voted records keep roughly their own average.

    Parameters
    ----------
    record : mapping or pandas.Series
        Must provide 'vote_count' and 'vote_average'.
    min_votes : number, optional
        Weight given to the overall mean. When omitted, the 0.6 quantile of
        the global ``movies['vote_count']`` is used (the original behaviour).
    mean_vote : number, optional
        Overall mean rating. When omitted, the mean of the global
        ``movies['vote_average']`` is used (the original behaviour).

    Returns
    -------
    The weighted rating as a float.

    Notes
    -----
    Passing `min_votes` and `mean_vote` explicitly avoids re-scanning the
    whole `movies` frame on every call, which matters when this function is
    applied row by row. The fallbacks require a module-level `movies` frame
    that still has the 'vote_count' and 'vote_average' columns.
    """
    # Resolve each catalogue-wide statistic at most once per call.
    if min_votes is None:
        min_votes = movies['vote_count'].quantile(0.6)
    if mean_vote is None:
        mean_vote = movies['vote_average'].mean()

    votes = record['vote_count']
    return ((votes * record['vote_average']) + (min_votes * mean_vote)) / (min_votes + votes)

In [6]:
# Load the two TMDB CSV exports from the working directory (movies first, then credits).
movies, credits = map(pd.read_csv, ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv'))

In [7]:
# Join credits onto movies by TMDB id rather than by title: titles are not
# unique (remakes, re-releases), so a title join pairs every same-titled film
# with every other one and silently adds duplicate, mismatched rows.
# NOTE(review): assumes the published Kaggle schema, where the movies file
# exposes the id as `id` and the credits file as `movie_id` — confirm.
# The credits copy of `title` is dropped so the result keeps a single,
# unsuffixed `title` column for the cells below.
movies = movies.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')

In [8]:
# Keep only the columns the rest of the pipeline reads, in this order.
movies = movies.loc[:, [
    'movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
    'popularity', 'revenue', 'vote_average', 'vote_count', 'release_date',
]]
# Constant reference date (still a plain string here), placed at column position 3.
movies.insert(loc=3, column='date', value='2000-01-01')

In [9]:
# Parse both date columns from strings into pandas datetimes.
movies['release_date'] = pd.to_datetime(movies['release_date'])
movies['date'] = pd.to_datetime(movies['date'])

In [10]:
# Release date as a (possibly negative) number of days relative to the 'date' column.
movies['days'] = (movies['release_date'] - movies['date']) / np.timedelta64(1, 'D')

# Weighted rating (v*R + m*C) / (m + v), computed for the whole column at once.
# The catalogue-wide statistics m (0.6 quantile of vote_count) and C (mean
# vote_average) are evaluated a single time here; a row-wise apply would
# re-scan both columns for every row.
vote_count_floor = movies['vote_count'].quantile(0.6)
vote_average_mean = movies['vote_average'].mean()
movies['weighted_vote'] = (
    (movies['vote_count'] * movies['vote_average']) + (vote_count_floor * vote_average_mean)
) / (vote_count_floor + movies['vote_count'])

In [11]:
# The raw vote/date columns have been folded into 'weighted_vote' and 'days';
# remove them, then discard every row that still has a missing value.
movies = (
    movies
    .drop(columns=['vote_average', 'vote_count', 'date', 'release_date'])
    .dropna()
)

In [12]:
# Turn the stringified list-of-dict columns into plain lists of names.
movies['genres'] = movies['genres'].map(convert)
movies['keywords'] = movies['keywords'].map(convert)
# Only the first three listed cast members are kept.
movies['cast'] = movies['cast'].map(lambda raw_cast: convert(raw_cast)[:3])
# From the crew, keep only entries whose job is 'Director'.
movies['crew'] = movies['crew'].map(fetch_director)

In [13]:
# Remove spaces inside each name so a multi-word name becomes one token.
movies = movies.assign(**{
    column: movies[column].map(lambda names: [name.replace(' ', '') for name in names])
    for column in ('genres', 'keywords', 'cast', 'crew')
})
# Split the overview text into a list of whitespace-separated words.
movies['overview'] = movies['overview'].map(lambda overview_text: overview_text.split())

In [14]:
# Concatenate the five token lists of each row into a single 'tags' list.
movies['tags'] = [
    overview + genres + keywords + cast + crew
    for overview, genres, keywords, cast, crew in zip(
        movies['overview'],
        movies['genres'],
        movies['keywords'],
        movies['cast'],
        movies['crew'],
    )
]

In [15]:
# The individual token columns now live inside 'tags'; drop them.
movies = movies.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew'])
# Collapse each token list into one space-separated, lower-cased string.
movies['tags'] = movies['tags'].map(' '.join).str.lower()

In [16]:
# Replace every tag string with its stemmed form.
movies['tags'] = [stem(tag_text) for tag_text in movies['tags']]

In [17]:
# Scaler that maps each numeric feature into the [0, 1] range.
mms = MinMaxScaler(feature_range=(0, 1))
# Bag-of-words vectorizer capped at 5000 features, with English stop words removed.
cv = CountVectorizer(
    max_features=5000,
    stop_words='english',
)

In [18]:
# Scaled numeric features, one row per movie, in this column order.
numeric_feature_vector = mms.fit_transform(
    movies.loc[:, ['weighted_vote', 'revenue', 'popularity', 'days']]
)
# Dense token-count matrix built from the stemmed tag strings.
tag_vector = cv.fit_transform(movies['tags']).toarray()

In [19]:
# Place the numeric feature columns to the right of the tag-count columns.
final_vector = np.hstack((tag_vector, numeric_feature_vector))

In [20]:
# Pairwise cosine similarity between every pair of movie feature rows.
similarity = cosine_similarity(final_vector)
# The numeric columns are already encoded in final_vector; drop them from the frame.
movies = movies.drop(columns=['weighted_vote', 'revenue', 'popularity', 'days'])

In [21]:
# Persist the movie table and the similarity matrix for downstream use.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails; a bare open() leaves that to the garbage collector.
#
# The index is reset before export because earlier row drops leave gaps in
# the frame's index labels, whereas `similarity` is indexed by row position.
# After the reset, key i in the pickled dict refers to row/column i of the matrix.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(movies.reset_index(drop=True).to_dict(), movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)