In [1]:
import pandas as pd
import os

In [2]:
data_directory = os.path.join('..', 'data')
movies = pd.read_csv(os.path.join(data_directory, 'tmdb_5000_movies.csv'))
credits = pd.read_csv(os.path.join(data_directory, 'tmdb_5000_credits.csv'))

In [3]:
movies = movies.merge(credits, on='title')

In [4]:
movies = movies[['movie_id', 'title', 'overview',
                 'genres', 'keywords', 'cast', 'crew']]

In [5]:
movies.dropna(inplace=True)

In [6]:
import ast

In [7]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [8]:
movies['genres'] = movies['genres'].apply(convert)

In [9]:
movies['keywords'] = movies['keywords'].apply(convert)

In [10]:
def convert2(obj):
    L = []
    cnt = 0
    for i in ast.literal_eval(obj):
        if cnt != 3:
            L.append(i['name'])
            cnt += 1
        else:
            break
    return L

In [11]:
movies['cast'] = movies['cast'].apply(convert2)

In [12]:
def dirr(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [13]:
movies['crew'] = movies['crew'].apply(dirr)

In [14]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [15]:
movies['genres'].apply(
    lambda x: [i.replace(" ",  "") for i in x])
movies['keywords'] = movies['keywords'].apply(
    lambda x: [i.replace(" ",  "") for i in x])
movies['cast'] = movies['cast'].apply(
    lambda x: [i.replace(" ",  "") for i in x])
movies['crew'] = movies['crew'].apply(
    lambda x: [i.replace(" ",  "") for i in x])

In [16]:
movies['tags'] = (movies['overview']
                  + movies['genres']
                  + movies['keywords']
                  + movies['cast']
                  + movies['crew'])

In [17]:
new_df = movies[['movie_id' , 'title', 'tags']]

In [18]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [19]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [21]:
vector = cv.fit_transform(new_df['tags']).toarray()

In [22]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [23]:
def stem(text):
    y  = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [24]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
sim = cosine_similarity(vector)

In [27]:
def rec(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = sim[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True,
                         key=lambda x: x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [28]:
rec('Interstellar')

The Martian
Space Pirate Captain Harlock
Silent Running
Gattaca
Lost in Space


In [29]:
import pickle

In [30]:
pickle.dump(new_df, open(os.path.join(data_directory, 'movies.pkl') , 
                         'wb'))

In [31]:
pickle.dump(sim, open(os.path.join(data_directory, 'sim.pkl'), 'wb'))

In [32]:
pickle.dump(new_df.to_dict(),
            open(os.path.join(data_directory, 'movie_dict.pkl'), 'wb'))

In [33]:
import shutil

In [34]:
shutil.make_archive(data_directory, 'zip', data_directory)
shutil.rmtree(data_directory)