# 0. Imports and setup

In [16]:
import numpy as np
import pandas as pd
import json
import warnings
warnings.filterwarnings("ignore")

In [17]:
# Load the two source tables from CSV files in the working directory.
movies = pd.read_csv('movies.csv')
credits = pd.read_csv('credits.csv')

# Left join on the shared 'title' column: every row of `movies` is kept.
# NOTE(review): if a title occurs more than once in either file, this join
# produces extra rows for it — confirm titles are unique or join on an id.
df_raw = pd.merge(movies, credits, how='left', on='title')

# 1. Data Cleaning and Feature Engineering

In [18]:
# Work on an independent (deep) copy so the merged frame `df_raw` stays untouched.
df1 = df_raw.copy(deep=True)

In [19]:
# Keep only the columns used to build the recommender. The explicit .copy()
# makes df_movies an independent frame, so the column assignments below and in
# later cells do not raise pandas' SettingWithCopyWarning (which the global
# warnings filter at the top of the notebook would otherwise hide).
df_movies = df1[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

# Missing overviews become empty strings so the later str.split() works on every row.
df_movies['overview'] = df_movies['overview'].fillna("")

In [20]:
def get_list(obj):
    """Return the ``name`` of every entry in a JSON-encoded list of objects.

    Parameters
    ----------
    obj : str or missing value
        JSON text such as ``'[{"id": 1, "name": "Action"}]'``.

    Returns
    -------
    list
        The ``name`` values in their original order, or ``[]`` when ``obj`` is
        missing, is not valid JSON, or does not decode to a sequence of
        objects that all carry a ``name`` key.
    """
    if pd.isnull(obj):
        return []
    try:
        return [entry['name'] for entry in json.loads(obj)]
    except (ValueError, TypeError, KeyError):
        # ValueError covers json.JSONDecodeError; TypeError/KeyError cover
        # payloads with an unexpected structure. Catching only these data
        # errors lets KeyboardInterrupt and SystemExit propagate, which the
        # previous bare ``except`` swallowed.
        return []
    
def get_list3(obj, limit=3):
    """Return the first ``limit`` ``name`` values from a JSON-encoded list.

    Parameters
    ----------
    obj : str or missing value
        JSON text such as ``'[{"name": "A"}, {"name": "B"}]'``.
    limit : int, default 3
        Maximum number of names to return, counted from the start of the list.

    Returns
    -------
    list
        At most ``limit`` names in their original order, or ``[]`` when
        ``obj`` is missing, is not valid JSON, or any entry lacks a ``name``
        key.
    """
    if pd.isnull(obj):
        return []
    try:
        # Every entry is read before slicing, so a malformed entry anywhere in
        # the payload yields [] (same as the original behaviour).
        names = [entry['name'] for entry in json.loads(obj)]
    except (ValueError, TypeError, KeyError):
        # Only data errors are handled; KeyboardInterrupt etc. still propagate.
        return []
    return names[:limit]
    
def get_director(obj):
    """Return the names of all crew entries whose ``job`` is ``'Director'``.

    Parameters
    ----------
    obj : str or missing value
        JSON text holding a list of objects with ``name`` and ``job`` keys.

    Returns
    -------
    list
        Director names in their original order. An empty list is returned
        when ``obj`` is missing, is not valid JSON, or an entry lacks the
        ``job``/``name`` keys. (Previously these paths returned ``""`` while
        the success path returned a list; the result is now always a list.)
    """
    if pd.isnull(obj):
        return []
    try:
        return [member['name'] for member in json.loads(obj) if member['job'] == 'Director']
    except (ValueError, TypeError, KeyError):
        # Only data errors are handled; KeyboardInterrupt etc. still propagate.
        return []
    

In [21]:
# Parse the JSON-encoded columns into Python lists of names.
df_movies['genres'] = df_movies['genres'].apply(get_list)
df_movies['keywords'] = df_movies['keywords'].apply(get_list)
df_movies['cast'] = df_movies['cast'].apply(get_list3)      # first three names only
df_movies['crew'] = df_movies['crew'].apply(get_director)   # names whose job is 'Director'

# Rebind instead of inplace=True: rename() returns a new frame.
df_movies = df_movies.rename(columns={'crew': 'director'})

# Split each overview into whitespace-separated tokens.
df_movies['overview'] = df_movies['overview'].apply(lambda x: x.split())

# Remove Space
# Multi-word names are collapsed into one token ("Sci Fi" -> "SciFi") so they
# stay a single word once the tags are joined with spaces. The overview tokens
# come from str.split() and therefore never contain spaces, so they need no pass.
for _col in ['genres', 'keywords', 'cast', 'director']:
    df_movies[_col] = df_movies[_col].apply(lambda names: [n.replace(" ", "") for n in names])

In [22]:
# Concatenate the per-movie token lists (element-wise list addition) into one
# combined list of tags.
df_movies['tags'] = (
    df_movies['overview']
    + df_movies['genres']
    + df_movies['keywords']
    + df_movies['cast']
    + df_movies['director']
)

# .copy() gives new_df its own data, so assigning to its 'tags' column below
# does not raise the SettingWithCopyWarning that the global filter would hide.
new_df = df_movies[['movie_id', 'title', 'tags']].copy()

# Turn each token list into a single lower-cased, space-separated string.
new_df['tags'] = new_df['tags'].apply(lambda tokens: " ".join(tokens).lower())

# 2. Vectorization

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer

In [24]:
# Bag-of-words vectoriser: English stop words are dropped and the vocabulary
# is limited to 5000 features.
cv = CountVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary from the tag strings and convert the resulting matrix
# to a dense array with one row per movie.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [25]:
ps = PorterStemmer()

def steam(text):
    """Stem every whitespace-separated word of ``text`` and re-join with spaces.

    Parameters
    ----------
    text : str
        Text to process; it is split on whitespace.

    Returns
    -------
    str
        The stemmed words, in their original order, joined by single spaces.
    """
    # NOTE(review): no cell in the visible notebook calls this function, and it
    # is defined after `vectors` has already been computed — confirm whether
    # the tags were meant to be stemmed before vectorisation.
    return " ".join(ps.stem(word) for word in text.split())

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `vectors`; entry [i, j]
# compares movie i with movie j.
similarity = cosine_similarity(X=vectors)

In [27]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up in ``new_df['title']``. Matching ignores case and
        surrounding whitespace. (The earlier ``movie.title()`` lookup could
        not match titles containing lowercase words or apostrophes, e.g.
        "of" or "World's".)
    top_n : int, default 5
        Number of recommendations to print.

    Returns
    -------
    None
        Titles are printed, one per line, most similar first.

    Raises
    ------
    IndexError
        If no title in ``new_df`` matches ``movie``.
    """
    wanted = movie.strip().casefold()
    titles = new_df['title'].astype(str).str.strip().str.casefold()

    # Use the positional row number rather than the index label, because
    # `similarity` and `.iloc` below are both addressed by position.
    matches = np.flatnonzero((titles == wanted).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    movie_pos = int(matches[0])

    scores = similarity[movie_pos]

    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # another row with an equal score could otherwise push it into the results.
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != movie_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    for pos in ranked[:top_n]:
        print(new_df.iloc[pos]['title'])

In [28]:
import pickle

# Persist the movie table and the similarity matrix. The `with` blocks close
# each file as soon as the dump finishes, so the data is flushed to disk and
# no file handle is left open in the kernel.
with open('new_df.pkl', 'wb') as new_df_file:
    pickle.dump(new_df, new_df_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)