In [2]:
import pandas as pd
import ast
import pickle

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movies = movies.merge(credits,on='title')
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]
movies.dropna(inplace=True)

In [5]:
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 

In [6]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert)
movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

In [7]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L 

In [8]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [9]:
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [10]:
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [11]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [12]:
new_df = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

In [13]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

In [14]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [15]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [16]:
new_df['tags'] = new_df['tags'].apply(stem)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [19]:
def get_total_tags(liked_movies):
    total_tags = ""
    for i in liked_movies:
        tag = new_df.loc[new_df['title']==i,'tags'].iloc[0]
        list_of_str = [total_tags,tag]
        total_tags = "".join(list_of_str)
    return total_tags

In [20]:
from numpy.linalg import norm
from numpy import vdot

def recommendLiked(liked_movies):
    
    tags = get_total_tags(liked_movies)
    
    similarity = []
    combined_tags = [tags]    
    
    combined_tags_vector = cv.transform(combined_tags).toarray()
    
    for i in vectors:
        cosine_similarity = (vdot([i],combined_tags_vector) / (norm(i) * norm(combined_tags_vector)))
        similarity.append(cosine_similarity)
    
    recommended_movies_list_index = sorted(list(enumerate(similarity)), reverse = True, key = lambda x: x[1])[1:20]
    
    for i in recommended_movies_list_index:
        print(new_df.iloc[i[0]].title)


In [21]:
liked_movies = ['Avatar','Spider-Man']
recommendLiked(liked_movies)

Spider-Man
Spider-Man 3
Spider-Man 2
Small Soldiers
Jupiter Ascending
Aliens vs Predator: Requiem
The Amazing Spider-Man 2
Falcon Rising
U.F.O.
Krull
The Helix... Loaded
Guardians of the Galaxy
Titan A.E.
Independence Day
Independence Daysaster
Predators
Aliens
Battle: Los Angeles
The Fifth Element


In [22]:
pickle.dump(new_df,open('movie_list.pkl','wb'))