In [8]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

print(movies.shape)
print(credits.shape)

(4803, 20)
(4803, 4)


In [16]:
movies = movies.merge(credits, on='title')
movies.head()

Unnamed: 0,movie_id_x,title,overview,genres,keywords,cast_x,crew_x,movie_id_y,cast_y,crew_y
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [11]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [12]:
movies.dropna(inplace=True)

In [13]:
import ast

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name'])
    return L

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert)

In [14]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

movies['crew'] = movies['crew'].apply(fetch_director)

In [15]:
movies['cast'] = movies['cast'].apply(lambda x: x[:3])

In [18]:
movies = movies[['movie_id_x','title','overview','genres','keywords','cast_x','crew_x']]
movies.rename(columns={
    'movie_id_x': 'movie_id',
    'cast_x': 'cast',
    'crew_x': 'director'
}, inplace=True)

movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,director
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [23]:
movies['overview'] = movies['overview'].astype(str)
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [24]:
def remove_space(L):
    return [i.replace(" ", "") for i in L]

movies['tags'] = movies['tags'].apply(remove_space)

In [25]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

vectors = tfidf.fit_transform(new_df['tags']).toarray()

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

In [31]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    
    movies_list = sorted(
        list(enumerate(distances)),
        reverse=True,
        key=lambda x: x[1]
    )[1:11]   
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [34]:
full_movies = pd.read_csv("tmdb_5000_movies.csv")

# Merged rating into new_df using title
new_df = new_df.merge(full_movies[['title','vote_average']], on='title')

new_df.rename(columns={'vote_average':'rating'}, inplace=True)

new_df.head()

Unnamed: 0,movie_id,title,tags,rating
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",7.2
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",6.9
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,6.3
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,7.6
4,49529,John Carter,"John Carter is a war-weary, former military ca...",6.1


In [35]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    
    movies_list = sorted(
        list(enumerate(distances)),
        reverse=True,
        key=lambda x: x[1]
    )[1:21]   # Taken top 20 first
    
    recommendations = []
    
    for i in movies_list:
        idx = i[0]
        score = i[1]
        rating = new_df.iloc[idx].rating
        
        hybrid_score = score * 0.7 + (rating / 10) * 0.3
        
        recommendations.append((idx, hybrid_score))
    
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:10]
    
    for i in recommendations:
        print(new_df.iloc[i[0]].title)

In [38]:
recommend("Avatar")

Salvador
Star Trek Into Darkness
The Book of Life
Edge of Tomorrow
Ender's Game
Jarhead
Step Up 2: The Streets
Titan A.E.
Battle: Los Angeles
The Rainmaker
