# Movie Recommender

### Imports

In [63]:
import numpy as np
import pandas as pd
import ast
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")

### Loading and Merging the datasets

In [64]:
pd.set_option('display.max_columns', None)
credits = pd.read_csv('credits.csv')
movies = pd.read_csv('movies.csv')
movies = movies.merge(credits, on = 'title')
df = movies[['movie_id', 'title', 'genres', 'overview', 'keywords', 'crew', 'cast', 'vote_average']]
df.dropna(inplace=True)
df.head()

Unnamed: 0,movie_id,title,genres,overview,keywords,crew,cast,vote_average
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...",7.2
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...",6.9
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...",6.3
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...",7.6
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...",6.1


### Preprocessing and Feature Engineering

In [65]:
# Functions to extract required information about each movie
def convert(entry):
    return [x['name'] for x in ast.literal_eval(entry)]

def convert_2(entry):
    result, counter = [], 0
    for x in ast.literal_eval(entry):
        if counter != 4:
            counter += 1
            result.append(x['name'])
        else:
            break
    return result

def director(entry):
    return [x['name'] for x in ast.literal_eval(entry) if x['job'] == 'Director']

# Extracting the features of each movie
df['genres'] = df['genres'].apply(convert)
df['keywords'] = df['keywords'].apply(convert)
df['cast'] = df['cast'].apply(convert_2)
df['crew'] = df['crew'].apply(director)
df['overview'] = df['overview'].apply(lambda text: text.split())

# Consolidating the features into a usable form
columns = ['genres', 'keywords', 'cast', 'crew']
for column in columns:
    df[column] = df[column].apply(lambda entry: [x.replace(" ", "") for x in entry])

df['info'] = df['overview'] + 2*df['genres'] + df['keywords'] + df['cast'] + 2*df['crew']
df['info'] = df['info'].apply(lambda l: " ".join(l))
df['info'] = df['info'].apply(lambda string: string.lower())

# Utilized PorterStemmer to find the 'stem' of each word for uniformity
ps = PorterStemmer()
def root(string):
    new_string = [ps.stem(x) for x in string.split()]
    return " ".join(new_string)

df['info'] = df['info'].apply(root)

df.head()

Unnamed: 0,movie_id,title,genres,overview,keywords,crew,cast,vote_average,info
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...",[JamesCameron],"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",7.2,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drugabuse, exoticisland, eastindiatrad...",[GoreVerbinski],"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",6.9,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[spy, basedonnovel, secretagent, sequel, mi6, ...",[SamMendes],"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",6.3,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...","[dccomics, crimefighter, terrorist, secretiden...",[ChristopherNolan],"[ChristianBale, MichaelCaine, GaryOldman, Anne...",7.6,follow the death of district attorney harvey d...
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[John, Carter, is, a, war-weary,, former, mili...","[basedonnovel, mars, medallion, spacetravel, p...",[AndrewStanton],"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",6.1,"john carter is a war-weary, former militari ca..."


### Creating the model

In [66]:
# Employed a CountVectorizer model in order to indicate common words between movies
model = CountVectorizer(max_features=5000, stop_words='english')
count_vectors = model.fit_transform(df['info']).toarray()

# Cosine similarity to compute the similarity between two movies
similarities = cosine_similarity(count_vectors)

### View the results!

In [67]:
def recommend(movie):

    index = df[df['title'] == movie].index[0]
    distances = similarities[index]

    # Find the most similar 25 movies
    recommended_movies = sorted(list(enumerate(distances)), reverse = True, key = lambda s: s[1])[1:26]

    # Find the 10 highest rated movies from the 25 'most similar' movies
    recommended_movies = [(df.iloc[suggestion[0]]['title'], df.iloc[suggestion[0]]['vote_average']) for suggestion in recommended_movies]
    final_recommendations = sorted(recommended_movies, reverse=True, key=lambda x: x[1])[1:11]
    return pd.DataFrame(final_recommendations, columns = ['Movie', 'Rating']).set_index('Movie')

In [68]:
recommend('Batman')

Unnamed: 0_level_0,Rating
Movie,Unnamed: 1_level_1
The Dark Knight Rises,7.6
Batman Begins,7.5
Batman,7.0
Superman,6.9
Highlander,6.8
Spider-Man 2,6.7
Batman Returns,6.6
Curse of the Golden Flower,6.6
The Amazing Spider-Man,6.5
Hancock,6.2
