In [1]:
# Importing libraries
import pandas as pd 
import os 
import numpy as np
import requests
import matplotlib.pyplot as plt


#Load .env
from dotenv import load_dotenv

In [2]:
# Loading access keys 
load_dotenv()

API_KEY = os.getenv("API_KEY")

accept = os.getenv("accept")
Authorization = os.getenv("Authorization")

In [3]:
# Loading dataset from api
path1 = "../Resources/tmdb_movies_list.csv"

metadata = pd.read_csv(path1)

In [4]:
metadata

Unnamed: 0,id,imdb_id,Title,Genre,Plot,Popularity,movie_rating_avg,Poster
0,863,tt0120363,Toy Story 2,"[16, 35, 10751]","Andy heads off to Cowboy Camp, leaving his toy...",111.148,7.591,https://m.media-amazon.com/images/M/MV5BMWM5ZD...
1,10193,tt0435761,Toy Story 3,"[16, 10751, 35]","Woody, Buzz, and the rest of Andy's toys haven...",76.340,7.795,https://m.media-amazon.com/images/M/MV5BMTgxOT...
2,9487,tt0120623,A Bug's Life,"[12, 16, 35, 14, 10751]","On behalf of ""oppressed bugs everywhere,"" an i...",70.368,6.969,https://m.media-amazon.com/images/M/MV5BNThmZG...
3,8587,tt0110357,The Lion King,"[10751, 16, 18]",A young lion prince is cast out of his pride b...,102.851,8.257,https://m.media-amazon.com/images/M/MV5BYTYxNG...
4,585,tt0198781,"Monsters, Inc.","[16, 35, 10751]",Lovable Sulley and his wisecracking sidekick M...,128.295,7.842,https://m.media-amazon.com/images/M/MV5BMTY1NT...
...,...,...,...,...,...,...,...,...
441,11861,tt0113347,How to Make an American Quilt,18,Soon-to-be-wed graduate student Finn Dodd deve...,19.315,6.400,https://m.media-amazon.com/images/M/MV5BNGYwZm...
442,8391,tt0114916,When Night Is Falling,18,A prudish woman working on tenure as a literac...,12.404,6.239,https://m.media-amazon.com/images/M/MV5BNGUxMD...
443,11448,tt0113819,Mighty Aphrodite,35,"When Lenny and his wife, Amanda, adopt a baby,...",15.684,6.722,https://m.media-amazon.com/images/M/MV5BMTZmNj...
444,49133,tt0110299,Lamerica,18,"Fiore, an Italian conman, arrives in post Comm...",8.535,7.200,https://m.media-amazon.com/images/M/MV5BNjlmZm...


In [5]:
# Combing genre and plot together
metadata['genre_plot'] = metadata['Genre'].apply(lambda x: x.strip("[]"))
metadata["genre_plot"] = metadata["genre_plot"] + " " + metadata["Plot"]
metadata.head()


Unnamed: 0,id,imdb_id,Title,Genre,Plot,Popularity,movie_rating_avg,Poster,genre_plot
0,863,tt0120363,Toy Story 2,"[16, 35, 10751]","Andy heads off to Cowboy Camp, leaving his toy...",111.148,7.591,https://m.media-amazon.com/images/M/MV5BMWM5ZD...,"16, 35, 10751 Andy heads off to Cowboy Camp, l..."
1,10193,tt0435761,Toy Story 3,"[16, 10751, 35]","Woody, Buzz, and the rest of Andy's toys haven...",76.34,7.795,https://m.media-amazon.com/images/M/MV5BMTgxOT...,"16, 10751, 35 Woody, Buzz, and the rest of And..."
2,9487,tt0120623,A Bug's Life,"[12, 16, 35, 14, 10751]","On behalf of ""oppressed bugs everywhere,"" an i...",70.368,6.969,https://m.media-amazon.com/images/M/MV5BNThmZG...,"12, 16, 35, 14, 10751 On behalf of ""oppressed ..."
3,8587,tt0110357,The Lion King,"[10751, 16, 18]",A young lion prince is cast out of his pride b...,102.851,8.257,https://m.media-amazon.com/images/M/MV5BYTYxNG...,"10751, 16, 18 A young lion prince is cast out ..."
4,585,tt0198781,"Monsters, Inc.","[16, 35, 10751]",Lovable Sulley and his wisecracking sidekick M...,128.295,7.842,https://m.media-amazon.com/images/M/MV5BMTY1NT...,"16, 35, 10751 Lovable Sulley and his wisecrack..."


# Creating functions to get recommendations 

In [6]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Function to return a tfidf matrix of target column
def matrix_creation(element):
    
    #Replace NaN with an empty string
    metadata[element] = metadata[element].fillna('')

    #Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(metadata[element])
    
    return tfidf_matrix 

In [7]:
# Function to get api recommendations 
def api_rec(title):
    #Set up api request 
    
    # Collect target movie ID
    movie_ID = metadata.loc[metadata["Title"] == title]["id"].values[0]
    
    # Set up API CALL
    url = "https://api.themoviedb.org/3/movie/" + str(movie_ID) + "/recommendations?language=en-US&page=1"
    
    headers = {
        "accept": accept,
        "Authorization": Authorization
    }
    
    #API request for recommendation
    response = requests.get(url, headers=headers).json()


    #Set up list to store api recommendation
    movie_titles = []
    for j in range(0,10):
        movie_titles.append(response["results"][j]['title'])
    return movie_titles

In [8]:
# Testing fit_trafnsform
tfidf_matrix = tfidf.fit_transform(metadata["Plot"])

tfidf_matrix

<446x5043 sparse matrix of type '<class 'numpy.float64'>'
	with 10692 stored elements in Compressed Sparse Row format>

In [9]:
# Testing matrix creation function output
matrix_creation("Plot")

<446x5043 sparse matrix of type '<class 'numpy.float64'>'
	with 10692 stored elements in Compressed Sparse Row format>

In [10]:
# Creating indices list for movies 
indices = pd.Series(metadata.index, index = metadata["Title"])

indices.head()

Title
Toy Story 2       0
Toy Story 3       1
A Bug's Life      2
The Lion King     3
Monsters, Inc.    4
dtype: int64

In [11]:
# Importing linear_kernel from sklearn
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import ndcg_score

# Creating cosine similarity, to compare tfid matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix )

# Recommendation using cosine similarity 

In [12]:
def get_recommendations(title, element, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Create tfidf_matrix
    tfidf_matrix = matrix_creation(element)
    
    # Compute the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Set up dataframe
    test = pd.DataFrame({"movie_id":metadata["id"].iloc[movie_indices], "Title":metadata['Title'].iloc[movie_indices], "sim_score": sim_scores, "Genres":metadata['Genre'].iloc[movie_indices], "Plot":metadata['Plot'].iloc[movie_indices]}).reset_index(drop=True)
    
    return test

# Testing recommendation function

In [13]:
# Testing recommendation function
get_recommendations('Toy Story', "Plot")

Unnamed: 0,movie_id,Title,sim_score,Genres,Plot
0,10193,Toy Story 3,"(1, 0.4428023497950352)","[16, 10751, 35]","Woody, Buzz, and the rest of Andy's toys haven..."
1,863,Toy Story 2,"(0, 0.4076509308833214)","[16, 35, 10751]","Andy heads off to Cowboy Camp, leaving his toy..."
2,350,The Devil Wears Prada,"(188, 0.08921505701538683)","[18, 35]",Andy moves to New York to work in the fashion ...
3,7303,Maid in Manhattan,"(172, 0.074097281618203)","[35, 18, 10749]",Marisa Ventura is a struggling single mom who ...
4,278,The Shawshank Redemption,"(262, 0.06198884972606259)","[18, 80]",Framed in the 1940s for the double murder of h...
5,5,Four Rooms,"(419, 0.05195801249430182)",35,It's Ted the Bellhop's first night on the job....
6,393559,My Life as a Zucchini,"(395, 0.04125615465200745)","[16, 35, 18, 10751, 10749]","After his mother’s death, Zucchini is befriend..."
7,18093,Northanger Abbey,"(238, 0.03854185905287105)","[10749, 18, 10770]",A young woman's penchant for sensational Gothi...
8,8467,Dumb and Dumber,"(165, 0.03786497586471528)",[35],Lloyd and Harry are two men whose stupidity is...
9,11674,101 Dalmatians,"(285, 0.0366108521851194)","[10751, 35]","An evil, high-fashion designer plots to steal ..."


# Checking API Call

In [14]:
# Define url with test movie id 862
url = "https://api.themoviedb.org/3/movie/" + str(862) + "/recommendations?language=en-US&page=1"

# Headers for access keys
headers = {
"accept": accept,
"Authorization": Authorization
}

#API request for recommendation
response = requests.get(url, headers=headers).json()

In [15]:
response

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/91qfG6VaxRsCh1SbIi1w2vxknsL.jpg',
   'id': 863,
   'title': 'Toy Story 2',
   'original_language': 'en',
   'original_title': 'Toy Story 2',
   'overview': "Andy heads off to Cowboy Camp, leaving his toys to their own devices. Things shift into high gear when an obsessive toy collector named Al McWhiggen, owner of Al's Toy Barn kidnaps Woody. Andy's toys mount a daring rescue mission, Buzz Lightyear meets his match and Woody has to decide where he and his heart truly belong.",
   'poster_path': '/2MFIhZAW0CVlEQrFyqwa4U6zqJP.jpg',
   'media_type': 'movie',
   'genre_ids': [16, 35, 10751],
   'popularity': 111.148,
   'release_date': '1999-10-30',
   'video': False,
   'vote_average': 7.591,
   'vote_count': 13228},
  {'adult': False,
   'backdrop_path': '/uAfhsySkr1UzQg1zdg3dZQRz9Fd.jpg',
   'id': 10193,
   'title': 'Toy Story 3',
   'original_language': 'en',
   'original_title': 'Toy Story 3',
   'overview': "Woody, Buzz,

# Re-writing Cosine recommendation function to also display accuracy at K

In [16]:
# Function to get list of recommendation from cosine similarity scores
def get_accuracy(title, element, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    print(title)
    idx = indices[title]
    
    # Set up TFIDF matrix
    tfidf_matrix = matrix_creation(element)
    
    # Compute the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
        
    #Get API movie recommendation for target movie 
    movie_titles = api_rec(title)
    
    #Set up dataframe to hold recommendations
    Recommendation_list = pd.DataFrame({"movie_id":metadata["id"].iloc[movie_indices], 
                          "Title":metadata['Title'].iloc[movie_indices],
                          "Genres":metadata['Genre'].iloc[movie_indices],
                          "API_Rec_Titles":movie_titles})
    
    #Calculate accuracy
    movie_in_api = [i for i in list(Recommendation_list["Title"]) if i in list(Recommendation_list["API_Rec_Titles"])]
    accuracy_of_rec = len(movie_in_api)/10
    
   
    print("Accuracy at 10: " + str(accuracy_of_rec))
    print("----------------")
    return Recommendation_list

# COSINE Recommendation using Plot

#### Calculation of overall precision @k for plot column 

0.2 + 0.4 + 0.1 + 0.0 + 0.1 + 0.0 + 0.1 + 0.1 +0.7 + 0.0 = 1.7/10

Overall accuracy at k(10) = 0.17 Accuracy

In [17]:
# Get accuracy on cosine on plot column
get_accuracy('Toy Story', "Plot")
get_accuracy('Jumanji', "Plot")
get_accuracy('Grumpier Old Men', "Plot")
get_accuracy('Waiting to Exhale', "Plot")
get_accuracy('Father of the Bride Part II', "Plot")
get_accuracy('Heat', "Plot")
get_accuracy('Tom and Huck', "Plot")
get_accuracy('Sudden Death', "Plot")
get_accuracy('GoldenEye', "Plot")
get_accuracy('The Shawshank Redemption', "Plot")

Toy Story
Accuracy at 10: 0.2
----------------
Jumanji
Accuracy at 10: 0.4
----------------
Grumpier Old Men
Accuracy at 10: 0.1
----------------
Waiting to Exhale
Accuracy at 10: 0.0
----------------
Father of the Bride Part II
Accuracy at 10: 0.1
----------------
Heat
Accuracy at 10: 0.0
----------------
Tom and Huck
Accuracy at 10: 0.1
----------------
Sudden Death
Accuracy at 10: 0.1
----------------
GoldenEye
Accuracy at 10: 0.7
----------------
The Shawshank Redemption
Accuracy at 10: 0.0
----------------


Unnamed: 0,movie_id,Title,Genres,API_Rec_Titles
203,9972,Lock Up,"[28, 80]",The Godfather
186,8077,Alien³,"[878, 28, 27]",Schindler's List
1,10193,Toy Story 3,"[16, 10751, 35]",The Dark Knight
136,293863,The Age of Adaline,"[10749, 14, 18]",Pulp Fiction
367,10144,The Little Mermaid,"[16, 10751, 14]",The Godfather Part II
123,4347,Atonement,"[18, 10749]",Fight Club
105,36593,Naked Gun 33⅓: The Final Insult,"[35, 80]",Forrest Gump
287,6075,Carlito's Way,"[80, 18, 10749, 53]",The Green Mile
329,341013,Atomic Blonde,"[28, 53]",The Silence of the Lambs
188,350,The Devil Wears Prada,"[18, 35]",Spirited Away


# Cosine Recommendation model using genre_plot

#### Calculation of overall accuracy @k for genre_plot column 

0.2 + 0.4 + 0.1 + 0.0 + 0.1 + 0.0 + 0.1 + 0.1 +0.7 + 0.0 = 1.7/10

Overall accuracy at k(10) = 0.17 accuracy

In [18]:
# Get accuracy on cosine on genre + plot column
get_accuracy('Toy Story', "genre_plot")
get_accuracy('Jumanji', "genre_plot")
get_accuracy('Grumpier Old Men', "genre_plot")
get_accuracy('Waiting to Exhale', "genre_plot")
get_accuracy('Father of the Bride Part II', "genre_plot")
get_accuracy('Heat', "genre_plot")
get_accuracy('Tom and Huck', "genre_plot")
get_accuracy('Sudden Death', "genre_plot")
get_accuracy('GoldenEye', "genre_plot")
get_accuracy('The Shawshank Redemption', "genre_plot")

Toy Story
Accuracy at 10: 0.2
----------------
Jumanji
Accuracy at 10: 0.4
----------------
Grumpier Old Men
Accuracy at 10: 0.1
----------------
Waiting to Exhale
Accuracy at 10: 0.0
----------------
Father of the Bride Part II
Accuracy at 10: 0.1
----------------
Heat
Accuracy at 10: 0.0
----------------
Tom and Huck
Accuracy at 10: 0.1
----------------
Sudden Death
Accuracy at 10: 0.1
----------------
GoldenEye
Accuracy at 10: 0.7
----------------
The Shawshank Redemption
Accuracy at 10: 0.0
----------------


Unnamed: 0,movie_id,Title,Genres,API_Rec_Titles
203,9972,Lock Up,"[28, 80]",The Godfather
186,8077,Alien³,"[878, 28, 27]",Schindler's List
136,293863,The Age of Adaline,"[10749, 14, 18]",The Dark Knight
1,10193,Toy Story 3,"[16, 10751, 35]",Pulp Fiction
287,6075,Carlito's Way,"[80, 18, 10749, 53]",The Godfather Part II
105,36593,Naked Gun 33⅓: The Final Insult,"[35, 80]",Fight Club
123,4347,Atonement,"[18, 10749]",Forrest Gump
367,10144,The Little Mermaid,"[16, 10751, 14]",The Green Mile
188,350,The Devil Wears Prada,"[18, 35]",The Silence of the Lambs
202,819,Sleepers,"[80, 18, 53]",Spirited Away


# Cosine Recommendation using Genre

#### Calculation of overall precision @k for Genre column 

0.0 + 0.2 + 0.0 + 0.1 + 0.0 + 0.2 + 0.1 + 0.1 +0.7 + 0.0 = 1.4/10

Overall precision at k(10) = 0.14 precision

In [19]:
# Get accuracy on cosine on genre column
get_accuracy('Toy Story', "Genre")
get_accuracy('Jumanji', "Genre")
get_accuracy('Grumpier Old Men', "Genre")
get_accuracy('Waiting to Exhale', "Genre")
get_accuracy('Father of the Bride Part II', "Genre")
get_accuracy('Heat', "Genre")
get_accuracy('Tom and Huck', "Genre")
get_accuracy('Sudden Death', "Genre")
get_accuracy('GoldenEye', "Genre")
get_accuracy('The Shawshank Redemption', "Genre")

Toy Story
Accuracy at 10: 0.0
----------------
Jumanji
Accuracy at 10: 0.2
----------------
Grumpier Old Men
Accuracy at 10: 0.0
----------------
Waiting to Exhale
Accuracy at 10: 0.1
----------------
Father of the Bride Part II
Accuracy at 10: 0.0
----------------
Heat
Accuracy at 10: 0.2
----------------
Tom and Huck
Accuracy at 10: 0.1
----------------
Sudden Death
Accuracy at 10: 0.1
----------------
GoldenEye
Accuracy at 10: 0.7
----------------
The Shawshank Redemption
Accuracy at 10: 0.0
----------------


Unnamed: 0,movie_id,Title,Genres,API_Rec_Titles
53,769,GoodFellas,"[18, 80]",The Godfather
142,311,Once Upon a Time in America,"[18, 80]",Schindler's List
262,278,The Shawshank Redemption,"[18, 80]",The Dark Knight
265,9344,Kids,"[18, 80]",Pulp Fiction
266,1646,Freedom Writers,"[80, 18]",The Godfather Part II
327,4133,Blow,"[80, 18]",Fight Club
355,359156,Don't Be Bad,"[80, 18]",Forrest Gump
122,106646,The Wolf of Wall Street,"[80, 18, 35]",The Green Mile
242,8321,In Bruges,"[35, 18, 80]",The Silence of the Lambs
420,9273,Ace Ventura: When Nature Calls,80,Spirited Away


# Recommendation using NearestNeighbors

In [20]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

In [21]:
# Define X
X = metadata.Plot
X_tfidf = tfidf.fit_transform(X)

In [22]:
neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
neigh.fit(X_tfidf)

In [23]:
# Kneigbors Graphs to array
a = neigh.kneighbors_graph(X_tfidf)
a.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [24]:
# Returns the top 10 movies for tfidf matrix row 0 which is Toy Story

for i in range(0, len(neigh.kneighbors(X_tfidf[0], 10, return_distance=False)[0])):
    
    print(metadata.iloc[neigh.kneighbors(X_tfidf[0], 10, return_distance=False)[0][i]].Title)

Toy Story 2
Toy Story
Toy Story 3
The Devil Wears Prada
Zero Dark Thirty
Heat
The Shawshank Redemption
The Untouchables
In Bruges
The City of Lost Children


# Nearest Neighbor functions

In [25]:
# Function looking at using genre values to calculcate nearest neigbors
def nearest_neigh_model_genre(title, element, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    print(title)
    idx = indices[title]
    
    # Set up tfidf on the Genre column 
    X = metadata.Genre
    X_tfidf = tfidf.fit_transform(X)
    
    # Fitting to Nearest Neighbors
    neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
    neigh.fit(X_tfidf)
    
    # Creating a list of movies closest to the target movie
    movie_indices = []
    for i in neigh.kneighbors(X_tfidf[idx], 11, return_distance=False)[0][1:]:
        movie_indices.append(i)
        
    ##### API SET UP #####
    
    #Get API movie recommendation for target movie 
    movie_titles = api_rec(title)
    
    # Data frame creation
    Recommendation_list = pd.DataFrame({"movie_id":metadata["id"].iloc[movie_indices], 
                          "Title":metadata['Title'].iloc[movie_indices],
                          "Genres":metadata['Genre'].iloc[movie_indices],
                          "API_Rec_Titles":movie_titles})
    
    # Calculating the amount of movies recommended that are in the api recommendation list
    movie_in_api = [i for i in list(Recommendation_list["Title"]) if i in list(Recommendation_list["API_Rec_Titles"])]
    accuracy_of_rec = len(movie_in_api)/10
    
    print("Accuracy at 10: " + str(accuracy_of_rec))
    print("-------------------")

    return Recommendation_list

In [26]:
# Nearest Neighbor function but using Plot for the tfidf matrix and model fitting
def nearest_neigh_model_plot(title, element, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    print(title)
    idx = indices[title]
    
    # Set up tfidf on the plot column 
    X = metadata.Plot
    X_tfidf = tfidf.fit_transform(X)
    
    # Set up nearest Neighbors
    neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
    neigh.fit(X_tfidf)
    
    # Collect movie indices
    movie_indices = []
    for i in neigh.kneighbors(X_tfidf[idx], 11, return_distance=False)[0][1:]:
        movie_indices.append(i)
    
    
    ##### API SET UP #####
    
    #Get API movie recommendation for target movie 
    movie_titles = api_rec(title)
    
    # Data frame creation
    Recommendation_list = pd.DataFrame({"movie_id":metadata["id"].iloc[movie_indices], 
                          "Title":metadata['Title'].iloc[movie_indices],
                          "Genres":metadata['Genre'].iloc[movie_indices],
                          "API_Rec_Titles":movie_titles})
    
    # Calculate accuracy at k
    movie_in_api = [i for i in list(Recommendation_list["Title"]) if i in list(Recommendation_list["API_Rec_Titles"])]
    accuracy_of_rec = len(movie_in_api)/10
     
    print("Accuracy at 10: " + str(accuracy_of_rec))
    print("-------------------")

    return Recommendation_list

In [27]:
# Nearest Neighbor function but using genre and plot for the tfidf matrix and model fitting

def nearest_neigh_model_both(title, element, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    print(title)
    idx = indices[title]
    
    tfidf_matrix = matrix_creation(element)
    
    # Compute the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Take the cosine similarity scores of all the movies 
    second_elements = [x[1] for x in sim_scores]
    
    # Set up tfidf on the plot column 
    X = metadata.genre_plot
    X_tfidf = tfidf.fit_transform(X)
    
    # Set up Nearest Neighbors
    neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
    neigh.fit(X_tfidf)
    
    # Collect movie indices
    movie_indices = []
    for i in neigh.kneighbors(X_tfidf[idx], 11, return_distance=False)[0][1:]:
        movie_indices.append(i)
        
    ##### API SET UP #####
    
    #Get API movie recommendation for target movie 
    movie_titles = api_rec(title)
    
    # Data frame creation
    Recommendation_list = pd.DataFrame({"movie_id":metadata["id"].iloc[movie_indices], 
                          "Title":metadata['Title'].iloc[movie_indices],
                          "Genres":metadata['Genre'].iloc[movie_indices],
                          "API_Rec_Titles":movie_titles})
    
    #Calculate accuracy at k
    movie_in_api = [i for i in list(Recommendation_list["Title"]) if i in list(Recommendation_list["API_Rec_Titles"])]
    accuracy_of_rec = len(movie_in_api)/10
    
    
    print("Accuracy at 10: " + str(accuracy_of_rec))
    print("-------------------")
    return Recommendation_list

# Nearest Neighbors recommendations

In [28]:
# Getting accuracy of nearest neighbors genre columns
nearest_neigh_model_genre('Jumanji', "Genre")
nearest_neigh_model_genre('Grumpier Old Men', "Genre")
nearest_neigh_model_genre('Waiting to Exhale', "Genre")
nearest_neigh_model_genre('Father of the Bride Part II', "Genre")
nearest_neigh_model_genre('Tom and Huck', "Genre")
nearest_neigh_model_genre('Sudden Death', "Genre")
nearest_neigh_model_genre('GoldenEye', "Genre")
nearest_neigh_model_genre('The Shawshank Redemption', "Genre")
nearest_neigh_model_genre('Toy Story', "Genre")

Jumanji
Accuracy at 10: 0.2
-------------------
Grumpier Old Men
Accuracy at 10: 0.0
-------------------
Waiting to Exhale
Accuracy at 10: 0.1
-------------------
Father of the Bride Part II
Accuracy at 10: 0.0
-------------------
Tom and Huck
Accuracy at 10: 0.1
-------------------
Sudden Death
Accuracy at 10: 0.1
-------------------
GoldenEye
Accuracy at 10: 0.7
-------------------
The Shawshank Redemption
Accuracy at 10: 0.0
-------------------
Toy Story
Accuracy at 10: 0.0
-------------------


Unnamed: 0,movie_id,Title,Genres,API_Rec_Titles
16,862,Toy Story,"[16, 12, 10751, 35]",Toy Story 2
316,46195,Rio,"[16, 12, 35, 10751]",Toy Story 3
408,150540,Inside Out,"[16, 10751, 12, 18, 35]",A Bug's Life
317,4978,An American Tail,"[35, 12, 18, 10751, 16]",The Lion King
280,21032,Balto,"[10751, 16, 12]","Monsters, Inc."
366,10674,Mulan,"[16, 10751, 12]",The Incredibles
116,34942,Balto III: Wings of Change,"[10751, 12, 16]",Up
118,12144,The Land Before Time,"[10751, 16, 12]",Finding Nemo
112,25913,Balto II: Wolf Quest,"[10751, 16, 12]",Se7en
47,177572,Big Hero 6,"[12, 10751, 16, 28, 35]",WALL·E


In [29]:
# Printing overall precision for genre
print("Accuracy at 10 overall" + " " + str((0.0 + 0.2 + 0.0 + 0.1 + 0.0 + 0.1 + 0.1 + 0.7 + 0.0)/10))

Accuracy at 10 overall 0.12


In [30]:
# Getting nearest neighbor for plot column
nearest_neigh_model_plot('Jumanji', "Plot")
nearest_neigh_model_plot('Grumpier Old Men', "Plot")
nearest_neigh_model_plot('Waiting to Exhale', "Plot")
nearest_neigh_model_plot('Father of the Bride Part II', "Plot")
nearest_neigh_model_plot('Tom and Huck', "Plot")
nearest_neigh_model_plot('Sudden Death', "Plot")
nearest_neigh_model_plot('GoldenEye', "Plot")
nearest_neigh_model_plot('The Shawshank Redemption', "Plot")
nearest_neigh_model_plot('Toy Story', "Plot")


Jumanji
Accuracy at 10: 0.4
-------------------
Grumpier Old Men
Accuracy at 10: 0.1
-------------------
Waiting to Exhale
Accuracy at 10: 0.0
-------------------
Father of the Bride Part II
Accuracy at 10: 0.1
-------------------
Tom and Huck
Accuracy at 10: 0.1
-------------------
Sudden Death
Accuracy at 10: 0.1
-------------------
GoldenEye
Accuracy at 10: 0.7
-------------------
The Shawshank Redemption
Accuracy at 10: 0.0
-------------------
Toy Story
Accuracy at 10: 0.2
-------------------


Unnamed: 0,movie_id,Title,Genres,API_Rec_Titles
1,10193,Toy Story 3,"[16, 10751, 35]",Toy Story 2
0,863,Toy Story 2,"[16, 35, 10751]",Toy Story 3
188,350,The Devil Wears Prada,"[18, 35]",A Bug's Life
172,7303,Maid in Manhattan,"[35, 18, 10749]",The Lion King
262,278,The Shawshank Redemption,"[18, 80]","Monsters, Inc."
419,5,Four Rooms,35,The Incredibles
395,393559,My Life as a Zucchini,"[16, 35, 18, 10751, 10749]",Up
238,18093,Northanger Abbey,"[10749, 18, 10770]",Finding Nemo
165,8467,Dumb and Dumber,[35],Se7en
285,11674,101 Dalmatians,"[10751, 35]",WALL·E


In [31]:
# Printing overall precision for plot
print("Accuracy at 10 overall" + " " + str((0.2 + 0.4 + 0.1 + 0.0 + 0.1 + 0.1 + 0.1 + 0.7 + 0.0)/10))

Accuracy at 10 overall 0.16999999999999998


In [32]:
# getting accuracy for genre + plot column
nearest_neigh_model_both('Jumanji', "genre_plot")
nearest_neigh_model_both('Grumpier Old Men', "genre_plot")
nearest_neigh_model_both('Waiting to Exhale', "genre_plot")
nearest_neigh_model_both('Father of the Bride Part II', "genre_plot")
nearest_neigh_model_both('Tom and Huck', "genre_plot")
nearest_neigh_model_both('Sudden Death', "genre_plot")
nearest_neigh_model_both('GoldenEye', "genre_plot")
nearest_neigh_model_both('The Shawshank Redemption', "genre_plot")
nearest_neigh_model_both('Toy Story', "genre_plot")

Jumanji
Accuracy at 10: 0.4
-------------------
Grumpier Old Men
Accuracy at 10: 0.1
-------------------
Waiting to Exhale
Accuracy at 10: 0.0
-------------------
Father of the Bride Part II
Accuracy at 10: 0.1
-------------------
Tom and Huck
Accuracy at 10: 0.1
-------------------
Sudden Death
Accuracy at 10: 0.1
-------------------
GoldenEye
Accuracy at 10: 0.7
-------------------
The Shawshank Redemption
Accuracy at 10: 0.0
-------------------
Toy Story
Accuracy at 10: 0.2
-------------------


Unnamed: 0,movie_id,Title,Genres,API_Rec_Titles
1,10193,Toy Story 3,"[16, 10751, 35]",Toy Story 2
0,863,Toy Story 2,"[16, 35, 10751]",Toy Story 3
188,350,The Devil Wears Prada,"[18, 35]",A Bug's Life
172,7303,Maid in Manhattan,"[35, 18, 10749]",The Lion King
395,393559,My Life as a Zucchini,"[16, 35, 18, 10751, 10749]","Monsters, Inc."
262,278,The Shawshank Redemption,"[18, 80]",The Incredibles
366,10674,Mulan,"[16, 10751, 12]",Up
419,5,Four Rooms,35,Finding Nemo
223,118,Charlie and the Chocolate Factory,"[12, 35, 10751, 14]",Se7en
113,3170,Bambi,"[16, 18, 10751]",WALL·E


In [33]:
# Printing overall precision at 10 for genre + plot
print("Accuracy at 10 overall" + " " + str((0.2 + 0.4 + 0.1 + 0.0 + 0.1 + 0.1 + 0.1 + 0.7 + 0.0)/10))

Accuracy at 10 overall 0.16999999999999998


In [34]:
# Nearest neighbor model but trialling if splitting the data into 
# train/test affects how it performs
def nearest_neigh_model_both_train(title, element, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    print(title)
    idx = indices[title]
    
    tfidf_matrix = matrix_creation(element)
    
    # Compute the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Take the cosine similarity scores of all the movies 
    second_elements = [x[1] for x in sim_scores]
    
    y = second_elements
    
    # Set up tfidf on the plot column 
    X = metadata.genre_plot
    X_tfidf = tfidf.fit_transform(X)
    
    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.8, random_state=42)
    
    #Set up nearest neighbors
    neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
    neigh.fit(X_train)
    
    # Setting up movie indices
    movie_indices = []
    for i in neigh.kneighbors(X_tfidf[idx], 11, return_distance=False)[0][1:]:
        movie_indices.append(i)
        
    #Get API movie recommendation for target movie 
    movie_titles = api_rec(title)
    
    # Data frame creation
    Recommendation_list = pd.DataFrame({"movie_id":metadata["id"].iloc[movie_indices], 
                          "Title":metadata['Title'].iloc[movie_indices],
                          "Genres":metadata['Genre'].iloc[movie_indices],
                          "API_Rec_Titles":movie_titles})
    # Calculating 
    movie_in_api = [i for i in list(Recommendation_list["Title"]) if i in list(Recommendation_list["API_Rec_Titles"])]
    accuracy_of_rec = len(movie_in_api)/10
 
    relevance_score = len([i for i in list(Recommendation_list["Title"]) if i in Recommendation_list["API_Rec_Titles"]])
    

    print("Accuracy at 10: " + str(accuracy_of_rec))
    print("----------------")
    return Recommendation_list

In [35]:
# Getting accuracy score for subset of dataset
nearest_neigh_model_both_train('Jumanji', "genre_plot")
nearest_neigh_model_both_train('Grumpier Old Men', "genre_plot")
nearest_neigh_model_both_train('Waiting to Exhale', "genre_plot")
nearest_neigh_model_both_train('Father of the Bride Part II', "genre_plot")
nearest_neigh_model_both_train('Tom and Huck', "genre_plot")
nearest_neigh_model_both_train('Sudden Death', "genre_plot")
nearest_neigh_model_both_train('GoldenEye', "genre_plot")
nearest_neigh_model_both_train('The Shawshank Redemption', "genre_plot")
nearest_neigh_model_both_train('Toy Story', "genre_plot")

Jumanji
Accuracy at 10: 0.0
----------------
Grumpier Old Men
Accuracy at 10: 0.1
----------------
Waiting to Exhale
Accuracy at 10: 0.1
----------------
Father of the Bride Part II
Accuracy at 10: 0.0
----------------
Tom and Huck
Accuracy at 10: 0.0
----------------
Sudden Death
Accuracy at 10: 0.1
----------------
GoldenEye
Accuracy at 10: 0.0
----------------
The Shawshank Redemption
Accuracy at 10: 0.0
----------------
Toy Story
Accuracy at 10: 0.2
----------------


Unnamed: 0,movie_id,Title,Genres,API_Rec_Titles
82,9594,Double Impact,"[53, 28, 80, 18]",Toy Story 2
25,1813,The Devil's Advocate,"[18, 9648, 53, 27]",Toy Story 3
22,9552,The Exorcist,"[27, 53, 18]",A Bug's Life
5,9806,The Incredibles,"[28, 12, 16, 10751]",The Lion King
71,17654,District 9,[878],"Monsters, Inc."
33,301337,Downsizing,"[18, 878]",The Incredibles
47,177572,Big Hero 6,"[12, 10751, 16, 28, 35]",Up
88,658,Goldfinger,"[12, 28, 53]",Finding Nemo
86,19995,Avatar,"[28, 12, 14, 878]",Se7en
1,10193,Toy Story 3,"[16, 10751, 35]",WALL·E


In [36]:
# Printing overall precision at 10 for genre + plot for training 
print("Accuracy at 10 overall:" + " " + str((0.0 + 0.1 + 0.1 + 0.0 + 0.0 + 0.1 + 0.0 + 0.0 + 0.2)/10))

Accuracy at 10 overall: 0.05


# Using KD Tree

In [37]:
# Set up tfidf on the plot column 
X = metadata.genre_plot
X_tfidf = tfidf.fit_transform(X)

X_tfidf = X_tfidf.toarray()



In [38]:
# Testing KD tree results
from sklearn.neighbors import KDTree

rng = np.random.RandomState(0)
X = rng.random_sample((10, 3))  # 10 points in 3 dimensions


tree = KDTree(X_tfidf, leaf_size=500)              
dist, ind = tree.query(X_tfidf[3:4], k=10)                
print(ind)  # indices of 3 closest neighbors

print(dist)  # distances to 3 closest neighbors


[[  3 113 170 367 430 366  18 280 420 310]]
[[0.         1.28728917 1.31652148 1.33823883 1.35577564 1.35744327
  1.35761178 1.36032813 1.36315626 1.36390287]]


In [39]:
# Printing results of the test indices 
for i in ind:
    print(metadata.iloc[i].Title)

3                       The Lion King
113                             Bambi
170                           Aladdin
367                The Little Mermaid
430                    Shanghai Triad
366                             Mulan
18                  Good Will Hunting
280                             Balto
420    Ace Ventura: When Nature Calls
310                            Qwerty
Name: Title, dtype: object


In [40]:
# Define KD tree function to produce recommendation list and accuracy score
def tree_recc(title):
    #Get index of movie
    idx = indices[title]
    
    #Get target movie ID
    movie_ID = metadata.loc[metadata["Title"] == title]["id"].values[0]
    
    # Set up tfidf on the plot column 
    X = metadata.genre_plot
    X_tfidf = tfidf.fit_transform(X)
    
    X_tfidf = X_tfidf.toarray()
    
    
    # Set up KD tree 
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 3))  # 10 points in 3 dimensions

    # Create tree
    tree = KDTree(X_tfidf, leaf_size=500)              
    dist, ind = tree.query(X_tfidf[idx:idx+1], k=11) 
    print(title)
    
    # collect movie indices
    movie_indices = []
    movie_title = []
    for i in ind:
        for j in range(1,11):
            movie_indices.append(metadata.iloc[j].id)
            movie_title.append(metadata.iloc[j].Title)
            
    #Get API movie recommendation for target movie 
    movie_titles = api_rec(title)
    
    # Set up dataframe 
    recommendation_list = pd.DataFrame({"Title":movie_title, "API Title": movie_titles})
    
    # Calculating 
    movie_in_api = [i for i in list(recommendation_list["Title"]) if i in list(recommendation_list["API Title"])]
    accuracy_of_rec = len(movie_in_api)/10
    
    print("Accuracy Score " + str(accuracy_of_rec))
    print("---------------------")
    
    return(recommendation_list)


In [41]:
# First test on KD tree function
tree_recc("Toy Story")

Toy Story
Accuracy Score 0.9
---------------------


Unnamed: 0,Title,API Title
0,Toy Story 3,Toy Story 2
1,A Bug's Life,Toy Story 3
2,The Lion King,A Bug's Life
3,"Monsters, Inc.",The Lion King
4,The Incredibles,"Monsters, Inc."
5,Up,The Incredibles
6,Finding Nemo,Up
7,Se7en,Finding Nemo
8,WALL·E,Se7en
9,Jumanji: Welcome to the Jungle,WALL·E


In [42]:
# Getting accuracy on KD tree on genre + plot columns
tree_recc('Jumanji')
tree_recc('Grumpier Old Men')
tree_recc('Waiting to Exhale')
tree_recc('Father of the Bride Part II')
tree_recc('Tom and Huck')
tree_recc('Sudden Death')
tree_recc('GoldenEye')
tree_recc('The Shawshank Redemption')
tree_recc('Toy Story')

Jumanji
Accuracy Score 0.1
---------------------
Grumpier Old Men
Accuracy Score 0.0
---------------------
Waiting to Exhale
Accuracy Score 0.0
---------------------
Father of the Bride Part II
Accuracy Score 0.0
---------------------
Tom and Huck
Accuracy Score 0.0
---------------------
Sudden Death
Accuracy Score 0.0
---------------------
GoldenEye
Accuracy Score 0.0
---------------------
The Shawshank Redemption
Accuracy Score 0.0
---------------------
Toy Story
Accuracy Score 0.9
---------------------


Unnamed: 0,Title,API Title
0,Toy Story 3,Toy Story 2
1,A Bug's Life,Toy Story 3
2,The Lion King,A Bug's Life
3,"Monsters, Inc.",The Lion King
4,The Incredibles,"Monsters, Inc."
5,Up,The Incredibles
6,Finding Nemo,Up
7,Se7en,Finding Nemo
8,WALL·E,Se7en
9,Jumanji: Welcome to the Jungle,WALL·E


In [43]:
# Calculate KD tree overall accuracy 
print("KD tree Accuracy: " + str((0.1 + 0.0 + 0.0 + 0.0 + 0.0 + 0.0 + 0.0 + 0.0 + 0.9)/10))

KD tree Accuracy: 0.1


# Overall precision of method 

### Cosine Similarity 
##### Gerne 
Overall Accuracy at 10: 0.14
##### Plot
Overall Accuracy at 10: 0.17
##### Genre + Plot
Overall Accuracy at 10: 0.17

### Nearest Neighbors 
##### Genre
Overall Accuracy at 10: 0.12
##### Plot
Overall Accuracy at 10: 0.17
##### Genre  + Plot 
Overall Accuracy at 10: 0.17
##### Genre + Plot + train/test
Overall Accuracy at 10: 0.05

### KD TREE
##### Genre 
Overall Accuracy at 10: 0.1 

### Method of calculcation 
By comparing the list of recommendation to the what tmdb api recommends for the movie, we can calculate how precise our models is. The list of movies in the recommendation that are in the API recommendation list would give us our Accuracy. 

For example: Toy story recommendation for cosine similarity for plot has only 2 movies recommended that are in the API recommendation for toy story so it would have a score of 2/10 = 0.2

We do this for 10 different movies for each method and take the average of the scores to calculate the overall Accuracy for the method. 
