In [44]:
import pandas as pd 
import os 
import numpy as np
import requests

In [2]:
path1 = "../Resources/tmdb_movies_list"

metadata = pd.read_csv(path1)

In [3]:
metadata

Unnamed: 0,id,Title,Genre,Plot,Poster
0,863,Toy Story 2,"[16, 35, 10751]","Andy heads off to Cowboy Camp, leaving his toy...",/2MFIhZAW0CVlEQrFyqwa4U6zqJP.jpg
1,10193,Toy Story 3,"[16, 10751, 35]","Woody, Buzz, and the rest of Andy's toys haven...",/AbbXspMOwdvwWZgVN0nabZq03Ec.jpg
2,9487,A Bug's Life,"[12, 16, 35, 14, 10751]","On behalf of ""oppressed bugs everywhere,"" an i...",/Ah3J9OJVc2CNCuH2zMydXy9fmIC.jpg
3,8587,The Lion King,"[10751, 16, 18]",A young lion prince is cast out of his pride b...,/sKCr78MXSLixwmZ8DyJLrpMsd15.jpg
4,585,"Monsters, Inc.","[16, 35, 10751]",Lovable Sulley and his wisecracking sidekick M...,/wFSpyMsp7H0ttERbxY7Trlv8xry.jpg
...,...,...,...,...,...
414,31357,Waiting to Exhale,35,"Cheated on, mistreated and stepped on, the wom...",/qJU6rfil5xLVb5HpJsmmfeSK254.jpg
415,11862,Father of the Bride Part II,35,Just when George Banks has recovered from his ...,/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg
416,11860,Sabrina,35,An ugly duckling having undergone a remarkable...,/z1oNjotUI7D06J4LWQFQzdIuPnf.jpg
417,45325,Tom and Huck,10751,"A mischievous young boy, Tom Sawyer, witnesses...",/vIG8hWOa7DyLMRiurzKwVAnIYoU.jpg


In [4]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

def matrix_creation(element):
    
    #Replace NaN with an empty string
    metadata[element] = metadata[element].fillna('')

    #Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(metadata[element])
    
    return tfidf_matrix

In [8]:
tfidf_matrix = tfidf.fit_transform(metadata["Plot"])

In [9]:
matrix_creation("Plot")

<419x4840 sparse matrix of type '<class 'numpy.float64'>'
	with 9982 stored elements in Compressed Sparse Row format>

In [39]:
indices = pd.Series(metadata.index, index = metadata["Title"])

indices.head()

Title
Toy Story 2       0
Toy Story 3       1
A Bug's Life      2
The Lion King     3
Monsters, Inc.    4
dtype: int64

In [40]:
metadata.index


RangeIndex(start=0, stop=419, step=1)

In [11]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import ndcg_score

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [17]:
def get_recommendations(title, element, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    tfidf_matrix = matrix_creation(element)
    
    # Compute the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    test = pd.DataFrame({"Title":metadata['Title'].iloc[movie_indices], "sim_score": sim_scores, "Genres":metadata['Genre'].iloc[movie_indices], "Plot":metadata['Plot'].iloc[movie_indices]}).reset_index(drop=False)
    
    # Calculate relevance score 
    movie_title_genre = list(metadata.loc[metadata["Title"] == title]["Genre"])[0].split(",")
    movie_title_genre_list = [i.strip() for i in movie_title_genre]

    relevance_score = []
    
    # Loop through each row in the reccomendation and calculate relevance score 
    for i in range(0,len(test)):
        
        genre_list = [i.strip() for i in test["Genres"][i].split(",")]
#         print(genre_list)
        relevance_score.append(len([i.strip() for i in genre_list if i.strip() in movie_title_genre_list]))
    
    # Add relevance score into dataframe
    test["Relevance Score"] = relevance_score
    
    
    # Calculate nDCG scores 
    ideal_score = test["Relevance Score"].sort_values(ascending=False)
    
    
    ideal_score = np.asarray([ideal_score])
    relevance_score = np.asarray([relevance_score])
    
    test_ndcg = ndcg_score(ideal_score, relevance_score)
    ideal_ndcg = ndcg_score(ideal_score, ideal_score)
    
    score_ndcg = test_ndcg/ideal_ndcg
    print("Relevance nDCG:" + str(test_ndcg))
    print("Ideal nDCG:" + str(ideal_ndcg))
    print("nDCG score:" + str(score_ndcg))
    
    
#     print(relevance_list)
    # Return the top 10 most similar movies
    return test

In [18]:
get_recommendations('Toy Story', "Plot")


Relevance nDCG:0.9275696372679155
Ideal nDCG:1.0000000000000002
nDCG score:0.9275696372679153


Unnamed: 0,index,Title,sim_score,Genres,Plot,Relevance Score
0,1,Toy Story 3,"(1, 0.44136894356791234)","[16, 10751, 35]","Woody, Buzz, and the rest of Andy's toys haven...",3
1,0,Toy Story 2,"(0, 0.4049024543279657)","[16, 35, 10751]","Andy heads off to Cowboy Camp, leaving his toy...",1
2,188,The Devil Wears Prada,"(188, 0.08741182838451465)","[18, 35]",Andy moves to New York to work in the fashion ...,1
3,172,Maid in Manhattan,"(172, 0.07338358283374645)","[35, 18, 10749]",Marisa Ventura is a struggling single mom who ...,0
4,262,The Shawshank Redemption,"(262, 0.061613994347496814)","[18, 80]",Framed in the 1940s for the double murder of h...,0
5,399,My Life as a Zucchini,"(399, 0.04485325977075304)","[16, 35, 18, 10751, 10749]","After his mother’s death, Zucchini is befriend...",2
6,238,Northanger Abbey,"(238, 0.03820426942855762)","[10749, 18, 10770]",A young woman's penchant for sensational Gothi...,0
7,165,Dumb and Dumber,"(165, 0.037460900833817756)",[35],Lloyd and Harry are two men whose stupidity is...,0
8,285,101 Dalmatians,"(285, 0.036326806170881284)","[10751, 35]","An evil, high-fashion designer plots to steal ...",1
9,205,The Bucket List,"(205, 0.03589906924319538)","[18, 35]",Corporate billionaire Edward Cole and working ...,1


In [45]:
#Set up url 

movie_tmdbid = 862
url = "https://api.themoviedb.org/3/movie/" + str(movie_tmdbid) + "/recommendations?language=en-US&page=1"

#Set up headers -- To be moved into .env for security purposes 
headers = {
"accept": "application/json",
"Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIzZTdjNTZkZTFkNjQxZWIyOGVhODRiNWRkODgzOTUxMCIsInN1YiI6IjY1Y2IyNzRmOGMzMTU5MDE3YzM5MGFlNSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.i4FruSg4IDDl--CWmoNT3SUKtiqCgZm4O54CFZS2uzs"
}

response = requests.get(url, headers=headers).json()

In [46]:
movie_titles = []
for j in range(0,10):
        movie_titles.append(response['results'][j]['title'])

In [47]:
pd.DataFrame({"Title":movie_titles})

Unnamed: 0,Title
0,Toy Story 2
1,Toy Story 3
2,A Bug's Life
3,The Lion King
4,"Monsters, Inc."
5,The Incredibles
6,Up
7,Finding Nemo
8,Se7en
9,WALL·E


In the case of Toy Story, 

We only have 2 movies in the model recommendation list that are in the api recommendation list 

Therefore it would be a precision score of 2/10 = 0.2 