In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
path_data = "https://github.com/MatheusNakai/Datasets/raw/main/"

In [3]:
movies = pd.read_csv(path_data + 'Datasets/movies.csv')
movieDB = pd.read_csv(path_data + 'Datasets/movies_bin.csv')

In [4]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
# create a function to create a table showing the numbers of missing values for each feature
def create_missing_df(dataframe):

  missing_index = dataframe.columns.tolist()
  missing = dataframe.isnull().sum().tolist()
  missing_df = pd.DataFrame({'Missing':missing}, index=missing_index)

  return missing_df

In [6]:
create_missing_df(movies)

Unnamed: 0,Missing
movieId,0
title,0
genres,0


In [7]:
# the function to extract titles
def extract_title(title):

  year = title[len(title)-5:len(title)-1]

  # some movies do not have the info about year in the column title. So, we should take care of the case as well.
  if year.isnumeric():
    title_no_year = title[:len(title)-7]
    return title_no_year

  else:
    return title

In [8]:
# the function to extract years
def extract_year(title):

  year = title[len(title)-5:len(title)-1]

  # some movies do not have the info about year in the column title. So, we should take care of the case as well.
  if year.isnumeric():
    return int(year)

  else:
    return np.nan

In [9]:
movies.rename(columns={'title':'title_year'}, inplace=True) # change the column name from title to title_year
movies['title_year'] = movies['title_year'].apply(lambda x: x.strip()) # remove leading and ending whitespaces in title_year
movies['title'] = movies['title_year'].apply(extract_title) # create the column for title
movies['year'] = movies['title_year'].apply(extract_year) # create the column for year

In [10]:
movies.head()

Unnamed: 0,movieId,title_year,genres,title,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995.0
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995.0


In [11]:
create_missing_df(movies)

Unnamed: 0,Missing
movieId,0
title_year,0
genres,0
title,0
year,12


In [12]:
r,c = movies[movies['genres']=='(no genres listed)'].shape
print('The number of movies which do not have info about genres:',r)

The number of movies which do not have info about genres: 34


In [13]:
movies = movies[~(movies['genres']=='(no genres listed)')].reset_index(drop=True)

In [14]:
movies[['title','genres']].head(5)

Unnamed: 0,title,genres
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,Adventure|Children|Fantasy
2,Grumpier Old Men,Comedy|Romance
3,Waiting to Exhale,Comedy|Drama|Romance
4,Father of the Bride Part II,Comedy


In [15]:
# remove '|' in the genres column
movies['genres'] = movies['genres'].str.replace('|',' ')

  movies['genres'] = movies['genres'].str.replace('|',' ')


In [16]:
# count the number of occurences for each genre in the data set
counts = dict()

for i in movies.index:
  for g in movies.loc[i,'genres'].split(' '):
    if g not in counts:
      counts[g] = 1
    else:
      counts[g] = counts[g] + 1

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
movies['genres'][0]

'Adventure Animation Children Comedy Fantasy'

In [19]:
movies['genres'] = movies['genres'].str.replace('Sci-Fi','SciFi')
movies['genres'] = movies['genres'].str.replace('Film-Noir','Noir')

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english') # create an object for TfidfVectorizer
tfidf_matrix = tfidf_vector.fit_transform(movies['genres']) # apply the object to the genres column

In [21]:
movies.head()

Unnamed: 0,movieId,title_year,genres,title,year
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Toy Story,1995.0
1,2,Jumanji (1995),Adventure Children Fantasy,Jumanji,1995.0
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men,1995.0
3,4,Waiting to Exhale (1995),Comedy Drama Romance,Waiting to Exhale,1995.0
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995.0


In [22]:
print(list(enumerate(tfidf_vector.get_feature_names_out())))

[(0, 'action'), (1, 'adventure'), (2, 'animation'), (3, 'children'), (4, 'comedy'), (5, 'crime'), (6, 'documentary'), (7, 'drama'), (8, 'fantasy'), (9, 'horror'), (10, 'imax'), (11, 'musical'), (12, 'mystery'), (13, 'noir'), (14, 'romance'), (15, 'scifi'), (16, 'thriller'), (17, 'war'), (18, 'western')]


In [23]:
print(tfidf_matrix[:5])

  (0, 8)	0.48301747178653426
  (0, 4)	0.26738777563975086
  (0, 3)	0.5048957307474672
  (0, 2)	0.5162876752057701
  (0, 1)	0.41680359510322523
  (1, 8)	0.5936766667075072
  (1, 3)	0.6205672299106341
  (1, 1)	0.5122932056626417
  (2, 14)	0.8211549883098352
  (2, 4)	0.5707052524498741
  (3, 7)	0.46621627821471856
  (3, 14)	0.7264518207332316
  (3, 4)	0.5048862585582906
  (4, 4)	1.0


In [24]:
tfidf_matrix.shape

(9708, 19)

In [25]:
# the first row vector of tfidf_matrix (Toy Story)
tfidf_matrix.todense()[0]

matrix([[0.        , 0.4168036 , 0.51628768, 0.50489573, 0.26738778,
         0.        , 0.        , 0.        , 0.48301747, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ]])

In [26]:
from sklearn.metrics.pairwise import linear_kernel
sim_matrix = linear_kernel(tfidf_matrix,tfidf_matrix) # create the cosine similarity matrix
print(sim_matrix)

[[1.         0.8136036  0.15259961 ... 0.         0.42114166 0.26738778]
 [0.8136036  1.         0.         ... 0.         0.         0.        ]
 [0.15259961 0.         1.         ... 0.         0.         0.57070525]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.42114166 0.         0.         ... 0.         1.         0.        ]
 [0.26738778 0.         0.57070525 ... 0.         0.         1.        ]]


In [27]:
# the function to convert from index to title_year
def get_title_year_from_index(index):

  return movies[movies.index == index]['title_year'].values[0]



In [28]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [29]:
from fuzzywuzzy import fuzz
# create a function to find the closest title
def matching_score(a,b):

  return fuzz.ratio(a,b)



In [30]:
# the function to convert from title to index
def get_index_from_title(title):

  return movies[movies.title == title].index.values[0]

In [31]:
# a function to convert index to title
def get_title_from_index(index):

  return movies[movies.index == index]['title'].values[0]

In [32]:
# the function to return the most similar title to the words a user types
def find_closest_title(title):

  leven_scores = list(enumerate(movies['title_year'].apply(matching_score, b=title)))
  sorted_leven_scores = sorted(leven_scores, key=lambda x: x[1], reverse=True)
  closest_title = get_title_from_index(sorted_leven_scores[0][0])
  distance_score = sorted_leven_scores[0][1]

  return closest_title, distance_score

In [33]:
def contents_based_recommender(movie_user_likes, how_many):

  closest_title, distance_score = find_closest_title(movie_user_likes)
  rec_movie = []
  if distance_score == 100:

    movie_index = get_index_from_title(closest_title)
    movie_list = list(enumerate(sim_matrix[int(movie_index)]))
    similar_movies = list(filter(lambda x:x[0] != int(movie_index), sorted(movie_list,key=lambda x:x[1], reverse=True))) # remove the typed movie itself

    for i,s in similar_movies[:how_many]:
      rec_movie.append(movies.iloc[i]['title'])

    return rec_movie

  else:
    # print('Did you mean '+'\033[1m'+str(closest_title)+'\033[0m'+'?','\n')

    movie_index = get_index_from_title(closest_title)
    movie_list = list(enumerate(sim_matrix[int(movie_index)]))
    similar_movies = list(filter(lambda x:x[0] != int(movie_index), sorted(movie_list,key=lambda x:x[1], reverse=True)))


  for i,s in similar_movies[:how_many]:
    rec_movie.append(movies.iloc[i]['title'])

  return rec_movie

In [35]:
def multiple_movies_CB(list_of_movies):
  temp_reccomendation = []
  for movie in list_of_movies:
    temp_reccomendation.append(contents_based_recommender(movie, 20))
  flat_list = [item for sublist in temp_reccomendation for item in sublist]
  list_of_recommendation = []
  repeated_movies = []
  for i in flat_list:
    if i not in list_of_recommendation:
      list_of_recommendation.append(i)
  return list_of_recommendation

In [36]:
def find_title_db(list_of_names):
  ret = []
  for movie in a:
    if ',' in movie:
      movie = movie.split(',')
      ret.append(movieDB.loc[movieDB['original_title'].str.contains(movie[0])])
    else:
      ret.append(movieDB.loc[movieDB['original_title'].str.contains(movie)])

  list_of_row = [ele for ele in ret if len(ele['original_title']) != 0]
  return list_of_row

In [37]:
api_key = '84bd3118796019969d2fee13a58bcf90'
reading_token = 'eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI4NGJkMzExODc5NjAxOTk2OWQyZmVlMTNhNThiY2Y5MCIsInN1YiI6IjY0ZjA4MWRjY2FhNTA4MDBhYjcxZDM0YSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.wf4s0kVeBajUQe3tOUoS7Wkiemt47UNlmzYGphWY0vY'

In [38]:
import requests
import json
def get_movie_info(movieId):
  the_key = '84bd3118796019969d2fee13a58bcf90'
  url = f"https://api.themoviedb.org/3/movie/{movieId}?api_key={api_key}&language=pt-BR"
  image_url=f"https://image.tmdb.org/t/p/original"

  headers = {
      "accept": "application/json",
      "Authorization": f"Bearer {reading_token}"
  }
  info = {}
  response = requests.get(url)
  if response.status_code ==200:
    response = json.loads(response.text)
    info['poster_path'] = image_url+response['poster_path']
    info['overview'] = response['overview']

    return info
  if response.status_code==404:
    return "movie not found"

In [39]:
def format_json(list_of_movies):
  response = []
  for movie in list_of_movies:
    id = int(movie['id'].values[0])
    info = get_movie_info(id)
    if info !='movie not found':
      dic = {'id': id,
              'original_title': movie['original_title'].values[0],
              'overview':info['overview'],
              'genres': movie['genres'].values[0],
              'poster_path': info['poster_path']}
      response.append(dic)
  return response

In [40]:
a = contents_based_recommender('Monsters, Inc.', 20)

In [41]:
a

['Toy Story',
 'Antz',
 'Toy Story 2',
 'Adventures of Rocky and Bullwinkle, The',
 "Emperor's New Groove, The",
 'Wild, The',
 'Shrek the Third',
 'Tale of Despereaux, The',
 'Asterix and the Vikings (Astérix et les Vikings)',
 'Turbo',
 'The Good Dinosaur',
 'Moana',
 'Inside Out',
 'Black Cauldron, The',
 'Lord of the Rings, The',
 "We're Back! A Dinosaur's Story",
 'Atlantis: The Lost Empire',
 'Land Before Time, The',
 'Pokemon 4 Ever (a.k.a. Pokémon 4: The Movie)',
 'Sinbad: Legend of the Seven Seas']

In [42]:
recommendation = multiple_movies_CB(a)

In [43]:
recommendation = find_title_db(recommendation)

  ret.append(movieDB.loc[movieDB['original_title'].str.contains(movie)])


In [44]:
response= format_json(recommendation)

In [45]:
(response)

[{'id': 10193,
  'original_title': 'Toy Story 3',
  'overview': 'Quando Andy se prepara para ir para a faculdade, Woody, Buzz, Jessie e o restante dos leais brinquedos de Andy pensam o que irá acontecer com eles. Mas quando uma confusão faz com que eles sejam levados à creche Sunnyside, eles conhecem um anfitrião de novos brinquedos e logo descobrem que uma nova aventura selvagem está apenas começando!',
  'genres': "['Animation', 'Comedy', 'Family']",
  'poster_path': 'https://image.tmdb.org/t/p/original/McHK4kVvWfJ7jPiC4tAO9XAYQm.jpg'},
 {'id': 8916,
  'original_title': 'Antz',
  'overview': 'A formiguinha Z apenas um operário, que sonha roubar o coração da princesa Bala. Para isso, convence seu amigo soldado a trocar de lugar com ele, o que faz com que tenha que enfrentar o impiedoso General Mandíbula, que planeja uma grande ofensiva contra o formigueiro.',
  'genres': "['Adventure', 'Animation', 'Comedy', 'Family']",
  'poster_path': 'https://image.tmdb.org/t/p/original/plM5vyLRiiV