In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Imports used throughout the notebook; each version is printed for reproducibility.
import pandas as pd
print('Pandas version: ', pd.__version__)

import numpy as np
print('NumPy version: ', np.__version__)

import matplotlib
print('Matplotlib version: ', matplotlib.__version__)

from matplotlib import pyplot as plt

import sklearn
print('Scikit-Learn version: ', sklearn.__version__)

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.cluster import KMeans


import pickle
print('Pickle version: ', pickle.format_version)

import sys
# Slicing sys.version[0:5] cut two-digit minor versions short (e.g. "3.10.");
# the first whitespace-separated token is the complete version number.
print('Sys version: ', sys.version.split()[0])

from sklearn.neighbors import NearestNeighbors

import random

Pandas version:  1.5.3
NumPy version:  1.23.5
Matplotlib version:  3.7.1
Scikit-Learn version:  1.2.2
Pickle version:  4.0
Sys version:  3.10.


In [None]:
# Folder on the mounted Drive from which ratings.csv and movies.csv are loaded.
base_path = "/content/drive/MyDrive/TCC/Datasets/"

In [None]:
# ratings2: every rating joined with its movie title (input to the user/movie pivot table).
# movies2: the full movies table, including genres (input to the content-based model).
ratings = pd.read_csv(f'{base_path}ratings.csv', usecols=['userId', 'movieId', 'rating'])
movies = pd.read_csv(f'{base_path}movies.csv', usecols=['movieId', 'title'])
ratings2 = ratings.merge(movies, how='inner', on='movieId')
movies2 = pd.read_csv(f'{base_path}movies.csv')

In [None]:
# Preview the merged ratings/titles table.
ratings2.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [None]:
# Movie-by-user rating matrix; missing ratings are filled with 0.
df = (
    ratings2
    .pivot_table(index='title', columns='userId', values='rating')
    .fillna(0)
)
# Independent copy of the matrix; df itself is kept as the observed ratings.
df1 = df.copy()

In [None]:
# Preview the movie-by-user matrix copy.
df1.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def create_missing_df(dataframe):
  """Count the null values in each column of `dataframe`.

  Returns a DataFrame indexed by the column names of `dataframe`, with a
  single column 'Missing' holding the number of nulls in that column.
  """
  null_counts = dataframe.isnull().sum()
  return pd.DataFrame(
      {'Missing': null_counts.tolist()},
      index=dataframe.columns.tolist(),
  )

In [None]:
# Null counts per column of the raw movies table.
create_missing_df(movies2)

Unnamed: 0,Missing
movieId,0
title,0
genres,0


In [None]:
# the function to extract titles
def extract_title(title):
  """Return `title` with its trailing parenthesised year removed.

  A title is treated as carrying a year only when it ends with ')' and the
  four characters before that ')' are decimal digits, e.g. "Toy Story (1995)".
  In that case everything from the last '(' onwards is dropped, together with
  the whitespace before it. Any other title is returned unchanged.

  The explicit checks matter: slicing alone would misread short or
  digit-only strings (e.g. "12345") as having a year and truncate them.
  """
  year_text = title[-5:-1]
  paren_start = title.rfind('(')

  ends_with_year = (
      title.endswith(')')
      and len(year_text) == 4
      and year_text.isdecimal()
      and paren_start != -1
  )
  if ends_with_year:
    # Cut at the opening parenthesis so the whole "(…YYYY)" suffix goes away.
    return title[:paren_start].rstrip()

  # some movies do not have the year in the title column; leave those as they are.
  return title

In [None]:
# the function to extract years
def extract_year(title):
  """Return the release year at the end of `title` as an int, or NaN.

  A year is recognised only when `title` ends with ')' and the four
  characters before that ')' are decimal digits, e.g. "Toy Story (1995)".
  `isdecimal()` is used rather than `isnumeric()` because the latter accepts
  characters (such as superscript digits) that `int()` cannot parse.
  """
  year_text = title[-5:-1]

  if title.endswith(')') and len(year_text) == 4 and year_text.isdecimal():
    return int(year_text)

  # some movies do not have the year in the title column; report those as missing.
  return np.nan

In [None]:
# Keep the raw title string under 'title_year', then derive the 'title' and 'year' columns from it.
movies2 = movies2.rename(columns={'title': 'title_year'})
movies2['title_year'] = movies2['title_year'].map(lambda raw: raw.strip())  # trim surrounding whitespace
movies2['title'] = movies2['title_year'].map(extract_title)
movies2['year'] = movies2['title_year'].map(extract_year)

In [None]:
# Null counts again, now including the derived title and year columns.
create_missing_df(movies2)

Unnamed: 0,Missing
movieId,0
title_year,0
genres,0
title,0
year,12


In [None]:
# Count the rows whose genres field is the '(no genres listed)' placeholder.
no_genre_rows = movies2.loc[movies2['genres'] == '(no genres listed)']
r, c = no_genre_rows.shape
print('The number of movies which do not have info about genres:', r)

The number of movies which do not have info about genres: 34


In [None]:
# Drop the placeholder-genre rows and renumber the index 0..n-1.
has_genres = movies2['genres'] != '(no genres listed)'
movies2 = movies2[has_genres].reset_index(drop=True)

In [None]:
# Preview the title and genres columns.
movies2[['title','genres']].head(5)

Unnamed: 0,title,genres
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,Adventure|Children|Fantasy
2,Grumpier Old Men,Comedy|Romance
3,Waiting to Exhale,Comedy|Drama|Romance
4,Father of the Bride Part II,Comedy


In [None]:
# remove '|' in the genres column
# '|' is a regex metacharacter, so request a literal replacement explicitly instead of
# relying on the default value of `regex` (which raises a FutureWarning here).
movies2['genres'] = movies2['genres'].str.replace('|', ' ', regex=False)

  movies2['genres'] = movies2['genres'].str.replace('|',' ')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Rewrite the hyphenated genre names as single words
# (presumably so the text vectorizer keeps each genre as one token - TODO confirm).
movies2['genres'] = (
    movies2['genres']
    .str.replace('Sci-Fi', 'SciFi')
    .str.replace('Film-Noir', 'Noir')
)

In [None]:
# Fit a TF-IDF vectorizer on the genres text and transform it; one matrix row per row of movies2.
genre_corpus = movies2['genres']
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(genre_corpus)

In [None]:
# the first row vector of tfidf_matrix (Toy Story); slice the row first, then densify only that row
tfidf_matrix[0].todense()

matrix([[0.        , 0.4168036 , 0.51628768, 0.50489573, 0.26738778,
         0.        , 0.        , 0.        , 0.48301747, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ]])

In [None]:
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Pairwise dot products between all rows of tfidf_matrix
# (with a single argument, linear_kernel compares the matrix against itself).
sim_matrix = linear_kernel(tfidf_matrix)
print(sim_matrix)

[[1.         0.8136036  0.15259961 ... 0.         0.42114166 0.26738778]
 [0.8136036  1.         0.         ... 0.         0.         0.        ]
 [0.15259961 0.         1.         ... 0.         0.         0.57070525]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.42114166 0.         0.         ... 0.         1.         0.        ]
 [0.26738778 0.         0.57070525 ... 0.         0.         1.        ]]


In [None]:
# the function to convert from index to title_year
def get_title_year_from_index(index):
  """Return the 'title_year' value of the movies2 row whose index label equals `index`."""
  row_mask = movies2.index == index
  return movies2.loc[row_mask, 'title_year'].values[0]

# the function to convert from title to index
def get_index_from_title(title):
  """Return the index label of the first movies2 row whose 'title' equals `title`."""
  matching_rows = movies2.loc[movies2['title'] == title]
  return matching_rows.index.values[0]

In [None]:
from fuzzywuzzy import fuzz



In [None]:
def matching_score(a,b):
  """Return fuzzywuzzy's `fuzz.ratio` similarity score for the strings `a` and `b`."""
  score = fuzz.ratio(a, b)
  return score

In [None]:
# a function to convert index to title
def get_title_from_index(index):
  """Return the 'title' value of the movies2 row whose index label equals `index`."""
  row_mask = movies2.index == index
  return movies2.loc[row_mask, 'title'].values[0]

In [None]:
# the function to return the most similar title to the words a user types
def find_closest_title(title):
  """Find the movies2 title that best matches the text a user typed.

  Scores every entry of movies2['title'] against `title` with `matching_score`
  and returns `(closest_title, score)` for the highest score; on ties the
  earliest row wins.
  """
  scores = movies2['title'].apply(matching_score, b=title)
  # max() keeps the first of several equal maxima, i.e. the earliest row.
  best_position, best_score = max(enumerate(scores), key=lambda pair: pair[1])

  return get_title_from_index(best_position), best_score

In [None]:
def contents_based_recommender(movie_user_likes, how_many):
  """Recommend movies whose genres are most similar to a given movie.

  `movie_user_likes` is matched to the closest title in movies2 (an exact
  match is not required). The other movies are ranked by their sim_matrix
  score against that movie, and the 'title_year' strings of the top
  `how_many` are returned as a list.
  """
  closest_title, _ = find_closest_title(movie_user_likes)
  seed_index = int(get_index_from_title(closest_title))

  # Rank all movies by similarity to the seed; the stable sort keeps row order among ties.
  ranked = sorted(enumerate(sim_matrix[seed_index]), key=lambda pair: pair[1], reverse=True)
  # remove the typed movie itself
  candidates = [idx for idx, _ in ranked if idx != seed_index]

  return [get_title_year_from_index(idx) for idx in candidates[:how_many]]


In [None]:
def recommend_movies_cf(user, num_recommended_movies):
  """Rank the movies `user` has not rated by the value stored for them in df1.

  A movie counts as unrated when its entry for `user` in df is 0. Returns a
  list of `(title, value)` tuples sorted by value, highest first.

  `num_recommended_movies` is accepted but not used: the complete ranked
  list is returned.
  """
  user_column = df1.columns.get_loc(user)
  stored_values = df1.iloc[:, user_column]
  is_unrated = (df[user] == 0).values

  candidates = [
      (title, stored_values.iloc[position])
      for position, title in enumerate(df.index)
      if is_unrated[position]
  ]

  return sorted(candidates, key=lambda pair: pair[1], reverse=True)


In [None]:
def movie_recommender_cf(user, num_neighbors, num_recommendation):
  """Item-based KNN rating prediction for one user.

  For every movie the user has not rated (entry 0 in df), the movie's nearest
  neighbours under cosine distance are looked up, and the prediction is the
  similarity-weighted average of the user's ratings on the neighbours they
  did rate (0 when there are none). Predictions are written into the global
  df1, and the result of `recommend_movies_cf(user, num_recommendation)` is
  returned.
  """
  knn = NearestNeighbors(metric='cosine', algorithm='brute')
  knn.fit(df.values)
  distances, indices = knn.kneighbors(df.values, n_neighbors=num_neighbors)

  user_index = df.columns.tolist().index(user)
  # df is never written to below, so the user's observed ratings can be read once.
  user_ratings = df.iloc[:, user_index].values

  for m in range(len(df.index)):
    if user_ratings[m] != 0:
      continue  # already rated by the user; nothing to predict

    neighbour_ids = indices[m].tolist()
    neighbour_dists = distances[m].tolist()

    if m in neighbour_ids:
      # The movie is usually its own nearest neighbour; drop it.
      self_position = neighbour_ids.index(m)
      del neighbour_ids[self_position]
      del neighbour_dists[self_position]
    else:
      # Otherwise keep the closest num_neighbors-1 so the neighbourhood size matches.
      neighbour_ids = neighbour_ids[:num_neighbors-1]
      neighbour_dists = neighbour_dists[:num_neighbors-1]

    # Accumulate over the neighbours the user actually rated.
    weighted_sum = 0
    rated_similarities = []
    for neighbour, distance in zip(neighbour_ids, neighbour_dists):
      similarity = 1-distance
      rating = user_ratings[neighbour]
      if rating == 0:
        continue
      rated_similarities.append(similarity)
      weighted_sum = weighted_sum + similarity*rating

    similarity_total = sum(rated_similarities)
    if len(rated_similarities) > 0 and similarity_total > 0:
      predicted_r = weighted_sum/similarity_total
    else:
      predicted_r = 0

    df1.iloc[m,user_index] = predicted_r
  return recommend_movies_cf(user,num_recommendation)


In [None]:
# Ranked (title, predicted rating) list for user 1, using 30 neighbours.
b=(movie_recommender_cf(1, 30, 10))

In [None]:
# First entry of the ranked list: a (title, predicted rating) tuple.
b[0]

('3 Ninjas: High Noon On Mega Mountain (1998)', 5.000000000000001)

In [None]:
# Title part of the first entry.
b[0][0]

'3 Ninjas: High Noon On Mega Mountain (1998)'

In [None]:
# Collect titles from the front of b while the predicted rating is at least 5.0.
teste = []
for title, predicted_rating in b:
  if not predicted_rating >= 5.0:  # threshold value
    break
  teste.append(title)

In [None]:
# Column 1 of df (user 1's ratings, indexed by title), sorted highest first.
a=df[1].sort_values(ascending=False)

In [None]:
# Titles in `a` with a rating of at least 5.0.
# `a` is indexed by title, so the old `a[i]` with an integer only worked through pandas'
# deprecated positional fallback; a boolean mask selects the same rows without it
# (and, `a` being sorted in descending order, in the same order as the early-break loop).
test = a[a >= 5.0].index.tolist()

In [None]:
# One list of 10 content-based recommendations per title in `test`.
user_rec = [contents_based_recommender(liked_title, 10) for liked_title in test]

In [None]:
# Flatten the per-title recommendation lists into one list (duplicates are kept).
flat_list = [item for sublist in user_rec for item in sublist]

In [None]:
# Number of entries in the flattened content-based list.
len(flat_list)

124

In [None]:
# Number of titles collected from the collaborative-filtering ranking.
len(teste)

250

In [None]:
# Split flat_list into first occurrences (uniqueList) and titles that repeat (duplicateList),
# both in order of first appearance. Sets do the membership checks; lists keep the order.
uniqueList = []
duplicateList = []
seen_once = set()
seen_again = set()

for title in flat_list:
    if title not in seen_once:
        seen_once.add(title)
        uniqueList.append(title)
    elif title not in seen_again:
        seen_again.add(title)
        duplicateList.append(title)

print(duplicateList)

['Tom and Huck (1995)', 'Father of the Bride Part II (1995)', 'Braveheart (1995)', "Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996)", 'Richard III (1995)', 'GoldenEye (1995)', 'Winnie the Pooh and the Blustery Day (1968)', 'Quest, The (1996)', 'Crossing Guard, The (1995)', 'Waterworld (1995)', 'Mortal Kombat (1995)', 'Nightmare Before Christmas, The (1993)', 'Sense and Sensibility (1995)', 'Casino (1995)', 'Underground (1995)', 'Client, The (1994)', 'Lawnmower Man 2: Beyond Cyberspace (1996)', 'Kicking and Screaming (1995)']


In [None]:
# Titles present in both the collaborative-filtering list and the content-based list.
print(set(teste).intersection(flat_list))

{'Around the World in 80 Days (1956)', 'Richard III (1995)', 'Balto (1995)', 'Doctor Dolittle (1967)', 'Insomnia (2002)'}


In [None]:
def hybrid_mode(user, treshold_value):
  """Return the titles recommended to `user` by both recommenders.

  Collaborative side: titles from `movie_recommender_cf(user, 10, 10)` whose
  predicted rating is at least `treshold_value` (the ranking is scanned from
  the top and stops at the first value below the threshold).
  Content side: for every movie `user` rated 5.0 or higher in df, the 10
  titles returned by `contents_based_recommender`.

  Returns the set of titles present on both sides.
  """
  user_cf = movie_recommender_cf(user, 10, 10)
  rec_cf = []
  for title, predicted_rating in user_cf:
    if predicted_rating >= treshold_value:  # threshold value
      rec_cf.append(title)
    else:
      break

  # Use this user's own ratings (the earlier version read the notebook-level
  # Series `a`, which holds a different, fixed user's ratings).
  user_reviews = df[user].sort_values(ascending=False)
  user_max = user_reviews[user_reviews >= 5.0].index.tolist()

  # Flatten the recommendations gathered here (the earlier version flattened the
  # notebook-level `user_rec` instead of the list built in this function).
  rec_cb = []
  for liked_title in user_max:
    rec_cb.extend(contents_based_recommender(liked_title, 10))

  rec_movies = set(rec_cf).intersection(rec_cb)

  return rec_movies

In [None]:
# Hybrid recommendations for user 12 with a predicted-rating threshold of 5.
hybrid_mode(user=12,treshold_value=5)

set()