In [1]:
#Import libraries
import pandas as pd
import numpy as np
import re
import sys

# import ipywidgets as widgets
# from IPython.display import display

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#read the movie metadata file; its "movieId", "title" and "genres" columns are used below
movie_file = pd.read_csv("ml-25m/movies.csv")

#read ratings file needed for algorithm; its "userId", "movieId" and "rating" columns are used below
movie_rating = pd.read_csv("ml-25m/ratings.csv")

In [3]:
#Function to clean title data
def remove_year_from_title(title):
    """Return ``title`` with every bracketed four-digit year removed.

    Leading and trailing whitespace is trimmed from the result; whitespace
    inside the title is left untouched.
    """
    #a year looks like "(1995)": exactly four digits wrapped in round brackets
    year_in_brackets = re.compile(r'\(\d{4}\)')
    #drop every occurrence of the year marker
    title_without_year = year_in_brackets.sub('', title)
    #trim the spaces left behind at either end
    return title_without_year.strip()

In [4]:
#Store the year-free version of every title in a new "fix_title" column
movie_file["fix_title"] = movie_file["title"].map(remove_year_from_title)

In [5]:
#Vectorise the cleaned titles so they can be compared numerically.
#ngram_range=(1,2) makes the vectoriser use single words and pairs of adjacent words as features.
vectorise = TfidfVectorizer(ngram_range=(1,2))
#Fit the vectoriser on the "fix_title" column and turn the titles into a TF-IDF matrix.
#Both `vectorise` and `matrix` are reused by search_title.
matrix = vectorise.fit_transform(movie_file["fix_title"])

In [6]:
#Create a search function, for when entering title name
def search_title(title):
    """Return the rows of ``movie_file`` whose titles best match ``title``.

    The query is cleaned with ``remove_year_from_title``, vectorised with the
    module-level ``vectorise`` and compared against the module-level ``matrix``
    using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to search for.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows of ``movie_file``, ordered from most to least similar,
        so row 0 is the best match.
    """
    title = remove_year_from_title(title)
    vectored = vectorise.transform([title])
    similar = cosine_similarity(vectored, matrix).flatten()

    #never ask for more results than there are titles (argpartition would raise)
    top_n = min(10, similar.size)
    if top_n == 0:
        return movie_file.iloc[0:0]

    #argpartition only guarantees that the last top_n entries are the largest ones,
    #not that they are in order, so rank those candidates explicitly
    candidates = np.argpartition(similar, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similar[candidates])[::-1]]
    return movie_file.iloc[ranked]

In [7]:
#find users who liked the same movie as movie entered
    #similar_user = movie_rating[(movie_rating["movieId"] == movieId) & (movie_rating["rating"] >= 3.5)]["userId"].unique()
    #find more movies the user rated 3.5 or above
    #user_likes = movie_rating[(movie_rating["userId"].isin(similar_user)) & (movie_rating["rating"] >= 3.5)]["movieId"]
    #turn the amount of the movie counts into a percentage
    #user_likes = user_likes.value_counts() / len(similar_user)
    #since there are a lot of liked movies, I only kept the ones that more than 20% of similar users liked
    #user_likes = user_likes[user_likes > 0.2]
    #user_likes

In [8]:
#Make the final recommendation function
def recommendation_function(movieId, min_rating=3.5, min_share=0.2, top_n=10):
    """Recommend movies that fans of ``movieId`` like unusually often.

    Reads the module-level ``movie_rating`` (columns "userId", "movieId",
    "rating") and ``movie_file`` (columns "movieId", "title", "genres").

    Parameters
    ----------
    movieId : int
        Id of the movie to base the recommendations on.
    min_rating : float, optional
        A rating counts as a "like" when it is at least this value. The same
        threshold is used for similar users and for the baseline of all users,
        so the two percentages are comparable.
    min_share : float, optional
        Keep only movies liked by more than this fraction of similar users.
    top_n : int, optional
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns "title" and "genres" for up to ``top_n`` movies, ordered by
        how much more popular they are among similar users than among all
        users. Empty when nobody liked ``movieId``.
    """
    #filter the ratings once: every step below only looks at "liked" ratings
    liked = movie_rating[movie_rating["rating"] >= min_rating]

    #find users who liked the same movie as movie entered
    similar_user = liked.loc[liked["movieId"] == movieId, "userId"].unique()
    if len(similar_user) == 0:
        #nobody liked this movie, so there is nothing to base a recommendation on
        return movie_file.iloc[0:0][["title", "genres"]]

    #fraction of similar users who liked each movie
    user_likes = liked.loc[liked["userId"].isin(similar_user), "movieId"].value_counts() / len(similar_user)
    #since there are a lot of liked movies, keep only the ones liked by more than min_share of similar users
    user_likes = user_likes[user_likes > min_share]

    #all likes (from any user) of the candidate movies
    users = liked[liked["movieId"].isin(user_likes.index)]
    #fraction of those users who liked each candidate movie
    users = users["movieId"].value_counts() / users["userId"].nunique()

    #put both percentages side by side, one row per candidate movie
    new_table = pd.concat([user_likes, users], axis=1)
    new_table.columns = ["user_likes", "users"]
    #a high ratio means the movie is liked far more by similar users than by users in general
    new_table["ratio"] = new_table["user_likes"] / new_table["users"]
    new_table = new_table.sort_values("ratio", ascending=False)

    #look up the titles of the highest-ratio movies and return them
    return new_table.head(top_n).merge(movie_file, left_index=True, right_on="movieId")[["title", "genres"]]

In [11]:
#Add an easier way for the user to enter a title - requested in the user survey
# title_input = widgets.Text(
#     value="",
#     description="Enter Title:",
#     disabled=False
# )

# output_title = widgets.Output()

# def when_typing(text):
#     with output_title:
#         output_title.clear_output()
#         title = text["new"]
#         if len(title) >= 3:
#             output_movie = search_title(title)
#             movie_ID = output_movie.iloc[0]["movieId"]
#             display(recommendation_function(movie_ID))

# title_input.observe(when_typing, names="value")
# display(title_input, output_title)

In [12]:
#Command-line entry point: the first argument is the movie title to search for
if len(sys.argv) < 2:
    #fail with a readable message instead of a bare IndexError traceback
    sys.exit("Usage: pass a movie title as the first command-line argument")
output_movie = search_title(sys.argv[1])
#the first row of the search result is the movie the recommendations are based on
movie_ID = output_movie.iloc[0]["movieId"]
data = recommendation_function(movie_ID)
print(data)