# Movie Recommender System
## Author: Sharaj Jagadeesan
### MSc Data Analytics, University of Galway.

In [73]:
# Packages

import re
from pathlib import Path

import ipywidgets as widgets
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [74]:
# Loading the datasets

# Folder holding the ml-25m CSV files. Kept in one place so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_DIR = Path(r"D:\Data Analytics Materials\DataQuest\ml-25m\ml-25m")

movies = pd.read_csv(DATA_DIR / "movies.csv")
ratings = pd.read_csv(DATA_DIR / "ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [75]:
# Preprocessing step

def cleaning_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)
# Store a punctuation-free copy of every title; the text search runs on this column.
movies["clean_title"] = movies["title"].map(cleaning_title)
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [81]:
# Creating the TF-IDF matrix of cleaned titles and the title search function

# Fit TF-IDF on the cleaned titles using single words and two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])


# Title search: rank every movie by cosine similarity to the query text
def search_title(title, n_results=5):
    """Return the rows of ``movies`` whose cleaned titles best match ``title``.

    The query is cleaned with ``cleaning_title``, transformed by the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    n_results : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_results`` rows of ``movies``, best match first. No minimum
        similarity is enforced, so rows are returned even when nothing in the
        query matches any title.
    """
    title = cleaning_title(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()
    # argsort is ascending, so reverse it before taking the leading entries.
    indices = np.argsort(similarity)[::-1][:n_results]
    return movies.iloc[indices]

In [82]:
# Creating an interactive widget

# Text box for the query and an output area that holds the matching rows.
movie_input = widgets.Text(value="", description="Movie Title:", disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the search results whenever the text box value changes."""
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        # Skip very short queries.
        if len(typed) <= 3:
            return
        display(search_title(typed))


movie_input.observe(on_type, names="value")
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [83]:
# Creating a function to find similar movies

def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than ``min_rating``.
    Users who liked ``movie_id`` are the "similar" users. For every movie liked
    by more than ``min_share`` of them, the score is::

        share of similar users who liked it / share of all likers who liked it

    where "all likers" are the users who liked at least one candidate movie.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to start from.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 0.1
        Candidates must be liked by more than this fraction of similar users.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when no user liked ``movie_id``.
    """
    # Filter the liked ratings once instead of re-scanning `ratings` for every step.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to recommend, and avoids dividing by zero users.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_rec = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_rec = similar_user_rec.value_counts() / len(similar_users)
    similar_user_rec = similar_user_rec[similar_user_rec > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_rec.index)]
    all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_rec, all_users_rec], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top = rec_percentages.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [84]:
# Display the Recommendations

# Text box for the movie title and an output area for its recommendations.
movie_input_name = widgets.Text(value="", description="Movie Title: ", disabled=False)
recommendation_list = widgets.Output()


def on_type_rec(data):
    """Show recommendations for the best title match whenever the text box value changes."""
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        # Skip short queries.
        if len(typed) <= 5:
            return
        best_match = search_title(typed).iloc[0]
        display(find_similar_movies(best_match["movieId"]))


movie_input_name.observe(on_type_rec, names="value")
display(movie_input_name, recommendation_list)

Text(value='', description='Movie Title: ')

Output()