<a href="https://colab.research.google.com/github/VanshikaChenna/18CSE301J-Projects/blob/main/Movie_reccomendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Reading movie data with Pandas

import pandas as pd

# Dataset link: https://files.grouplens.org/datasets/movielens/ml-25m.zip

# Load the movie metadata (movies.csv from the ml-25m archive linked above) into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
movies = pd.read_csv("movies.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Cleaning movie titles with Regex

import re

def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space.

    Used both when preparing the titles for indexing and when normalising a
    search query, so both sides are compared in the same form.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [5]:
# Add a cleaned copy of each title (see clean_title); this column is the text
# the TF-IDF vectorizer is fitted on further down.
movies["clean_title"] = movies["title"].apply(clean_title)

In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
# Creating a TF-IDF matrix

from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): build features from single words and from pairs of
# consecutive words, so multi-word titles can match as phrases.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and encode every title as one row
# of the TF-IDF matrix (row order follows the rows of `movies`).
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [8]:
# Computing similarities between the search term and the movie titles in the dataset using the cosine similarity function
# and creating a search function

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    notebook-level ``vectorizer`` and compared against every row of the
    notebook-level ``tfidf`` matrix using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text search term.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least
        similar, so ``results.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # Compare the search term with every movie title; one similarity per movie.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist (argpartition raises otherwise).
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the last top_n positions hold the
    # top_n largest similarities -- NOT that they are ordered among themselves.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    # Sort just those candidates so the best match really comes first.
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[ranked]

In [9]:

# pip install ipywidgets
# jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [10]:
# Building an interactive search box/widget with jupyter

import ipywidgets as widgets
from IPython.display import display

# Input widget: text box the user types a movie title into (pre-filled with an example query)
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

# Output widget: area the search results are rendered into
movie_list = widgets.Output()

# function executes whenever input given
def on_type(data):
    """Refresh the search results whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the text currently in the box. The output area is always
    cleared first; a search is only run once more than 5 characters have
    been typed.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))

# Call on_type every time the text box's `value` changes
movie_input.observe(on_type, names='value')


# Render the text box followed by the results area
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [11]:
# Reading in movie ratings data (ratings.csv, expected in the working directory)
# NOTE(review): the dtypes printed below list `timestamp` as float64 -- presumably
# the loaded file has a missing or partial value in that column; confirm the
# download/upload of ratings.csv is complete.
ratings = pd.read_csv("ratings.csv")

In [12]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

In [13]:
# Finding users who liked the same movie

def find_similar_movies(movie_id, rating_threshold=4, min_share=0.10, top_n=10,
                        ratings_df=None, movies_df=None):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``rating_threshold``. Users who liked ``movie_id`` are the "similar
    users". Each movie liked by more than ``min_share`` of them is scored as

        score = (share of similar users who liked it)
                / (share of all users liking any candidate movie who liked it)

    so movies that are specifically popular among similar users rank above
    movies that everybody likes.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    rating_threshold : float, default 4
        Ratings strictly above this value count as a like.
    min_share : float, default 0.10
        Minimum fraction (exclusive) of similar users that must have liked a
        movie for it to be considered.
    top_n : int, default 10
        Maximum number of recommendations returned.
    ratings_df, movies_df : pandas.DataFrame, optional
        Frames to use instead of the notebook-level ``ratings`` / ``movies``.
        ``ratings_df`` needs ``userId``, ``movieId``, ``rating``;
        ``movies_df`` needs ``movieId``, ``title``, ``genres``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title``, ``genres``, best score first. Empty when
        nobody liked ``movie_id``.
    """
    # Fall back to the notebook-level frames when none are passed in.
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movies

    # Build the "liked" subset once instead of re-evaluating the rating mask
    # over the whole ratings table for every step below.
    liked = ratings_df[ratings_df["rating"] > rating_threshold]

    # similar_users = users who liked the input movie, i.e. share our taste
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; avoid dividing by zero users.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Fraction of the similar users who liked each movie
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    # Keep only movies liked by more than min_share of the similar users
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Fraction of everyone who liked any candidate movie that liked each one
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high ratio means the movie is far more popular among similar users
    # than among users in general -- the higher the score, the better.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Attach title/genres by matching the movieId index against movies_df.
    top = rec_percentages.head(top_n)
    return top.merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [14]:
import ipywidgets as widgets
from IPython.display import display

# creating input widget: text box for the movie title to base recommendations on
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

# creating output widget: area the recommendation table is rendered into
recommendation_list = widgets.Output()

# function executes whenever input given
def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the text currently in the box. The output area is always
    cleared first. Once more than 5 characters have been typed, the first
    row returned by ``search`` is taken as the selected movie and its
    recommendations are displayed.

    Note: this reuses the name ``on_type`` from the earlier search-box cell.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))

# Call on_type every time the text box's `value` changes
movie_name_input.observe(on_type, names='value')

# Render the text box followed by the recommendations area
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()