# MOVIE RECOMMENDER SYSTEM

#Context


Over the past two decades, there has been a monumental shift in how people access and consume video content. With the universal access to broadband internet, numerous platforms like YouTube, Netflix, and HBO Go emerged and steadily grew to prominence.
Although not a household name in itself, OTT is the exact technology that made the streaming revolution possible.
OTT stands for “Over The Top” which refers to any video streaming service delivering content to the users over the internet, however, there are subscription charges associated with the usage of such platforms such as PrimeVideo, Netflix, HotStart, Zee5, SonyLiv, etc.
Based on this the recommender systems are designed to help the user better

#Data Description

The data consists of 105339 ratings applied over 10329 movies. The average rating and minimum and maximum rating are 0.5 and 5 respectively. There are 668 users who have given their ratings for 149532 movies.

# Importing Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
ratings = pd.read_csv('/content/ratings.csv',nrows = 10000)
movies = pd.read_csv('/content/movies.csv')

In [None]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     10000 non-null  int64  
 1   movieId    10000 non-null  int64  
 2   rating     10000 non-null  float64
 3   timestamp  10000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 312.6 KB


# Checking for duplicates

In [None]:
# duplicates in ratings

duplicates = ratings[ratings.duplicated()]
print(f"Number of duplicate rows: {len(duplicates)}")

Number of duplicate rows: 0


In [None]:
#duplicates in movies

duplicates = movies[movies.duplicated()]
print(f"Number of duplicate rows in movies: {len(duplicates)}")

Number of duplicate rows in movies: 0


In [None]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [None]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807


In [None]:
movies.shape

(10329, 3)

In [None]:
ratings.shape

(10000, 4)

In [None]:
movies = movies.merge(ratings,on = 'movieId')

In [None]:
movies.head(2)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039


In [None]:

movies['rating'].describe()

count    10000.000000
mean         3.382550
std          1.159532
min          0.500000
25%          3.000000
50%          3.500000
75%          4.000000
max          5.000000
Name: rating, dtype: float64

In [None]:
#  Finding unique users and movies

unique_users = movies['userId'].nunique()
unique_movies = movies['movieId'].nunique()

print(f"There are {unique_users} unique users and {unique_movies} unique movies in the dataset.")

There are 88 unique users and 3232 unique movies in the dataset.


In [None]:
#  Average rating and Total movies at genre level.

average_ratings_by_genre = movies.groupby('genres')['rating'].mean()
total_movies_by_genre = movies.groupby('genres')['movieId'].count()

results = pd.DataFrame({
    'Average Rating': average_ratings_by_genre,
    'Total Movies': total_movies_by_genre
})

print(results.head())

                                                    Average Rating  \
genres                                                               
Action                                                    2.347826   
Action|Adventure                                          3.565574   
Action|Adventure|Animation                                4.250000   
Action|Adventure|Animation|Children|Comedy                3.647059   
Action|Adventure|Animation|Children|Comedy|Fantasy        3.500000   

                                                    Total Movies  
genres                                                            
Action                                                        23  
Action|Adventure                                              61  
Action|Adventure|Animation                                     2  
Action|Adventure|Animation|Children|Comedy                    17  
Action|Adventure|Animation|Children|Comedy|Fantasy             2  


In [None]:
#  Unique genres considered

unique_genres = movies['genres'].str.split('|').explode().unique()
print(f"There are {len(unique_genres)} unique genres in the dataset.")

There are 19 unique genres in the dataset.


## Popularity-based recommender system

In [None]:
def recommend_movies(genre, threshold, n):


  filtered_movies = movies[movies['genres'].str.contains(genre)]
  filtered_movies = filtered_movies[filtered_movies['rating'] >= threshold]
  filtered_movies = filtered_movies.groupby('title')['rating'].mean().sort_values(ascending=False)

  top_n_movies = filtered_movies.head(n)

  results = pd.DataFrame({
      'Title': top_n_movies.index,
      'Rating': top_n_movies.values,
      'Total Ratings': movies.groupby('title')['rating'].count()[top_n_movies.index]
  })

  return results

# Example usage
recommended_movies = recommend_movies(genre='Comedy', threshold=4.0, n=5)
recommended_movies


Unnamed: 0_level_0,Title,Rating,Total Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
¡Three Amigos! (1986),¡Three Amigos! (1986),5.0,2
Love in the Afternoon (1957),Love in the Afternoon (1957),5.0,1
"Hard Day's Night, A (1964)","Hard Day's Night, A (1964)",5.0,1
Harvey (1950),Harvey (1950),5.0,2
Ice Age 4: Continental Drift (2012),Ice Age 4: Continental Drift (2012),5.0,1


## Content-based recommender system

In [None]:
#Create a content-based recommender system that recommends top N movies based on similar movie(m) genres and display only S No and movie title

import pandas as pd
def content_based_recommender(movie_id, n):
  # Get the genres of the input movie
  movie_genres = movies[movies['movieId'] == movie_id]['genres'].values[0].split('|')

  # Find movies with similar genres
  similar_movies = movies[movies['genres'].str.contains('|'.join(movie_genres))]

  # Calculate similarity scores based on genre overlap
  similarity_scores = similar_movies['genres'].apply(lambda x: len(set(x.split('|')) & set(movie_genres)))

  # Sort movies by similarity score and return top N
  top_n_movies = similar_movies.loc[similarity_scores.sort_values(ascending=False).index][:n]

  # Return only S No and movie title
  results = pd.DataFrame({
      'S No': range(1, n + 1),
      'Movie Title': top_n_movies['title']
  })

  return results

# Example usage
recommended_movies = content_based_recommender(movie_id=1, n=5)
recommended_movies


Unnamed: 0,S No,Movie Title
0,1,Toy Story (1995)
7036,2,"Monsters, Inc. (2001)"
6006,3,Toy Story 2 (1999)
6007,4,Toy Story 2 (1999)
6008,5,Toy Story 2 (1999)


# Collaborative_filtering_recommender

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286


In [None]:
## recommend top N movies based on “K” similar users for a target user “u” display only S No and movie title

import pandas as pd
def collaborative_filtering_recommender(user_id, n):
  # Get the movies rated by the user
  user_movies = ratings[ratings['userId'] == user_id]['movieId']

  # Find similar users based on movie ratings
  similar_users = ratings[ratings['movieId'].isin(user_movies)]
  similar_users = similar_users.groupby('userId')['rating'].mean().sort_values(ascending=False)

  # Select top K similar users
  top_k_users = similar_users.index[:n]

  # Find movies rated by similar users but not by the target user
  recommended_movies = movies[movies['movieId'].isin(similar_users.index)]
  recommended_movies = recommended_movies[~recommended_movies['movieId'].isin(user_movies)]

  # Calculate average rating for each recommended movie
  average_ratings = recommended_movies.groupby('title')['rating'].mean().sort_values(ascending=False)

  # Select top N movies based on average rating
  top_n_movies = average_ratings.index[:n]

  # Return only S No and movie title
  results = pd.DataFrame({
      'S No': range(1, n + 1),
      'Movie Title': top_n_movies
  })

  return results

# Example usage
recommended_movies = collaborative_filtering_recommender(user_id=1, n=5)
recommended_movies


Unnamed: 0,S No,Movie Title
0,1,Sense and Sensibility (1995)
1,2,"City of Lost Children, The (Cité des enfants p..."
2,3,Mr. Holland's Opus (1995)
3,4,Dead Man Walking (1995)
4,5,Leaving Las Vegas (1995)


## GUI interface - Popularity based recommender system

In [None]:
#  A GUI interface using Python libraries (ipywidgets etc.) for popularity based recommender system and display output as dataframe

def on_value_change(change):
  with output:
    if change['name'] == 'genre':
      threshold_slider.max = movies[movies['genres'].str.contains(change['new'])].rating.max()
    recommended_movies = recommend_movies(genre=genre_toggle_buttons.value, threshold=threshold_slider.value, n=num_movies_slider.value)
    display(recommended_movies)

genre_toggle_buttons = ToggleButtons(
    options=unique_genres,
    description='Genre:',
    style={'description_width': 'initial'},
    button_style='info'
)
genre_toggle_buttons.observe(on_value_change, names='value')

threshold_label = Label(value='Minimum rating:')
threshold_slider = FloatSlider(
    min=0,
    max=5,
    step=0.5,
    description='',
    style={'description_width': 'initial'},
)
threshold_slider.observe(on_value_change, names='value')

num_movies_label = Label(value='Number of movies:')
num_movies_slider = IntSlider(
    min=1,
    max=50,
    step=1,
    description='',
    style={'description_width': 'initial'},
)
num_movies_slider.observe(on_value_change, names='value')

output = Output()

VBox([
    HBox([genre_toggle_buttons]),
    HBox([threshold_label, threshold_slider, num_movies_label, num_movies_slider]),
    output
])


VBox(children=(HBox(children=(ToggleButtons(button_style='info', description='Genre:', options=('Adventure', '…