Movie Recommendation (Collaborative Filtering)

In [1]:
import pandas as pd
# Load the movie catalogue from movies.csv in the current working directory.
movies = pd.read_csv("movies.csv")
# Last expression of the cell: preview the first rows as a quick sanity check.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
import re
def clean_title(title):
    """Normalise a movie title so it can be matched as plain text.

    Accented letters are folded to their unaccented base letter (for
    example "é" becomes "e"); every remaining character that is not an
    ASCII letter, an ASCII digit or a space is removed.

    Args:
        title: Raw title string, e.g. "Toy Story (1995)".

    Returns:
        The cleaned title string, e.g. "Toy Story 1995".
    """
    # Function-scope import: this is the only place in the notebook that needs it.
    import unicodedata

    # NFD (canonical decomposition) splits an accented letter into its base
    # letter plus a combining mark. The regex below then drops the mark but
    # keeps the letter, so "Amélie" becomes "Amelie" instead of "Amlie".
    # NFD is used rather than NFKD so that compatibility characters such as
    # "½" are still removed as a whole instead of being expanded into digits.
    title = unicodedata.normalize("NFD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", title)
# Store a normalised copy of each title in a new column; the TF-IDF index
# built in a later cell is fitted on this column.
movies["clean_title"] = movies["title"].map(clean_title)
# Bare expression so the notebook renders the updated frame.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017
9739,193585,Flint (2017),Drama,Flint 2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the cleaned titles; ngram_range=(1,2) indexes single words
# as well as pairs of adjacent words.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit the vocabulary on the catalogue and build the title-by-term matrix that
# search() compares query vectors against.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def search(title, top_n=5):
    """Return the catalogue rows whose titles best match a free-text query.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    module-level ``vectorizer`` and compared by cosine similarity against
    the module-level ``tfidf`` matrix.

    Args:
        title: Free-text movie title typed by the user.
        top_n: Maximum number of rows to return (default 5). Values larger
            than the catalogue are clamped; values <= 0 give an empty frame.

    Returns:
        A slice of ``movies`` ordered from most to least similar, so
        ``.iloc[0]`` is always the best match. When fewer than ``top_n``
        titles share any term with the query, the remaining rows have zero
        similarity and are effectively arbitrary.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp k: argpartition raises if asked for more elements than exist.
    top_n = max(0, min(top_n, similarity.size))
    if top_n == 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the last top_n positions hold the
    # top_n largest values -- in arbitrary order. Sort just those candidates
    # by descending similarity so the ranking is real, not accidental.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranking = np.argsort(-similarity[candidates], kind="stable")
    indices = candidates[ranking]

    return movies.iloc[indices]

In [5]:
# Hard-coded sample movie id; a later cell reassigns movie_id from the search result.
movie_id = 89745
# Per-user ratings table; find_similar_movies() reads its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=5):
    """Recommend movies liked by the users who also liked ``movie_id``.

    Reads the module-level ``ratings`` (userId, movieId, rating) and
    ``movies`` (movieId, title, genres) frames.

    Steps:
      1. "Similar users" are those who rated ``movie_id`` above ``min_rating``.
      2. For every movie those users also rated above ``min_rating``, compute
         the share of similar users who liked it; keep movies whose share is
         above ``min_share``.
      3. For the kept movies, compute the share among all users who liked at
         least one of them.
      4. score = similar share / overall share, so movies that are
         disproportionately popular with similar users rank first.

    Args:
        movie_id: Id of the seed movie.
        min_rating: A rating counts as a "like" when strictly greater than
            this value (default 4).
        min_share: Minimum fraction (exclusive) of similar users that must
            like a movie for it to be considered (default 0.10).
        top_n: Number of recommendations to return (default 5).

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, best
        score first. The seed movie itself can appear in the result. Empty
        when nobody rated ``movie_id`` above ``min_rating``.
    """
    # Apply the "liked" filter once instead of re-scanning all ratings three times.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # No like-minded users: return an explicit empty result rather than
        # relying on division by zero over empty Series.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of similar users who liked each movie.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of the wider audience (anyone who liked a candidate) per movie.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The index of rec_percentages holds movieId values, hence left_index/right_on.
    top = rec_percentages.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [6]:
# Ask for a free-text title; input() blocks until the user answers, so this
# cell is interactive.
movie_name = input("Enter movie name :")
results = search(movie_name)
# Use the first row of the search results as the seed movie for the recommendations.
movie_id = results.iloc[0]["movieId"]
similar_movies = find_similar_movies(movie_id)

In [7]:
# Show the title matches first, then the recommendations computed from the
# first match in the previous cell.
print("Search Results for Movie:", movie_name)
display(results)
print("\nTop Similar Movies:")
display(similar_movies)

Search Results for Movie: Avatar


Unnamed: 0,movieId,title,genres,clean_title
7212,72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,Avatar 2009
3247,4389,Lost and Delirious (2001),Drama,Lost and Delirious 2001
3246,4388,Scary Movie 2 (2001),Comedy,Scary Movie 2 2001
3249,4392,Alice (1990),Comedy|Drama|Fantasy|Romance,Alice 1990
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy,Andrew Dice Clay Dice Rules 1991



Top Similar Movies:


Unnamed: 0,score,title,genres
7212,21.296296,Avatar (2009),Action|Adventure|Sci-Fi|IMAX
8202,17.037037,"World's End, The (2013)",Action|Comedy|Sci-Fi
7881,15.972222,"Dictator, The (2012)",Comedy
6483,15.972222,28 Weeks Later (2007),Horror|Sci-Fi|Thriller
7593,15.972222,Louis C.K.: Shameless (2007),Comedy
