In [1]:
#TF-IDF vectorizers convert text into numeric vectors, where each dimension represents a term and its value is based on the term's frequency in the document
#and its rarity across all documents.
#ngram_range=(1,2) - build features from single words and from pairs of adjacent words
#Cosine similarity - measure of similarity between two non-zero vectors of an inner product space (the cosine of the angle between them)

In [29]:
#Importing Libraries
import pandas as pd # used for data manipulation
import ipywidgets as widgets # used for creating interactive user interfaces
import numpy as np # used for numerical operations
import ipywidgets as widgets # used for creating interactive user interfaces
import re # Library for expression support and  used for searching, matching, and manipulating text based on specific patterns.
from sklearn.feature_extraction.text import TfidfVectorizer # used for converting a collection of raw documents to a matrix of TF-IDF features.
from sklearn.metrics.pairwise import cosine_similarity # used for computing the cosine similarity between two vectors.
from IPython.display import display # used for displaying objects in interactive environments.

In [3]:
# Load movies.csv (columns: movieId, title, genres) and keep it as movie_df.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs on the
# author's machine; consider a configurable data directory.
data = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\Projects\Data\MovieDataset\ml-25m\movies.csv")
# read_csv already returns a DataFrame; this wraps it in a second DataFrame object.
movie_df = pd.DataFrame(data)
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Normalise a movie title before it is vectorised
def clean_title(title):
    """Return ``title`` with every character except ASCII letters, digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [5]:
# Store the normalised form of every title in a new "clean_title" column
movie_df["clean_title"] = movie_df["title"].map(clean_title)
movie_df

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
# Fit a TF-IDF vectorizer on the cleaned titles; these two objects back the title search.
# ngram_range=(1,2) makes the vocabulary contain single words and two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2)) 
# tfidf holds one row per movie, in the same order as movie_df.
tfidf = vectorizer.fit_transform(movie_df["clean_title"])

In [7]:
#Creating search function
def search(title, top_n=5):
    """Return the rows of ``movie_df`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows of ``movie_df``, most similar first.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises when kth is out of bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movie_df.iloc[[]]

    # argpartition only guarantees *which* indices are the k largest, not their
    # order, so rank the candidates explicitly before returning them.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movie_df.iloc[ranked]

In [8]:
# Text box for typing a movie title, pre-filled with an example
movie_input = widgets.Text(value="Toy Story", description="Movie title:", disabled=False)

# Output area in which the matching movies are rendered
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search results for the newly typed text.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is the current text. Nothing is shown until more than 5 characters
    have been entered.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))

# Call on_type whenever the text box's value changes, then render the text box and the output area.
movie_input.observe(on_type, names = 'value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie title:')

Output()

In [9]:
#Recommendation System
# Load ratings.csv (columns: userId, movieId, rating, timestamp) and keep it as rating_df.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs on the
# author's machine; consider a configurable data directory.
rating = pd.read_csv(r"C:\Users\shubh\OneDrive\Desktop\Projects\Data\MovieDataset\ml-25m\ratings.csv")
# read_csv already returns a DataFrame; this wraps it in a second DataFrame object.
rating_df = pd.DataFrame(rating)
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [10]:
# Users who rated the chosen movie above 4
movieID = 1
similar_users = rating_df.loc[
    (rating_df["movieId"] == movieID) & (rating_df["rating"] > 4), "userId"
].unique()
similar_users


array([    36,     75,     86, ..., 162527, 162530, 162533])

In [11]:
# Every movie those users rated above 4 (one entry per such rating)
similar_user_recommendation = rating_df.loc[
    rating_df["userId"].isin(similar_users) & (rating_df["rating"] > 4), "movieId"
]
similar_user_recommendation

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [12]:
# Turn the liked-movie ids into the share of similar users who liked each movie,
# then keep only the movies liked by more than 10% of those users.
# NOTE(review): this cell overwrites its own input, so running it twice in a row
# would count the fractions instead of the movie ids.
similar_user_recommendation = similar_user_recommendation.value_counts() / len(similar_users)
similar_user_recommendation = similar_user_recommendation[similar_user_recommendation > .1]
similar_user_recommendation

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

In [13]:
# All ratings above 4, from any user, for the movies kept in similar_user_recommendation.
all_users = rating_df[(rating_df["movieId"].isin(similar_user_recommendation.index)) & (rating_df["rating"] > 4)]

In [14]:
# Share of the users in all_users who liked each movie: per-movie like count
# divided by the number of distinct users appearing in all_users.
all_users_recs = all_users["movieId"].value_counts() / len(all_users['userId'].unique())
all_users_recs


movieId
318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: count, Length: 113, dtype: float64

In [15]:
# Put the two shares side by side (one row per movie) so they can be compared:
# "Similar" = share among the similar users, "All" = share among all_users.
rec_percetages = pd.concat([similar_user_recommendation, all_users_recs], axis=1)
rec_percetages.columns = ["Similar", "All"]
rec_percetages

Unnamed: 0_level_0,Similar,All
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [16]:
# Recommendation score = "Similar" share divided by "All" share (higher is better);
# a score above 1 means the movie is liked more often among the similar users.
rec_percetages["score"] = rec_percetages["Similar"] / rec_percetages["All"]
# Highest score first.
rec_percetages = rec_percetages.sort_values("score",ascending=False)
rec_percetages 

Unnamed: 0_level_0,Similar,All,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [17]:
# Top 10 recommendations: take the 10 best-scoring rows and attach title/genres
# by matching the movieId index against movie_df's movieId column.
rec_percetages.head(10).merge(movie_df, left_index=True , right_on="movieId")


Unnamed: 0,Similar,All,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [21]:
#Recommendation System
def find_similar_movies(movie_id):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    Steps:
      1. Find the users who rated ``movie_id`` above 4 ("similar users").
      2. For every movie, compute the share of similar users who rated it
         above 4, and keep movies where that share exceeds 10%.
      3. Compute the same share among every user who rated any of the kept
         movies above 4.
      4. Score each movie as (share among similar users) / (share among
         those users) and return the 10 highest scores.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.

    Returns
    -------
    pandas.DataFrame
        Up to 10 rows with columns ``score``, ``title`` and ``genres``,
        best score first. Empty when nobody rated ``movie_id`` above 4.
    """
    liked = rating_df["rating"] > 4

    # Users similar to us: everyone who liked the given movie.
    similar_users = rating_df[(rating_df["movieId"] == movie_id) & liked]["userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero below.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of similar users who liked each movie, restricted to movies above 10%.
    similar_user_recommendation = rating_df[rating_df["userId"].isin(similar_users) & liked]["movieId"]
    similar_user_recommendation = similar_user_recommendation.value_counts() / len(similar_users)
    similar_user_recommendation = similar_user_recommendation[similar_user_recommendation > .10]

    # Share of users, among all who liked any shortlisted movie, that liked each one.
    all_users = rating_df[rating_df["movieId"].isin(similar_user_recommendation.index) & liked]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recommendation, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A score above 1 means the movie is liked more often by similar users.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top = rec_percentages.head(10).merge(movie_df, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]


In [32]:
#Recommendation Widget
# Text box for typing a movie title, pre-filled with an example
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)
# Output area in which the result for the typed title is rendered
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with recommendations for the newly typed title.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is the current text. Once more than 5 characters have been typed,
    the title is looked up with ``search`` and the first returned movie's
    ``movieId`` is passed to ``find_similar_movies``. Nothing is displayed
    when the search returns no rows.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            if len(results) > 0:
                # Base the recommendations on the first search hit.
                movie_id = results.iloc[0]["movieId"]
                display(find_similar_movies(movie_id))

# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names='value')

# Render the text box and the output area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()