In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the movie catalogue; later cells read the `title` and `genres` columns of this frame.
df = pd.read_csv('movies.csv')


In [27]:
# Non-null value count per column.
# NOTE(review): the recorded output lists a `content` column, which is only created by a
# later cell (df['content'] = ...). This output therefore comes from out-of-order
# execution; on a fresh top-to-bottom run `content` will not appear here.
df.count()

movieId    9742
title      9742
genres     9742
content    9742
dtype: int64

In [26]:
# Preview the first rows of the catalogue.
# NOTE(review): the recorded output already shows `content`, which is built in a later
# cell -- re-run the notebook top to bottom to refresh this output.
df.head()

Unnamed: 0,movieId,title,genres,content
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure|Animation|Children|Comedy|Fantasy To...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure|Children|Fantasy Jumanji (1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy|Romance Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy|Drama|Romance Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995),Comedy,Comedy Father of the Bride Part II (1995)


In [5]:
# Text fed to the vectorizer: the genre string, one space, then the title.
df['content'] = df['genres'].str.cat(df['title'], sep=' ')

In [12]:
# Preview the frame again to confirm the new `content` column (genres followed by title).
df.head()

Unnamed: 0,movieId,title,genres,content
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure|Animation|Children|Comedy|Fantasy To...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure|Children|Fantasy Jumanji (1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy|Romance Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy|Drama|Romance Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995),Comedy,Comedy Father of the Bride Part II (1995)


Using the TF-IDF (Term Frequency-Inverse Document Frequency) technique, we'll convert text data into a 
numerical representation that can be used for 
similarity calculations.

In [13]:
# TF-IDF vectorizer that drops scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary and IDF weights from df['content'] and returns the
# TF-IDF (Term Frequency-Inverse Document Frequency) matrix: one row per movie, one
# column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(df['content'])

In [48]:
# Show the vectorizer's configuration, then the matrix's stored entries, printed as
# (row, column) followed by the TF-IDF weight.
print(tfidf)
print(tfidf_matrix)

TfidfVectorizer(stop_words='english')
  (0, 116)	0.3462681147602057
  (0, 7698)	0.4336085282954199
  (0, 8190)	0.6285267783370819
  (0, 2853)	0.2639918503330498
  (0, 1766)	0.1462569168855382
  (0, 1569)	0.27306521182266125
  (0, 465)	0.28215755410829646
  (0, 275)	0.227543067958222
  (1, 4376)	0.7709731800692313
  (1, 116)	0.3923409565109917
  (1, 2853)	0.2991173909919543
  (1, 1569)	0.30939801220384205
  (1, 275)	0.25781890137934854
  (2, 5217)	0.4199181434585641
  (2, 5825)	0.5011982812311119
  (2, 3513)	0.6471082530862423
  (2, 6783)	0.1913506107837027
  (2, 116)	0.315239161006609
  (2, 1766)	0.13315089032190922
  (3, 2770)	0.671241749294435
  (3, 8657)	0.6064390511340177
  (3, 2448)	0.12754083492368723
  (3, 6783)	0.19848691173143362
  (3, 116)	0.32699580768905046
  (3, 1766)	0.13811666921806096
  :	:
  (9737, 465)	0.21030220513745695
  (9738, 142)	0.3703662733354052
  (9738, 8997)	0.5497487204607421
  (9738, 4794)	0.4160991947729659
  (9738, 3221)	0.4819829236580419
  (9738, 2853

In [7]:
# Compute the similarity between every pair of movies from their TF-IDF rows.
# linear_kernel returns plain dot products; they equal cosine similarities here because
# TfidfVectorizer L2-normalizes each row by default (norm='l2').
# NOTE(review): the result is a dense n_movies x n_movies array (9742 x 9742 is roughly
# 760 MB assuming float64) -- confirm it fits in memory before using a larger catalogue.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [18]:
# Each row holds one movie's similarity to every movie; the diagonal is 1.0
# (a movie compared with itself).
print(cosine_sim)

[[1.         0.35797045 0.12863151 ... 0.         0.05346934 0.01147169]
 [0.35797045 1.         0.12368123 ... 0.         0.         0.        ]
 [0.12863151 0.12368123 1.         ... 0.         0.         0.01044372]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.05346934 0.         0.         ... 0.         1.         0.        ]
 [0.01147169 0.         0.01044372 ... 0.         0.         1.        ]]


In [8]:
#Given a movie title, we'll find the most similar movies based on the cosine similarity scores.
def get_recommendations(title, cosine_sim, df, top_n=5):
    """Return the titles of the ``top_n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df['title']``. If several rows share the
        title, the first one (by position) is used.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row/column ``i`` must correspond to the
        ``i``-th row (by position) of ``df``.
    df : pandas.DataFrame
        Movie table with a ``title`` column.
    top_n : int, default 5
        Maximum number of recommendations. Values <= 0 give an empty result.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar (ties keep row order).
        The queried row itself is never part of the result.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df['title']``.
    """
    # Locate the movie by row *position*, not by index label: cosine_sim is
    # addressed positionally, so a label would select the wrong row (or fall
    # out of bounds) whenever df does not carry the default 0..n-1 index.
    positions = (df['title'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"title {title!r} not found in df['title']")
    idx = int(positions[0])

    # Pair every *other* movie's position with its similarity to the query.
    # The query row is removed explicitly rather than by slicing off the first
    # sorted entry: when another movie also scores 1.0 (identical content) and
    # sits earlier in the table, the query would not sort first, and slicing
    # would return the query itself while discarding a genuine match.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Highest similarity first. The sort is stable, so equal scores keep their
    # original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top-n matches (none when top_n <= 0).
    top_movies_indices = [pos for pos, _ in sim_scores[:max(top_n, 0)]]

    # Return the top-n most similar movies
    return df['title'].iloc[top_movies_indices]


#Test the recommendation system
#Let's test the recommendation system by providing a movie title and getting the top 5 similar movies.

In [24]:
# Example 1: the five titles closest to Jumanji (top_n is left at its default).
movie_title = 'Jumanji (1995)'
recommendations = get_recommendations(title=movie_title, cosine_sim=cosine_sim, df=df)
print(recommendations)


9636    Jumanji: Welcome to the Jungle (2017)
26                        Now and Then (1995)
0                            Toy Story (1995)
209                              Gordy (1995)
1565                         Tall Tale (1995)
Name: title, dtype: object


In [47]:
# Print the genres of the query movie and of its first three recommendations.
for checked_title in (
    'Jumanji (1995)',
    'Jumanji: Welcome to the Jungle (2017)',
    'Now and Then (1995)',
    'Toy Story (1995)',
):
    print(df.loc[df['title'] == checked_title, 'genres'])


1    Adventure|Children|Fantasy
Name: genres, dtype: object
9636    Action|Adventure|Children
Name: genres, dtype: object
26    Children|Drama
Name: genres, dtype: object
0    Adventure|Animation|Children|Comedy|Fantasy
Name: genres, dtype: object


In [28]:
# Example 2: the five titles closest to Waiting to Exhale (top_n is left at its default).
movie_title = 'Waiting to Exhale (1995)'
recommendations = get_recommendations(title=movie_title, cosine_sim=cosine_sim, df=df)
print(recommendations)

5996                Waiting... (2005)
529                   Two Much (1995)
1113       Waiting for Guffman (1996)
7444    Waiting for 'Superman' (2010)
26                Now and Then (1995)
Name: title, dtype: object


This code provides an implementation of content-based filtering for 
movie recommendations using the TF-IDF technique and cosine similarity.