In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Read the sample movie metadata from CSV and preview the first rows.
csv_path = '/content/sample_data/movies_sample.csv'
movies = pd.read_csv(csv_path)
movies.head()

Unnamed: 0,movie_id,title,genres,cast,crew
0,1,The Matrix,Action Sci-Fi,Keanu Reeves Laurence Fishburne,Crew A
1,2,Inception,Action Thriller,Leonardo DiCaprio Joseph Gordon-Levitt,Crew B
2,3,The Notebook,Romance Drama,Ryan Gosling Rachel McAdams,Crew C
3,4,Interstellar,Sci-Fi Adventure,Matthew McConaughey Anne Hathaway,Crew D
4,5,The Godfather,Crime Drama,Marlon Brando Al Pacino,Crew E


In [6]:
# Show the column labels of the loaded frame.
movies.columns

Index(['movie_id', 'title', 'genres', 'cast', 'crew'], dtype='object')

In [7]:
# Missing genres become '' so that every row handed to the vectorizer is a string.
genre_text = movies['genres'].fillna('')
movies['genres'] = genre_text

# TF-IDF over the genre text with English stop words removed;
# fit_transform returns one row per entry of the genre column.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(genre_text)

In [8]:
# Pairwise cosine similarity between every pair of rows of tfidf_matrix.
# With a single argument, cosine_similarity compares X against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [9]:
# Create a mapping of movie title -> row label in `movies`.
# De-duplicate on the *titles* (the Series index), keeping the first occurrence.
# Series.drop_duplicates() would compare the values instead -- the row labels,
# which are already unique -- so it would never remove a repeated title, and a
# lookup of such a title would return a Series rather than a single row label.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Recommendation function
def recommend(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` mapping.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix indexed by row position; defaults to the
        module-level ``cosine_sim`` as it existed when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series
        Titles from ``movies['title']`` ordered from most to least similar.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"Movie title not found: {title!r}")

    idx = indices[title]
    # A repeated title makes the lookup return a Series; use its first entry.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Drop the queried row explicitly instead of assuming it sorts first:
    # other movies can tie at the same top score, and the stable sort would
    # then place an earlier-indexed tie ahead of the queried movie.
    sim_scores = [
        (row, score) for row, score in enumerate(cosine_sim[idx]) if row != idx
    ]
    # list.sort is stable, so ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [row for row, _ in sim_scores[:top_n]]
    return movies['title'].iloc[movie_indices]

In [10]:
# Example query: titles ranked most similar to 'Inception'.
recommend('Inception')

Unnamed: 0,title
9,Gladiator
5,The Dark Knight
0,The Matrix
2,The Notebook
3,Interstellar
