In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the pre-cleaned movie metadata; missing overviews become "" so that
# every row carries a string in the 'overview' column.
movies = pd.read_csv("movies_metadata_cleaned.csv").assign(
    overview=lambda frame: frame['overview'].fillna("")
)

# Sanity check: show the first rows of the two columns of interest
print(movies.loc[:, ['title', 'overview']].head())


                         title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3  Father of the Bride Part II   
4                         Heat   

                                            overview  
0  Led by Woody, Andy's toys live happily in his ...  
1  When siblings Judy and Peter discover an encha...  
2  A family wedding reignites the ancient feud be...  
3  Just when George Banks has recovered from his ...  
4  Obsessive master thief, Neil McCauley leads a ...  


In [3]:
# Turn each movie overview into a TF-IDF vector (English stop words removed)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['overview'])

# Pairwise cosine similarity of every movie against every other movie.
# With the second argument omitted, the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

# Expected shape: (num_movies, num_movies)
print("Cosine Similarity Matrix Shape:", cosine_sim.shape)


Cosine Similarity Matrix Shape: (9115, 9115)


In [4]:
def recommend_similar_movies(movie_title, n=10, movies_df=None, similarity=None):
    """Find movies similar to the given title using content-based filtering.

    Args:
        movie_title: Exact title to look up in the ``'title'`` column. If the
            title occurs more than once, the first matching row is used.
        n: Maximum number of recommendations to return.
        movies_df: DataFrame with at least ``'title'`` and ``'id'`` columns.
            Defaults to the notebook-level ``movies`` frame.
        similarity: Square, row-indexable similarity matrix whose row/column
            ``i`` corresponds to the ``i``-th row (by position) of
            ``movies_df``. Defaults to the notebook-level ``cosine_sim``.

    Returns:
        A DataFrame with the ``'title'`` and ``'id'`` columns of the most
        similar movies, ordered from most to least similar, or the string
        ``"Movie not found in dataset."`` if the title is absent.
    """
    # Fall back to the notebook globals only when the caller passes nothing.
    if movies_df is None:
        movies_df = movies
    if similarity is None:
        similarity = cosine_sim

    # Boolean mask of rows whose title matches; missing titles never match.
    title_hits = (movies_df['title'] == movie_title).to_numpy(dtype=bool, na_value=False)
    if not title_hits.any():
        return "Movie not found in dataset."

    # Use the *positional* row number (first match), not the index label:
    # the similarity matrix and ``iloc`` below are both position-based, so a
    # label would be wrong for any frame without a default RangeIndex.
    idx = int(title_hits.argmax())

    # Drop the query movie explicitly by position. Assuming it sorts first is
    # unsafe: a movie with an empty overview has similarity 0 to itself, and a
    # movie with a duplicate overview ties at the top with another row.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity[idx])
        if position != idx
    ]

    # Highest score first; the sort is stable, so ties keep dataset order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in candidates[:max(n, 0)]]

    return movies_df.iloc[movie_indices][['title', 'id']]

# Demo: the ten movies whose overviews are closest to "The Godfather"
similar_movies = recommend_similar_movies(movie_title="The Godfather", n=10)
print(similar_movies)


                        title      id
597    The Godfather: Part II     240
992   The Godfather: Part III     242
7045               Blood Ties  190955
4163                 Election   18747
8832            Live by Night  259695
1532           American Movie   14242
5788               Easy Money   29920
2209                     Made   15745
7266      The Look of Silence  267480
6339                 Sinister   82507
