In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the dataset
movies_df = pd.read_csv('Movies.csv')

In [3]:
# Data Preprocessing
# Remove the unnamed column
movies_df = movies_df.drop(columns=['Unnamed: 0'], errors='ignore')
# Filter for English language movies
movies_df = movies_df[movies_df['original_language'] == 'en'].copy()
# Remove the overview and tagline columns
movies_df = movies_df.drop(columns=['overview', 'tagline'])
# Handle missing values (you might want to explore more sophisticated imputation)
movies_df.fillna('', inplace=True)

  movies_df.fillna('', inplace=True)


In [4]:
# Feature Engineering: Combine relevant text features
movies_df['combined_features'] = movies_df['original_title'] + ' ' + movies_df['genre'].apply(lambda x: ' '.join(eval(x)))

In [5]:
# Text Vectorization using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['combined_features'])

In [6]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
# Create a reverse mapping of movie titles to indices
indices = pd.Series(movies_df.index, index=movies_df['original_title']).drop_duplicates()

In [8]:
def get_recommendations(title, cosine_sim=cosine_sim, movies_df=movies_df, indices=indices):
    try:
        # Get the index of the movie that matches the title
        idx = indices[title]

        # Get the pairwise similarity scores of all movies with that movie
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores of the 10 most similar movies
        sim_scores = sim_scores[1:11]  # Exclude the input movie itself

        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]

        # Return the top 10 most similar movies
        recommendations_df = movies_df[['original_title', 'vote_average', 'vote_count']].iloc[movie_indices]
        return recommendations_df
    except KeyError:
        print(f"Error: Movie '{title}' not found in the dataset.")
        return None

In [9]:
# Example Usage
movie_title = "Avatar"
recommendations = get_recommendations(movie_title)

if recommendations is not None:
    print(f"Recommendations for '{movie_title}':")
    print(recommendations)

Recommendations for 'Avatar':
                           original_title  vote_average  vote_count
175              The Amazing Spider-Man 2           6.4      9866.0
3210               The Amazing Spider-Man           5.4        67.0
140                          Spider-Man 3           6.3     10473.0
1163                         Spider-Man 2           7.1     11527.0
3948     Spider-Man 2: Making the Amazing           6.5        29.0
186                            Spider-Man           7.2     14296.0
252     Spider-Man: Into the Spider-Verse           8.4     10126.0
861   Spider-Man: Across the Spider-Verse           0.0         0.0
5632              Spider-Man Strikes Back           5.2        30.0
9113                         Amazing Love           7.3        13.0
