<a href="https://colab.research.google.com/github/ShreyasCode1223/Outrix_Tasks/blob/main/Movie_Rec_Sys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the dataset. Make sure you have uploaded tmdb_5000_movies.csv.zip to your Colab session.
# The path lives in one constant so the error message below always names the file actually read.
DATA_PATH = '/content/tmdb_5000_movies.csv.zip'

try:
    df_movies = pd.read_csv(DATA_PATH)
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file '{DATA_PATH}' was not found.")
    print("Please upload the file to your Google Colab session.")
    # Re-raise instead of calling exit(): the statements below need df_movies, so the cell
    # must stop here with the original error rather than carry on without the DataFrame.
    raise

print("\nColumns in the original DataFrame:")
print(df_movies.columns)

Dataset loaded successfully!

Columns in the original DataFrame:
Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [None]:
# Function to safely parse a stringified list of dictionaries
def parse_features(features_string):
    """Extract the ``name`` values from a stringified list of dictionaries.

    Parameters
    ----------
    features_string : object
        Expected to be text such as ``'[{"id": 28, "name": "Action"}]'``.
        Anything that is not a ``str`` (for example a missing value) is
        treated as having no features.

    Returns
    -------
    str
        The ``name`` values joined by single spaces, in their original order.
        An empty string is returned when the input is not a string, cannot be
        parsed as a Python literal, or does not evaluate to a list/tuple.
        Entries that are not dictionaries, or that lack a string ``name``,
        are skipped instead of raising.
    """
    if not isinstance(features_string, str):
        return ''

    try:
        # literal_eval only evaluates Python literals, never arbitrary code
        parsed = ast.literal_eval(features_string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions literal_eval is documented to raise on malformed input
        return ''

    # A valid literal can still be the wrong shape (a number, a dict, a bare string, ...)
    if not isinstance(parsed, (list, tuple)):
        return ''

    return ' '.join(
        entry['name']
        for entry in parsed
        if isinstance(entry, dict) and isinstance(entry.get('name'), str)
    )

# Convert the stringified 'genres' and 'keywords' columns into space-separated names.
# NOTE: this overwrites the raw columns of df_movies. parse_features returns '' for text that
# is not a valid Python literal, so re-running this cell without reloading the CSV first
# would blank both columns.
df_movies['genres'] = df_movies['genres'].apply(parse_features)
df_movies['keywords'] = df_movies['keywords'].apply(parse_features)

# Combine relevant features into a single 'tags' column
# We will use 'title', 'genres', and 'keywords' for richer content.
# The two ' ' separators are always present, so the result is stripped; otherwise a row with
# no usable text would be '  ' and could never match the empty-string filter below.
df_movies['tags'] = (
    df_movies['title'].fillna('')
    + ' ' + df_movies['genres'].fillna('')
    + ' ' + df_movies['keywords'].fillna('')
).str.strip()

# Create a clean DataFrame with just the columns we need
df_rec = df_movies[['title', 'tags']].copy()

# Drop rows whose 'tags' are missing or empty, then renumber the rows 0..n-1 so that index
# labels and row positions agree (later cells index a NumPy matrix by row position).
df_rec = df_rec.dropna(subset=['tags'])
df_rec = df_rec[df_rec['tags'] != ''].reset_index(drop=True)

print("\nProcessed DataFrame (first 5 rows):")
print(df_rec.head())
print(f"\nNumber of movies for recommendation: {len(df_rec)}")


Processed DataFrame (first 5 rows):
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                                tags  
0  Avatar Action Adventure Fantasy Science Fictio...  
1  Pirates of the Caribbean: At World's End Adven...  
2  Spectre Action Adventure Crime spy based on no...  
3  The Dark Knight Rises Action Crime Drama Thril...  
4  John Carter Action Adventure Science Fiction b...  

Number of movies for recommendation: 4803


In [None]:
# Initialize a TF-IDF Vectorizer to convert text to numbers (English stop words are ignored)
tfidf = TfidfVectorizer(stop_words='english')

# Apply the vectorizer on the 'tags' column to create the feature matrix (one row per movie)
tfidf_matrix = tfidf.fit_transform(df_rec['tags'])

# Calculate the cosine similarity matrix, which measures the similarity between movies.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product computed by
# linear_kernel is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a reverse map of movie titles to their row positions for quick lookup.
# - The values are positions (0..n-1), not index labels, because cosine_sim is a NumPy array
#   and is addressed by position.
# - Series.drop_duplicates() would de-duplicate the values (already unique), not the titles,
#   so repeated titles are removed via the index, keeping the first occurrence. This makes
#   indices[title] always return a single integer.
indices = pd.Series(np.arange(len(df_rec)), index=df_rec['title'])
indices = indices[~indices.index.duplicated(keep='first')]

print("\nCosine similarity matrix shape:", cosine_sim.shape)


Cosine similarity matrix shape: (4803, 4803)


In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df_rec, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title to look up in the module-level ``indices`` map.
    cosine_sim : array-like
        Square similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the matrix computed earlier.
    df : pandas.DataFrame
        Frame with a ``title`` column whose row order matches ``cosine_sim``.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        The ``title`` values of the ``top_n`` most similar movies, best match
        first, or an explanatory message string when ``title`` is unknown.
    """
    # Check if the movie title exists in our database
    if title not in indices:
        return f"Movie '{title}' not found in the database. Please check the spelling."

    # Get the row number of the movie that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A title that occurs more than once yields a Series; use its first occurrence
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie with its similarity score. The query movie is excluded
    # explicitly: dropping "the first sorted entry" is only correct when no other movie
    # ties with it for the top score.
    sim_scores = [pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx]

    # Sort by score, highest first (sorted() is stable, so ties keep row order)
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Keep the requested number of best matches; a negative top_n yields an empty result
    top_matches = sim_scores[:max(int(top_n), 0)]
    movie_indices = [position for position, _ in top_matches]

    # Return the titles of the most similar movies
    return df['title'].iloc[movie_indices]

# Test the recommender with a couple of movies; add titles to the tuple to try more
for demo_title in ('The Avengers', 'The Dark Knight Rises'):
    print(f"\nRecommended movies for '{demo_title}':")
    print(get_recommendations(demo_title))


Recommended movies for 'The Avengers':
7                  Avengers: Age of Ultron
182                                Ant-Man
79                              Iron Man 2
85     Captain America: The Winter Soldier
26              Captain America: Civil War
Name: title, dtype: object

Recommended movies for 'The Dark Knight Rises':
65      The Dark Knight
119       Batman Begins
1359             Batman
210      Batman & Robin
428      Batman Returns
Name: title, dtype: object
