# In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Load the cleaned dataset
data = pd.read_csv('movies_data_cleaned.csv')

# 1. Data Preparation
# Combine genres into a single string for each movie.
# NOTE(review): the positional slice assumes the one-hot genre flags occupy
# columns 6 through the second-to-last column of the CSV — confirm against
# the file's schema.
genre_columns = data.columns[6:-1]  # Exclude unnecessary columns
# Keep only the names of the columns flagged with 1 for each row.
data['combined_genres'] = data[genre_columns].apply(lambda row: ' '.join(row[row == 1].index), axis=1)

# 2. Feature Engineering
# TF-IDF Vectorization of descriptions (missing descriptions become '').
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['description'].fillna(''))

# Combine genres with descriptions into one text field per movie.
# The description must be NaN-filled here as well: string + NaN yields NaN,
# and TfidfVectorizer refuses NaN documents.
data['combined_metadata'] = data['combined_genres'] + ' ' + data['description'].fillna('')
combined_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
combined_matrix = combined_vectorizer.fit_transform(data['combined_metadata'])

# 3. Similarity Calculation
# Compute pairwise cosine similarity between all movies; row/column i of the
# result corresponds to the i-th row (by position) of `data`.
cosine_sim = cosine_similarity(combined_matrix)

# 4. Recommendation Function
def recommend_movies(movie_title, cosine_sim=cosine_sim, data=data, top_n=5):
    """
    Recommend movies similar to a given movie based on cosine similarity.

    :param movie_title: Title of the movie to base recommendations on
    :param cosine_sim: Precomputed cosine similarity matrix; row ``i`` must
        correspond to the ``i``-th row (by position) of ``data``
    :param data: Dataset containing movies (needs a 'title' column)
    :param top_n: Number of recommendations to return
    :return: List of up to ``top_n`` recommended titles, most similar first
    :raises IndexError: If ``movie_title`` does not occur in ``data['title']``
    """
    # Locate the movie by row *position*, not index label: the similarity
    # matrix is positional, so labels only work for a default RangeIndex.
    matches = np.flatnonzero((data['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found in dataset: {movie_title!r}")
    movie_pos = int(matches[0])  # first match wins when titles are duplicated

    if top_n <= 0:
        return []

    # Rank every movie by similarity to the query, highest first. The stable
    # sort keeps tied movies in dataset order.
    scores = np.asarray(cosine_sim[movie_pos]).ravel()
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly instead of assuming it is ranked first;
    # a movie with identical metadata can tie with it and sort ahead of it.
    ranked = ranked[ranked != movie_pos][:top_n]

    return data['title'].iloc[ranked].tolist()

# Smoke-test the recommender on a single known title
test_movie = "Transmorphers: Mech Beasts"
print(f"Movies similar to '{test_movie}':")
similar_titles = recommend_movies(test_movie)
print(similar_titles)

# 5. Evaluation
# Simulate a ground truth: each movie's own rating is the target, and the
# prediction is the mean rating of the movies most similar to it.
def evaluate_recommendation_system(data, cosine_sim, top_n=5):
    """
    Evaluate the recommendation system using simulated ground truth.

    For every movie, the ``top_n`` most similar other movies are looked up
    and the mean of their ratings is compared with the movie's own rating.

    :param data: Dataset containing movies (needs a numeric 'rating' column)
    :param cosine_sim: Precomputed dense cosine similarity matrix; row ``i``
        must correspond to the ``i``-th row (by position) of ``data``
    :param top_n: Number of recommendations to average per movie
    :return: Evaluation metrics (MAE, RMSE)
    :raises ValueError: If ``top_n`` is smaller than 1, fewer than two movies
        are given, or the ratings contain NaN/infinite values
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    true_ratings = data['rating'].to_numpy(dtype=float)
    similarity = np.asarray(cosine_sim)
    n_movies = len(true_ratings)

    if n_movies < 2:
        raise ValueError("need at least two movies to evaluate recommendations")
    if not np.isfinite(true_ratings).all():
        raise ValueError("ratings contain NaN or infinite values")

    predicted_ratings = np.empty(n_movies, dtype=float)
    # Iterate by position: the similarity matrix is positional, so index
    # labels from iterrows() are only valid for a default RangeIndex.
    for pos in range(n_movies):
        # Highest similarity first; the stable sort keeps ties in dataset order.
        ranked = np.argsort(-similarity[pos], kind='stable')
        # Exclude the movie itself explicitly; with tied scores it is not
        # guaranteed to be ranked first, and averaging in its own rating
        # would understate the error.
        neighbours = ranked[ranked != pos][:top_n]
        predicted_ratings[pos] = true_ratings[neighbours].mean()

    # MAE and RMSE computed directly with NumPy so the result does not depend
    # on the `squared=` keyword of sklearn's mean_squared_error, which newer
    # scikit-learn releases no longer accept.
    errors = true_ratings - predicted_ratings
    mae = float(np.mean(np.abs(errors)))
    rmse = float(np.sqrt(np.mean(errors ** 2)))

    return mae, rmse

# Score the recommender over the whole dataset and report both error metrics
mae, rmse = evaluate_recommendation_system(data=data, cosine_sim=cosine_sim)
print(f"Evaluation Results - MAE: {mae:.4f}, RMSE: {rmse:.4f}")


# Recorded cell output:
# Movies similar to 'Transmorphers: Mech Beasts':
# ['Independence Day: Resurgence', 'The Wandering Earth', 'Robot Dreams', 'Outlander', '65']
# Evaluation Results - MAE: 0.0939, RMSE: 0.1453


