In [None]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.metrics import mean_absolute_error, mean_squared_error
# import numpy as np

# # Load the cleaned dataset
# data = pd.read_csv('movies_data_cleaned.csv')

# # 1. Data Preparation
# # Combine genres into a single string for each movie
# genre_columns = data.columns[6:-1]  # Exclude unnecessary columns
# data['combined_genres'] = data[genre_columns].apply(lambda row: ' '.join(row[row == 1].index), axis=1)

# # 2. Feature Engineering
# # TF-IDF Vectorization of descriptions
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# tfidf_matrix = tfidf_vectorizer.fit_transform(data['description'].fillna(''))

# # Combine TF-IDF features with genres
# data['combined_metadata'] = data['combined_genres'] + ' ' + data['description']
# combined_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# combined_matrix = combined_vectorizer.fit_transform(data['combined_metadata'])

# # 3. Similarity Calculation
# # Compute pairwise cosine similarity
# cosine_sim = cosine_similarity(combined_matrix)

# # 4. Recommendation Function
# def recommend_movies(movie_title, cosine_sim=cosine_sim, data=data, top_n=5):
#     """
#     Recommend movies similar to a given movie based on cosine similarity.
#     :param movie_title: Title of the movie to base recommendations on
#     :param cosine_sim: Precomputed cosine similarity matrix
#     :param data: Dataset containing movies
#     :param top_n: Number of recommendations to return
#     :return: List of recommended movies
#     """
#     # Find the index of the movie
#     idx = data[data['title'] == movie_title].index[0]
    
#     # Get similarity scores for all movies with the given movie
#     sim_scores = list(enumerate(cosine_sim[idx]))
    
#     # Sort movies by similarity score
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
#     # Get top_n movies excluding the input movie itself
#     top_movies = [data.iloc[i[0]]['title'] for i in sim_scores[1:top_n + 1]]
    
#     return top_movies

# # Test the recommendation function
# test_movie = "Transmorphers: Mech Beasts"
# print(f"Movies similar to '{test_movie}':")
# print(recommend_movies(test_movie))

# # 5. Evaluation
# # Simulate a ground truth by assuming movies with the same genres are good recommendations
# def evaluate_recommendation_system(data, cosine_sim, top_n=5):
#     """
#     Evaluate the recommendation system using simulated ground truth.
#     :param data: Dataset containing movies
#     :param cosine_sim: Precomputed cosine similarity matrix
#     :param top_n: Number of recommendations to return
#     :return: Evaluation metrics (MAE, RMSE)
#     """
#     true_ratings = []
#     predicted_ratings = []
    
#     for idx, row in data.iterrows():
#         sim_scores = list(enumerate(cosine_sim[idx]))
#         sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
#         # Get top_n recommendations
#         recommendations = [data.iloc[i[0]]['rating'] for i in sim_scores[1:top_n + 1]]
        
#         # Use the current movie's rating as ground truth for evaluation
#         true_rating = row['rating']
        
#         # Simulate predicted ratings as the mean of the recommended movies' ratings
#         predicted_rating = np.mean(recommendations)
        
#         true_ratings.append(true_rating)
#         predicted_ratings.append(predicted_rating)
    
#     # Compute MAE and RMSE
#     mae = mean_absolute_error(true_ratings, predicted_ratings)
#     rmse = mean_squared_error(true_ratings, predicted_ratings, squared=False)
    
#     return mae, rmse

# # Evaluate the model
# mae, rmse = evaluate_recommendation_system(data, cosine_sim)
# print(f"Evaluation Results - MAE: {mae:.4f}, RMSE: {rmse:.4f}")


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def load_movie_data(file_path):
    """
    Load movie data from a CSV file.
    """
    return pd.read_csv(file_path)

def build_recommendation_model(movies_df):
    """
    Build a content-based recommendation model using movie metadata.
    """
    # Combine relevant metadata for better similarity calculations
    movies_df['metadata'] = (movies_df['genres'] + " " + 
                             movies_df['description'] + " " + 
                             movies_df['title'])
    
    # Fill NaN values with an empty string
    movies_df['metadata'] = movies_df['metadata'].fillna("")

    # Convert metadata into TF-IDF features
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies_df['metadata'])

    # Calculate similarity matrix
    similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)
    return similarity_matrix

def get_recommendations(input_titles, movies_df, similarity_matrix, num_recommendations=10):
    """
    Provide recommendations based on the input titles.
    """
    # Find indices of input movies
    movie_indices = [movies_df[movies_df['title'] == title].index[0] for title in input_titles if title in movies_df['title'].values]

    # Aggregate similarity scores for input movies
    sim_scores = similarity_matrix[movie_indices].sum(axis=0)
    sim_scores = [(i, score) for i, score in enumerate(sim_scores)]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Filter out input movies from recommendations
    input_indices = set(movie_indices)
    recommendations = [(i, score) for i, score in sim_scores if i not in input_indices]

    # Get top recommendations
    top_recommendations = recommendations[:num_recommendations]
    recommended_titles = [movies_df.iloc[i]['title'] for i, _ in top_recommendations]

    return recommended_titles

def evaluate_model(movies_df, similarity_matrix):
    """
    Evaluate the recommendation model using MAE and RMSE.
    """
    # Create a simulated ratings matrix for evaluation
    user_movie_ratings = pd.DataFrame(index=range(len(movies_df)), columns=range(len(movies_df)))
    np.random.seed(42)  # For reproducibility
    for user in range(len(movies_df)):
        # Assign random ratings for some movies
        rated_movies = np.random.choice(len(movies_df), size=np.random.randint(5, 15), replace=False)
        user_movie_ratings.loc[user, rated_movies] = np.random.uniform(1, 5, size=len(rated_movies))
    
    user_movie_ratings = user_movie_ratings.fillna(0)  # Replace NaN with 0 for unrated movies

    # Train-test split
    train_data, test_data = train_test_split(user_movie_ratings, test_size=0.2, random_state=42)
    
    # Convert dataframes to NumPy arrays for calculations
    train_data = train_data.to_numpy()
    test_data = test_data.to_numpy()

    # Predict ratings for the test set
    predicted_ratings = []
    for user_ratings in test_data:
        pred_ratings = similarity_matrix.dot(user_ratings) / np.array([np.abs(similarity_matrix).sum(axis=1)])
        predicted_ratings.append(pred_ratings)
    predicted_ratings = np.array(predicted_ratings)

    # Filter true and predicted ratings to match dimensions
    true_indices = test_data > 0  # Boolean mask for rated movies
    y_true = test_data[true_indices]
    y_pred = predicted_ratings[true_indices]

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    return mae, rmse



# Example Usage
# Load movie data
movies_data_path = "movies_data_cleaned.csv"  # Replace with your file path
movies_df = load_movie_data(movies_data_path)

# Build the recommendation model
similarity_matrix = build_recommendation_model(movies_df)

# Input movies for recommendations
input_movies = ["Venom: The Last Dance", "The Wild Robot", "Wicked"]

# Get recommendations
recommendations = get_recommendations(input_movies, movies_df, similarity_matrix)
print("Recommended Movies:", recommendations)

# Evaluate the model
mae, rmse = evaluate_model(movies_df, similarity_matrix)
print("Model Evaluation:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Recommended Movies: ['Wicked: Part Two', 'Venom', 'Robot Dreams', 'Venom: Let There Be Carnage', 'The Adventure of A.R.I.: My Robot Friend', 'The Wizard of Oz', 'Palmer', 'Supergirl', 'I, Robot', 'Journey 2: The Mysterious Island']


  user_movie_ratings = user_movie_ratings.fillna(0)  # Replace NaN with 0 for unrated movies


IndexError: boolean index did not match indexed array along dimension 1; dimension is 1 but corresponding boolean dimension is 4028