# In [None]:

# Notebook 2: Matrix Factorization, Evaluation & Correlation-Based Recommendations

# Import required libraries
from google.colab import drive
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

# Make the Drive contents visible under /content/drive before reading any file
drive.mount('/content/drive')

# Read the three source tables (ratings, user profiles, movie catalogue);
# the movie file is read with an explicit ISO-8859-1 encoding
ratings = pd.read_csv('/content/drive/MyDrive/Scalerdatasets/zee_rat.csv')
users = pd.read_csv('/content/drive/MyDrive/Scalerdatasets/zee_user.csv')
movies = pd.read_csv('/content/drive/MyDrive/Scalerdatasets/zee_movie.csv', encoding='ISO-8859-1')

# Normalise every table's column names to snake_case in one pass
from skimpy import clean_columns
ratings, users, movies = (
    clean_columns(frame, case='snake') for frame in (ratings, users, movies)
)

# Prepare data for matrix factorization with cmfrec.
# The interactions frame is renamed to the UserId / ItemId / Rating column
# names that cmfrec looks for.
rmraw = ratings[['userid', 'movieid', 'rating']].copy()
rmraw.columns = ['UserId', 'ItemId', 'Rating']

# Import cmfrec library for collective matrix factorization
from cmfrec import CMF

# Initialize CMF model with 3 latent factors and regularization;
# user and item bias terms are switched off.
model = CMF(k=3, lambda_=0.1, user_bias=False, item_bias=False, verbose=False)

# Fit the model on raw ratings data
model.fit(rmraw)

# Access latent factors for users and items.
# cmfrec stores fitted parameters with a trailing underscore (A_, B_);
# plain `model.A` / `model.B` do not exist and raise AttributeError.
user_factors = model.A_
item_factors = model.B_

# Inspect shape of user and item factor matrices A and B
print("User factors matrix A shape:", user_factors.shape)
print("Item factors matrix B shape:", item_factors.shape)

# Generate top-N recommendations for a sample user (userid = 4)
top_items = model.topN(user=4, n=10)
print("Top 10 recommended movie IDs for user 4:", top_items)

# Fetch movie titles for recommended movie IDs.
# Filtering with isin() alone lists the rows in catalogue order, so the rows
# are re-sorted by each movie's position in top_items to keep the ranking.
rank_of = {item_id: position for position, item_id in enumerate(top_items)}
recommended_titles = movies.loc[movies.movieid.isin(top_items), ['movieid', 'title']]
recommended_titles = recommended_titles.sort_values(
    by='movieid', key=lambda ids: ids.map(rank_of)
)
print(recommended_titles)

# Evaluate model performance with RMSE and MAPE
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

# Score only the (user, movie) pairs that actually carry a rating.
# Comparing a zero-filled user x movie pivot against a dense reconstruction
# would (a) count every unrated pair as a true rating of 0, which inflates
# RMSE and makes MAPE divide by zero, and (b) mis-pair rows/columns, because
# the factor matrices follow cmfrec's internal ID mapping rather than the
# pivot's sorted order. model.predict() resolves the IDs itself.
# NOTE: these are the same ratings the model was fitted on, so both metrics
# are in-sample errors, not a held-out estimate.
y_true = ratings['rating'].to_numpy(dtype=float)
y_pred = model.predict(
    user=ratings['userid'].to_numpy(),
    item=ratings['movieid'].to_numpy(),
)

# Calculate RMSE and MAPE
rmse_value = np.sqrt(mse(y_true, y_pred))
mape_value = mape(y_true, y_pred)

print(f"Model RMSE: {rmse_value}")
print(f"Model MAPE: {mape_value}")

# Average rating received by each movie
ratings_by_movie = ratings.groupby(['movieid'])['rating']
mean_ratings = ratings_by_movie.mean()

# User x movie table of ratings (mean-aggregated per cell); pairs without a
# rating are filled with 0, so the correlation below treats "not rated" as a
# rating of 0.
movie_ratings_matrix = ratings.pivot_table(
    values='rating',
    index='userid',
    columns='movieid',
    aggfunc='mean',
).fillna(0)

# Movie x movie Pearson correlation between the rating columns
# (Pearson is the default method of DataFrame.corr)
correlation_matrix = movie_ratings_matrix.corr()

# Function to recommend movies based on correlation similarity to a given movie
def recommend_based_on_correlation(movie_title, movies_df, corr_matrix, n=10):
    """Recommend the movies most correlated with a given movie.

    Args:
        movie_title: Text to look for in ``movies_df.title``. It is matched
            as a literal, case-insensitive substring (not a regex, so titles
            such as ``"Braveheart (1995)"`` work). When several titles
            match, the first matching row of ``movies_df`` is used.
        movies_df: DataFrame with ``movieid`` and ``title`` columns.
        corr_matrix: DataFrame whose columns and row index are movie ids and
            whose values are pairwise correlations.
        n: Maximum number of recommendations to return.

    Returns:
        A Series of titles taken from ``movies_df`` (original index labels
        kept), ordered from most to least correlated. It can hold fewer than
        ``n`` entries when fewer correlated movies exist or when some of
        them are missing from ``movies_df``.

    Raises:
        IndexError: If no title contains ``movie_title``.
        KeyError: If the matched movie has no column in ``corr_matrix``.
    """
    # regex=False: titles routinely contain "(" and ")"; na=False: rows with
    # a missing title must count as "no match" instead of breaking the mask.
    title_hits = movies_df.title.str.contains(
        movie_title, case=False, regex=False, na=False
    )
    if not title_hits.any():
        raise IndexError(f"No movie title contains {movie_title!r}")
    movie_id = movies_df.loc[title_hits, 'movieid'].iloc[0]

    if movie_id not in corr_matrix.columns:
        raise KeyError(
            f"Movie id {movie_id!r} is not present in the correlation matrix"
        )

    # Get correlations for the movie with all other movies, strongest first
    movie_corrs = corr_matrix[movie_id].dropna().sort_values(ascending=False)
    # Exclude the movie itself
    movie_corrs = movie_corrs[movie_corrs.index != movie_id]
    top_ids = movie_corrs.head(n).index

    # isin() returns rows in movies_df order, which would discard the
    # ranking; reorder the matched rows by their position in top_ids.
    rank_of = {mid: position for position, mid in enumerate(top_ids)}
    picks = movies_df[movies_df.movieid.isin(top_ids)]
    order = picks.movieid.map(rank_of).to_numpy().argsort(kind='stable')
    return picks.title.iloc[order]

# Demo: look up the titles most correlated with 'Braveheart' and print them
similar_movies = recommend_based_on_correlation(
    movie_title='Braveheart',
    movies_df=movies,
    corr_matrix=correlation_matrix,
)
print("Movies similar to Braveheart:", similar_movies, sep="\n")

# Additional exploratory visualizations for user groups

# One row per rating, joined with the rater's profile (on userid) and the
# movie's metadata (on movieid)
df = pd.merge(pd.merge(ratings, users, on='userid'), movies, on='movieid')

# Number of ratings per age value and per occupation value, largest first
age_group_rating_counts = df.groupby('age')['rating'].count().sort_values(ascending=False)
occupation_rating_counts = df.groupby('occupation')['rating'].count().sort_values(ascending=False)

# Both bar charts share the same layout; only the data, title and x label differ
for counts, chart_title, x_label in (
    (age_group_rating_counts, 'Number of Ratings by Age Group', 'Age'),
    (occupation_rating_counts, 'Number of Ratings by Occupation', 'Occupation'),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel('Number of Ratings')
    plt.show()

# Bucket every row's release year into its decade (e.g. 1994 -> 1990) and
# count the distinct movies that fall into each decade
df['decade'] = df['releaseyear'].floordiv(10).mul(10)
release_counts = df.groupby('decade')['movieid'].nunique()

# Bar chart of distinct movies per decade
plt.figure(figsize=(10, 6))
release_counts.plot.bar()
plt.title('Number of Movies Released per Decade')
plt.xlabel('Decade')
plt.ylabel('Number of Movies')
plt.show()

# Toy illustration of sparse storage with scipy.sparse: a 2x3 array with
# three non-zero cells is converted to COO (coordinate) format
import scipy.sparse as sp

dense_matrix = np.array([
    [1, 0, 3],
    [0, 0, 7],
])
sparse_matrix = sp.coo_matrix(dense_matrix)
print("Sparse matrix representation:", sparse_matrix, sep="\n")

# End of Notebook 2
