# Recommendation System: Matrix Factorization (SVD)

This notebook demonstrates a collaborative-filtering recommendation system built with truncated Singular Value Decomposition (SVD) on the MovieLens `ml-latest-small` dataset (about 100,000 ratings).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import os

# Locate the two CSV files inside the dataset directory, then read each one.
DATA_DIR = "ml-latest-small"
movies_csv = os.path.join(DATA_DIR, "movies.csv")
ratings_csv = os.path.join(DATA_DIR, "ratings.csv")

movies = pd.read_csv(movies_csv)
ratings = pd.read_csv(ratings_csv)

print(f"Loaded {len(movies)} movies and {len(ratings)} ratings.")
# Keep the preview as the cell's final expression so the notebook renders it.
ratings.head()

## 1. Data Processing
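We convert the long-format ratings table into a user-item matrix with one row per user and one column per movie. Movies a user has not rated are filled with 0.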

In [None]:
# Reshape the long ratings table to wide form: one row per userId, one column per movieId.
ratings_wide = ratings.pivot(index='userId', columns='movieId', values='rating')
# User/movie pairs without a rating come out of the pivot as NaN; store them as 0.
user_item_matrix = ratings_wide.fillna(0)
print(f"User-Item Matrix Shape: {user_item_matrix.shape}")

## 2. Matrix Factorization (SVD)
We decompose the large user-item matrix into lower-dimensional latent factors.

In [None]:
# Number of latent dimensions kept by the decomposition.
n_factors = 20

# Fit the truncated SVD and project every user row onto the latent factors
# in a single call; the seed keeps the result repeatable across runs.
svd = TruncatedSVD(random_state=42, n_components=n_factors)
latent_matrix = svd.fit_transform(user_item_matrix)

# The fitted components hold one row per factor and one column per movie.
item_factors = svd.components_

print(f"Latent matrix shape (Users x Factors): {latent_matrix.shape}")
print(f"Item factors shape (Factors x Movies): {item_factors.shape}")

## 3. Evaluation: Matrix Reconstruction Error
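We check how well the compressed representation can reconstruct the original ratings. Because the error is measured on the same ratings that were used to fit the factorization, it is a reconstruction error rather than a held-out prediction error.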

In [None]:
# Multiply the user factors by the item factors to rebuild a dense
# users x movies matrix of approximate ratings.
reconstructed_matrix = latent_matrix.dot(item_factors)

# Score only the cells that hold a real rating; cells equal to 0 are the
# placeholders written by fillna(0) when the matrix was built.
mask = user_item_matrix > 0
observed_cells = mask.to_numpy()
original_ratings = user_item_matrix.to_numpy()[observed_cells]
predicted_ratings = reconstructed_matrix[observed_cells]

# RMSE is the square root of the mean squared difference over those cells.
mse = mean_squared_error(original_ratings, predicted_ratings)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

## 4. Explained Variance
How much information is captured by our latent factors?

In [None]:
# Share of the input matrix's variance captured by all fitted components together.
explained_variance = svd.explained_variance_ratio_.sum()
print(f"Total Explained Variance with {n_factors} factors: {explained_variance*100:.2f}%")

# Running total of explained variance after 1, 2, ..., n components.
cumulative_variance = np.cumsum(svd.explained_variance_ratio_)
# Plot against 1..n so the x-axis really is the number of components.
# Without explicit x values matplotlib uses 0..n-1, which would draw the
# value for k components at x = k - 1.
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(component_counts, cumulative_variance)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('SVD Explained Variance')
ax.grid(True)
plt.show()