In [85]:
!pip install pandas numpy scikit-learn



In [101]:
import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [116]:
# Ratings file: tab-separated with no header row, so the column names are supplied here.
ratings = pd.read_csv(
    '../data/u.data',
    sep='\t',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Item file: pipe-separated, latin-1 encoded, no header row.
# The first five columns describe the movie; the remaining ones are named after genres
# (presumably one indicator column per genre — verify against the dataset README).
movies = pd.read_csv(
    '../data/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    names=(
        ['movie_id', 'title', 'release_date', 'video_release', 'imdb_url']
        + ['unknown', 'Action', 'Adventure', 'Animation', 'Children']
        + ['Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy']
        + ['Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance']
        + ['Sci-Fi', 'Thriller', 'War', 'Western']
    ),
)

In [130]:
# User x movie table built from ALL rating rows; cells without a rating become 0.
user_item_matrix = pd.pivot(
    ratings, index='user_id', columns='movie_id', values='rating'
).fillna(value=0)

In [143]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rating rows; the fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

In [155]:
# Mean-centering
#
# Build the user x movie matrix here, from the TRAIN split only, so this cell
# does not rely on `train_matrix` having been created by a cell further down
# the notebook (which fails with NameError on Restart & Run All).
# Cells with no training rating are NaN after the pivot.
train_matrix = train_data.pivot(
    index='user_id',
    columns='movie_id',
    values='rating'
)

# Per-user average over rated movies only (mean skips NaN by default).
user_means = train_matrix.mean(axis=1)

# Subtract each user's mean, then set unrated cells to 0, i.e. "no deviation
# from the user's own average".
train_matrix_centered = train_matrix.sub(user_means, axis=0).fillna(0)

In [166]:
# Truncated SVD with 50 components; the fixed random_state makes the fit repeatable.
svd = TruncatedSVD(n_components=50, random_state=42)

# Fit on the centered training matrix and keep the transformed (latent) rows.
latent_matrix = svd.fit_transform(train_matrix_centered)

In [176]:
# Low-rank approximation of the centered matrix: latent rows x component loadings.
reconstructed_matrix = latent_matrix @ svd.components_

# Label the array like the training matrix, add each user's mean back,
# and clamp the scores to the 1-5 range.
reconstructed_df = (
    pd.DataFrame(
        reconstructed_matrix,
        index=train_matrix.index,
        columns=train_matrix.columns,
    )
    .add(user_means, axis=0)
    .clip(lower=1, upper=5)
)

In [185]:
def precision_recall_at_k(reconstructed_df, ratings, user_id, k=5, threshold=4,
                          exclude_items=None):
    """Compute Precision@k and Recall@k for a single user.

    Parameters
    ----------
    reconstructed_df : pandas.DataFrame
        Predicted scores, indexed by user id with one column per movie id.
    ratings : pandas.DataFrame
        Ground-truth ratings with ``user_id``, ``movie_id`` and ``rating`` columns.
    user_id : hashable
        User to evaluate; must be a label of ``reconstructed_df.index``.
    k : int, default 5
        Length of the recommendation list. Must be positive.
    threshold : number, default 4
        A movie is relevant when the user's rating is ``>= threshold``.
    exclude_items : iterable of movie ids, optional
        Movies removed from the candidate list before ranking (for example the
        movies the user rated in the training split). Ids that are not columns
        of ``reconstructed_df`` are ignored. ``None`` (default) ranks all columns.

    Returns
    -------
    (float, float)
        ``(precision, recall)``. Precision is ``hits / k``. Recall is
        ``hits / number of distinct relevant movies``, or ``0.0`` when the user
        has no relevant movies.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    KeyError
        If ``user_id`` is not in ``reconstructed_df.index``.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # A set keeps each relevant movie once, so duplicate rating rows cannot
    # inflate the recall denominator.
    relevant_items = set(
        ratings.loc[
            (ratings.user_id == user_id) & (ratings.rating >= threshold),
            'movie_id',
        ]
    )

    user_scores = reconstructed_df.loc[user_id]
    if exclude_items is not None:
        # errors='ignore': excluded ids may legitimately be missing from the columns.
        user_scores = user_scores.drop(labels=list(exclude_items), errors='ignore')

    recommended_items = user_scores.sort_values(ascending=False).head(k).index

    hits = len(relevant_items.intersection(recommended_items))

    precision = hits / k
    recall = hits / len(relevant_items) if relevant_items else 0.0

    return precision, recall

In [193]:
user_id = 196

# Score the top-5 list for this one user against the full ratings table.
precision, recall = precision_recall_at_k(reconstructed_df, ratings, user_id=user_id, k=5)

print("Precision@5:", precision)
print("Recall@5:", recall)

Precision@5: 0.4
Recall@5: 0.09090909090909091


In [200]:
# Collect (actual, predicted) rating pairs for the held-out rows.
# Vectorized replacement for a row-by-row iterrows() loop; the filtering is the same.

# Only rows whose user AND movie both exist in the training-based matrix can be
# scored. Other rows are skipped, so the metrics do not cover them.
known_mask = (
    test_data.user_id.isin(reconstructed_df.index)
    & test_data.movie_id.isin(reconstructed_df.columns)
)
scorable_rows = test_data[known_mask]

# Translate labels to integer positions, then read all predictions in one indexing step.
row_positions = reconstructed_df.index.get_indexer(scorable_rows.user_id.to_numpy())
col_positions = reconstructed_df.columns.get_indexer(scorable_rows.movie_id.to_numpy())
predictions = reconstructed_df.to_numpy()[row_positions, col_positions]

# Skip unrealistic predictions (a NaN prediction also fails this range check).
in_range = (predictions >= 1) & (predictions <= 5)

y_true = scorable_rows.rating.to_numpy()[in_range].tolist()
y_pred = predictions[in_range].tolist()

In [206]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# RMSE is the square root of the mean squared error; MAE is the mean absolute error.
# Both are computed over the collected (actual, predicted) pairs.
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)

print("RMSE:", rmse)
print("MAE:", mae)

RMSE: 1.0022099984187582
MAE: 0.7951547227623301


In [211]:
# ------------------------------------------------------------------
# Matrix factorization pipeline, fitted on the training split only
# ------------------------------------------------------------------
from sklearn.decomposition import TruncatedSVD

# User x movie table from the training rows; cells without a rating stay NaN.
train_matrix = pd.pivot(
    train_data, index='user_id', columns='movie_id', values='rating'
)

# Per-user average rating (mean skips NaN by default).
user_means = train_matrix.mean(axis=1)

# Center each user's row on their mean, THEN zero-fill the unrated cells so that
# "unrated" means "no deviation from the user's average".
train_matrix_centered = train_matrix.sub(user_means, axis=0).fillna(value=0)

# Truncated SVD with 50 components on the centered matrix.
svd = TruncatedSVD(n_components=50, random_state=42)
latent_matrix = svd.fit_transform(train_matrix_centered)

# Low-rank reconstruction: latent rows x component loadings.
reconstructed_matrix = np.dot(latent_matrix, svd.components_)

# Re-attach the user/movie labels, restore the user means and clamp to the 1-5 range.
reconstructed_df = (
    pd.DataFrame(
        reconstructed_matrix,
        index=train_matrix.index,
        columns=train_matrix.columns,
    )
    .add(user_means, axis=0)
    .clip(lower=1, upper=5)
)

In [213]:
user_id = 196

# Every movie this user has rated anywhere in the data (train or test rows).
already_rated = ratings[ratings.user_id == user_id]['movie_id']

# Rank the user's predicted scores after removing movies they have already rated.
# errors='ignore' is required: reconstructed_df only has columns for movies seen
# in the training split, so a movie rated only in held-out rows would otherwise
# make .drop() raise KeyError.
recommendations = (
    reconstructed_df.loc[user_id]
    .drop(already_rated, errors='ignore')
    .sort_values(ascending=False)
    .head(5)
)

for movie_id, score in recommendations.items():
    # Look the title up in the movies table by id.
    title = movies[movies.movie_id == movie_id]['title'].values[0]
    print(f"{title} → Predicted Rating: {score:.2f}")

Piano, The (1993) → Predicted Rating: 3.79
Evita (1996) → Predicted Rating: 3.77
Shining, The (1980) → Predicted Rating: 3.76
Air Force One (1997) → Predicted Rating: 3.76
Devil's Own, The (1997) → Predicted Rating: 3.76


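Evaluation Note
TruncatedSVD computes a low-rank least-squares approximation of the mean-centered
user–item matrix in which unrated entries were filled with 0; it is not fit to the
observed ratings alone. Its reconstructed scores are therefore more reliable for
ranking items than as calibrated rating predictions.
For this reason, evaluation relies primarily on Precision@K and Recall@K
rather than on RMSE and MAE.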