In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from math import sqrt

In [35]:
# --- Users: pipe-separated file, one row per user ---------------------------
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# --- Ratings: tab-separated file, one row per (user, movie) rating ----------
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# --- Items: pipe-separated file; five descriptive columns followed by the
# 'unknown' flag and the genre columns (24 columns in total) ------------------
item_meta_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = item_meta_cols + genre_cols
items = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
    usecols=range(24),  # only the first 24 fields of each line are read
)

In [36]:
# Item features: every column of `items` from position 5 onward ('unknown'
# and the genre columns, as named when the file was loaded) is re-weighted
# with smoothed TF-IDF, and each row is scaled to unit L2 norm.
genre_matrix = items.iloc[:, 5:].to_numpy()
transformer = TfidfTransformer(norm='l2', smooth_idf=True)
transformer.fit(genre_matrix)
tfidf = transformer.transform(genre_matrix).toarray()  # dense, one row per row of `items`

# Optional dimension reduction (uncomment to use PCA)
# pca = PCA(n_components=12)  # Adjust components based on explained variance ratio
# tfidf = pca.fit_transform(tfidf)

In [37]:
# Attach user attributes and item attributes to every rating row.
# Both lookups are keyed on the id column of the right-hand table.
users_by_id = users.set_index('user_id')
items_by_id = items.set_index('movie_id')
data = (
    ratings
    .join(users_by_id, on='user_id')
    .join(items_by_id, on='movie_id')
)

# Standardise the rating column (zero mean, unit variance).
# Note: the scaler is fitted on every rating in `data`, i.e. before any
# train/test split is made.
scaler = StandardScaler()
rating_column = data['rating'].to_numpy().reshape(-1, 1)
data['normalized_rating'] = scaler.fit_transform(rating_column)

# 5-fold shuffled cross-validation with a fixed seed; per-fold RMSE values
# are collected in `rmse_scores`.
rmse_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [38]:
# Start from an empty score list so that re-running this cell does not append
# a second set of folds to the results of a previous run.
rmse_scores = []

# Row position of each rated movie inside `tfidf`. `tfidf` has one row per row
# of `items` (same order), so looking the ids up in `items['movie_id']` stays
# correct even when ids are not the contiguous 1..N range that the shortcut
# `movie_id - 1` silently relies on. Requires `items['movie_id']` to be unique.
movie_rows = pd.Index(items['movie_id']).get_indexer(data['movie_id'])
if (movie_rows < 0).any():
    raise KeyError("ratings reference movie_id values that are missing from items")

for train_index, test_index in kf.split(data):
    train_data, test_data = data.iloc[train_index], data.iloc[test_index]

    # Fit the target scaler on the training fold only. Using a scaler fitted on
    # the full dataset would leak the test fold's rating mean/variance into
    # training and into the de-normalisation of the predictions.
    fold_scaler = StandardScaler()
    y_train = fold_scaler.fit_transform(
        train_data['rating'].to_numpy(dtype=float).reshape(-1, 1)
    ).ravel()

    # Train the model on the genre TF-IDF features of the rated movies
    clf = Ridge(alpha=0.01)
    clf.fit(tfidf[movie_rows[train_index]], y_train)

    # Predict on the held-out fold (still in normalised units)
    preds = clf.predict(tfidf[movie_rows[test_index]])

    # Map predictions back to the original rating scale and score against the
    # untouched true ratings
    preds_denorm = fold_scaler.inverse_transform(preds.reshape(-1, 1)).ravel()
    rmse = sqrt(mean_squared_error(test_data['rating'], preds_denorm))
    rmse_scores.append(rmse)




In [39]:
# Summarise cross-validation: mean and (population) standard deviation of the
# per-fold RMSE values collected in `rmse_scores`.
fold_rmse = np.asarray(rmse_scores)
print(f"Mean RMSE: {fold_rmse.mean():.3f} ± {fold_rmse.std():.3f}")

Mean RMSE: 1.108 ± 0.005
