# In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt

def load_data(filepath):
    """Read a tab-separated, header-less ratings file into a DataFrame.

    Args:
        filepath: Path of the ratings file to read.

    Returns:
        A DataFrame whose columns are named 'user_id', 'movie_id',
        'rating' and 'unix_timestamp', in that order.
    """
    column_names = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(
        filepath,
        sep='\t',
        names=column_names,
        encoding='latin-1',
    )
    return ratings

def load_item_data(filepath='ml-100k/u.item'):
    """Load the item table and build TF-IDF features from its genre columns.

    Args:
        filepath: Path of the pipe-separated item file. Defaults to
            'ml-100k/u.item', the location the script has always used, so
            existing zero-argument callers are unaffected.

    Returns:
        A tuple ``(items, tfidf_features)`` where ``items`` is the DataFrame
        read from ``filepath`` (first 24 columns only) and ``tfidf_features``
        is a dense NumPy array with one row per item, in the same row order
        as ``items``, and one column per genre column.
    """
    i_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
              'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', "Children's",
              'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
              'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    items = pd.read_csv(filepath, sep='|', names=i_cols, encoding='latin-1',
                        usecols=range(len(i_cols)))

    # Columns 0-4 are metadata (id, title, dates, URL); everything from
    # 'unknown' onward is a genre column. Missing entries are treated as 0.
    genre_matrix = items.iloc[:, 5:].fillna(0)

    tfidf_transformer = TfidfTransformer(smooth_idf=True)
    # fit_transform returns a sparse matrix; densify it so callers can index
    # rows with ordinary NumPy fancy indexing.
    tfidf_features = tfidf_transformer.fit_transform(genre_matrix).toarray()
    return items, tfidf_features

def main():
    """Evaluate a genre-based Ridge rating predictor over the five ml-100k folds.

    For each fold ``k`` in 1..5 the function reads ``ml-100k/u{k}.base`` and
    ``ml-100k/u{k}.test``, fits a Ridge model that maps each movie's TF-IDF
    genre vector to the standardized rating, predicts the test ratings, and
    records the RMSE together with the fraction of predictions that land
    within 0.5 of the true rating. The mean and standard deviation of both
    metrics across the folds are printed.
    """
    # Only the feature matrix is needed; ratings files already carry the
    # 'movie_id' and 'rating' columns used below, so the item table itself
    # is not joined onto them.
    _, items_tfidf = load_item_data()

    rmses = []
    accuracies = []

    # 5-fold cross-validation process
    for k in range(1, 6):
        train_data = load_data(f'ml-100k/u{k}.base')
        test_data = load_data(f'ml-100k/u{k}.test')

        # Look up each rating's feature row by position.
        # NOTE(review): this assumes row i of the item file describes
        # movie_id i + 1 (ids contiguous from 1, file in id order) - confirm
        # if a different item file is ever used.
        X_train = items_tfidf[train_data['movie_id'].to_numpy() - 1]
        X_test = items_tfidf[test_data['movie_id'].to_numpy() - 1]

        # A fresh scaler per fold makes it explicit that target statistics
        # come from this fold's training ratings only.
        scaler = StandardScaler()
        y_train = scaler.fit_transform(train_data[['rating']]).ravel()
        y_test = test_data['rating'].to_numpy()

        # Model training
        model = Ridge(alpha=0.01)
        model.fit(X_train, y_train)

        # Predict in standardized space, then map back to the rating scale.
        scaled_preds = model.predict(X_test).reshape(-1, 1)
        preds = scaler.inverse_transform(scaled_preds).ravel()

        rmses.append(sqrt(mean_squared_error(y_test, preds)))
        accuracies.append(np.mean(np.abs(preds - y_test) <= 0.5))

    # Output results
    print(f"Mean RMSE across 5 folds: {np.mean(rmses):.3f} ± {np.std(rmses):.3f}")
    print(f"Accuracy (within 0.5 of true rating): {np.mean(accuracies):.3f} ± {np.std(accuracies):.3f}")

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Output:
# Mean RMSE across 5 folds: 1.107 ± 0.015
# Accuracy (within 0.5 of true rating): 0.334 ± 0.004
