In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt

def load_data(filepath):
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    return pd.read_csv(filepath, sep='\t', names=r_cols, encoding='latin-1')

def build_model(num_users, num_movies, embedding_size=50):
    user_input = Input(shape=(1,), name='user_input')
    user_embedding = Embedding(input_dim=num_users+1, output_dim=embedding_size, name='user_embedding')(user_input)
    user_vec = Flatten(name='flatten_user')(user_embedding)

    movie_input = Input(shape=(1,), name='movie_input')
    movie_embedding = Embedding(input_dim=num_movies+1, output_dim=embedding_size, name='movie_embedding')(movie_input)
    movie_vec = Flatten(name='flatten_movie')(movie_embedding)

    concat = Concatenate()([user_vec, movie_vec])
    dense = Dense(256, activation='relu')(concat)
    batch_norm = BatchNormalization()(dense)
    dropout = Dropout(0.5)(batch_norm)
    output = Dense(1)(dropout)

    model = Model(inputs=[user_input, movie_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss='mean_squared_error')
    return model

def main():
    # Determine the maximum number of users and movies to set the embedding input dimensions
    ratings = pd.concat([load_data(f'ml-100k/u{k}.base') for k in range(1, 6)])
    num_users = ratings['user_id'].max()
    num_movies = ratings['movie_id'].max()

    model = build_model(num_users, num_movies)

    scaler = StandardScaler()

    rmses = []
    accuracies = []

    # Loop over each set of predefined splits
    for k in range(1, 6):
        train_data = load_data(f'ml-100k/u{k}.base')
        test_data = load_data(f'ml-100k/u{k}.test')

        # Preprocess ratings
        y_train = scaler.fit_transform(train_data['rating'].values.reshape(-1, 1))
        y_test = test_data['rating'].values

        # Train model
        model.fit([train_data['user_id'], train_data['movie_id']], y_train, 
                  batch_size=32, epochs=5, verbose=0)

        # Predict and evaluate
        preds = scaler.inverse_transform(model.predict([test_data['user_id'], test_data['movie_id']]))
        rmse = sqrt(mean_squared_error(y_test, preds))
        accuracy = np.mean(np.abs(preds.ravel() - y_test) <= 0.5)

        rmses.append(rmse)
        accuracies.append(accuracy)

    print(f"Mean RMSE across 5 folds: {np.mean(rmses):.3f} ± {np.std(rmses):.3f}")
    print(f"Accuracy (within 0.5 of true rating): {np.mean(accuracies):.3f} ± {np.std(accuracies):.3f}")

if __name__ == "__main__":
    main()


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 818us/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 851us/step


In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

def load_data(base_path, test_path):
    # Load base and test data
    base_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
    test_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

    base_data = pd.read_csv(base_path, sep='\t', names=base_cols)
    test_data = pd.read_csv(test_path, sep='\t', names=test_cols)

    return base_data, test_data

def prepare_features(data):
    # Assuming that we have user and movie average ratings calculated
    data['user_avg'] = data.groupby('user_id')['rating'].transform('mean')
    data['movie_avg'] = data.groupby('movie_id')['rating'].transform('mean')
    data['interaction'] = data['user_avg'] * data['movie_avg']

    return data[['user_avg', 'movie_avg', 'interaction', 'rating']]

def train_and_evaluate(train_data, test_data):
    # Prepare features
    X_train = train_data[['user_avg', 'movie_avg', 'interaction']]
    y_train = train_data['rating']
    X_test = test_data[['user_avg', 'movie_avg', 'interaction']]
    y_test = test_data['rating']

    # Train Gradient Boosting Regressor
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Calculate RMSE and accuracy
    rmse = sqrt(mean_squared_error(y_test, preds))
    accuracy = np.mean(np.abs(preds - y_test) <= 0.5)

    return rmse, accuracy

def main():
    rmses = []
    accuracies = []

    # Iterate over each of the 5 predefined splits
    for k in range(1, 6):
        base_path = f'ml-100k/u{k}.base'
        test_path = f'ml-100k/u{k}.test'
        train_data, test_data = load_data(base_path, test_path)
        
        # Feature engineering
        train_data = prepare_features(train_data)
        test_data = prepare_features(test_data)
        
        rmse, accuracy = train_and_evaluate(train_data, test_data)
        rmses.append(rmse)
        accuracies.append(accuracy)
    
    # Output mean RMSE and Accuracy across 5 folds
    print(f"Mean RMSE across 5 folds: {np.mean(rmses):.3f} ± {np.std(rmses):.3f}")
    print(f"Accuracy (within 0.5 of true rating): {np.mean(accuracies):.3f} ± {np.std(accuracies):.3f}")

if __name__ == "__main__":
    main()


Mean RMSE across 5 folds: 0.895 ± 0.010
Accuracy (within 0.5 of true rating): 0.433 ± 0.004


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt

def load_data(filepath):
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    return pd.read_csv(filepath, sep='\t', names=r_cols, encoding='latin-1')

def load_item_data():
    i_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
              'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', "Children's",
              'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
              'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1', usecols=range(24))
    tfidf_transformer = TfidfTransformer(smooth_idf=True)
    genre_matrix = items.iloc[:, 5:].fillna(0)
    tfidf_features = tfidf_transformer.fit_transform(genre_matrix).toarray()
    return items, tfidf_features

def main():
    items, items_tfidf = load_item_data()
    scaler = StandardScaler()

    rmses = []
    accuracies = []

    # 5-fold cross-validation process
    for k in range(1, 6):  # For each of the 5 folds
        train_path = f'ml-100k/u{k}.base'  # Correct file name for training data
        test_path = f'ml-100k/u{k}.test'  # Correct file name for test data
        train_data = load_data(train_path)
        test_data = load_data(test_path)

        # Merge TF-IDF features from the item data
        train_data = train_data.join(items.set_index('movie_id'), on='movie_id')
        test_data = test_data.join(items.set_index('movie_id'), on='movie_id')

        # Prepare features and targets
        X_train = items_tfidf[train_data['movie_id'] - 1]
        X_test = items_tfidf[test_data['movie_id'] - 1]
        y_train = scaler.fit_transform(train_data[['rating']])
        y_test = test_data['rating']

        # Model training
        model = Ridge(alpha=0.01)
        model.fit(X_train, y_train.ravel())

        # Predict and evaluate
        preds = scaler.inverse_transform(model.predict(X_test).reshape(-1, 1))
        rmse = sqrt(mean_squared_error(y_test, preds))
        rmses.append(rmse)
        accuracy = np.mean(np.abs(preds.ravel() - y_test) <= 0.5)
        accuracies.append(accuracy)

    # Output results
    print(f"Mean RMSE across 5 folds: {np.mean(rmses):.3f} ± {np.std(rmses):.3f}")
    print(f"Accuracy (within 0.5 of true rating): {np.mean(accuracies):.3f} ± {np.std(accuracies):.3f}")

if __name__ == "__main__":
    main()