# Data Preprocessing

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os 
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Get the directory of each dataset 
path = os.path.dirname(os.path.abspath("Books.csv"))
book_path = path + "/datasets/Books.csv"
users_path = path + "/datasets/Users.csv"
ratings_path = path + "/datasets/Ratings.csv"

# Load the dataset
books_dataset = pd.read_csv(book_path, low_memory=False)
users_dataset = pd.read_csv(users_path, low_memory=False)
ratings_dataset = pd.read_csv(ratings_path, low_memory=False)

# Filter out users and books with fewer than 7 ratings
user_counts = ratings_dataset['User-ID'].value_counts()
valid_users = user_counts[user_counts >= 7].index

book_counts = ratings_dataset['ISBN'].value_counts()
valid_books = book_counts[book_counts >= 7].index

# Filter ratings dataset
ratings_dataset = ratings_dataset[ratings_dataset['User-ID'].isin(valid_users) & ratings_dataset['ISBN'].isin(valid_books)]

# Print trimmed dataset summaries
print(f"Trimmed Ratings Dataset: {ratings_dataset.shape[0]} entries")
print(f"Unique Users: {ratings_dataset['User-ID'].nunique()}, Unique Books: {ratings_dataset['ISBN'].nunique()}")

Trimmed Ratings Dataset: 532337 entries
Unique Users: 16842, Unique Books: 28643


In [3]:
# Merge the ratings and users dataset together based on User-ID
merged_data = users_dataset.merge(ratings_dataset, on='User-ID')

# Check for missing values in each dataset
missing_merged = merged_data.isnull().sum()
missing_books = books_dataset.isnull().sum()
print("Missing values in merged dataset: ")
print(missing_merged, end="\n\n")
print("Missing values in books dataset: ")
print(missing_books)

# Remove rows where Book-Author and Publisher are null
books_dataset.dropna(subset=['Book-Author', 'Publisher'], inplace=True)

Missing values in merged dataset: 
User-ID             0
Location            0
Age            134958
ISBN                0
Book-Rating         0
dtype: int64

Missing values in books dataset: 
ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64


# Split the Data and Get Some Baseline 

In [4]:
# Splitting the data into training and testing
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

# Calculate global mean rating, user mean rating and item mean rating
global_mean = train_data['Book-Rating'].mean()
user_means = train_data.groupby('User-ID')['Book-Rating'].mean()
item_means = train_data.groupby('ISBN')['Book-Rating'].mean()

# Print the means
print("Global Mean:", global_mean,"\n")
print("User Mean:\n",user_means,"\n")
print("Item Mean:\n",item_means,"\n")

Global Mean: 2.6563896409459247 

User Mean:
 User-ID
8         1.250000
17        1.800000
53        7.333333
99        5.142857
114       8.000000
            ...   
278774    0.000000
278838    0.000000
278843    2.181818
278851    4.000000
278854    5.833333
Name: Book-Rating, Length: 16676, dtype: float64 

Item Mean:
 ISBN
0 907 062 008    3.750000
000000000        0.666667
0000000000       4.714286
0002005018       5.200000
0002005115       1.800000
                   ...   
9871138148       5.000000
987932504        4.571429
B00009EF82       2.285714
B0000AA9IZ       1.500000
M79702002        2.166667
Name: Book-Rating, Length: 28640, dtype: float64 



# Collaborative Filtering Recommender

In [5]:
#import nearest neighbors for KNN

from sklearn.neighbors import NearestNeighbors

def collaborative_filtering(user_id, item_id, user_item_matrix, k=5):
    if user_id not in user_item_matrix.index or item_id not in user_item_matrix.columns:
        return global_mean
    
    user_index = user_item_matrix.index.get_loc(user_id)

    # create user item matrix
    
    user_item_matrix = ratings_dataset.pivot(index='User-ID', columns='ISBN', values='Book-Rating').fillna(0)
    

    # train model (knn)
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(user_item_matrix.values.T)

    # find k nearest neighbors

    distances, indices = knn.kneighbors(user_item_matrix.iloc[user_index].values.reshape(1, -1), n_neighbors=k)
    neighbors_indices = indices.flatten()[1:]

    # predict rating
    neighbors_ratings = user_item_matrix.iloc[neighbors_indices][item_id]
    neighbors_ratings = neighbors_ratings[neighbors_ratings.notnull()]
    print(user_item_matrix.head(5))
    return neighbors_ratings.mean() if not neighbors_ratings.empty else global_mean

# Content-Based Filtering

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Combine relevant text features
books_dataset['combine'] = books_dataset['Book-Title'] + ' ' + books_dataset['Book-Author'] + ' ' + books_dataset['Publisher']

# Create the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_dataset['combine'])

# Function for efficient content-based filtering
def content_based_filtering(item_id, tfidf_matrix=tfidf_matrix, top_n=20):
    # Find the index of the item
    try:
        idx = books_dataset[books_dataset['ISBN'] == item_id].index[0]
    except IndexError:
        return []  # If the book ID is not found

    # Compute cosine similarities for the target book on demand
    target_vector = tfidf_matrix[idx]
    cosine_similarities = cosine_similarity(target_vector, tfidf_matrix).flatten()

    # Get the indices of the top N similar books (excluding the target book itself)
    similar_indices = cosine_similarities.argsort()[-(top_n + 1):-1][::-1]

    # Return the top N similar books' ISBNs
    return books_dataset.iloc[similar_indices]['ISBN'].tolist()

# Example Usage
example_book_id = books_dataset['ISBN'].iloc[0]  # Replace with a valid ISBN
similar_books = content_based_filtering(example_book_id)
print(f"Books similar to {example_book_id}: {similar_books}")

Books similar to 0195153448: ['0582280044', '0801304652', '0801319536', '0195210301', '019866172X', '0192814907', '0198601654', '0198721129', '019509199X', '0553257765', '0192177478', '0844255610', '0192892878', '0195089618', '0192813242', '0192800329', '0191012084', '0195215214', '0192861026', '0195123034']


# Hybrid Weighted Recommender

In [7]:
def hybrid_recommender(user_id, item_id, user_means, item_means, user_item_matrix, weights=(0.3, 0.3, 0.4)):
    global_pred = global_mean
    user_pred = user_means.get(user_id, global_mean)
    item_pred = item_means.get(item_id, global_mean)
    cf_pred = collaborative_filtering(user_id, item_id, user_item_matrix)

    # Weighted sum
    final_pred = (weights[0] * global_pred + weights[1] * user_pred + weights[2] * item_pred + (1 - sum(weights)) * cf_pred)
    return final_pred

# Create Matrices

In [8]:
def train_test_split_user_item_matrix(matrix, test_ratio=0.2):
    """Splits the matrix into training and testing data."""
    test_matrix = matrix.copy()
    train_matrix = matrix.copy()

    # Mask 20% of the ratings in the test set
    for user in matrix.index:
        non_zero_indices = matrix.loc[user, :].to_numpy().nonzero()[0]
        test_indices = random.sample(list(non_zero_indices), int(len(non_zero_indices) * test_ratio))
        train_matrix.loc[user, matrix.columns[test_indices]] = 0  # Remove ratings from training set

    return train_matrix, test_matrix

# Create the user-book matrix
user_book_matrix = ratings_dataset.pivot_table(
    index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0
)

train_matrix, test_matrix = train_test_split_user_item_matrix(user_book_matrix)

# Prediction Models and RMSE Calculation

In [9]:
def predict_global_mean(global_mean, test_matrix):
    """Predict all ratings using the global mean."""
    pred = test_matrix.copy()
    pred[pred != 0] = global_mean  # Replace all non-zero values with the global mean
    return pred

def predict_user_mean(user_means, test_matrix):
    """Predict ratings using user means."""
    pred = test_matrix.copy()
    for user in test_matrix.index:
        pred.loc[user, :] = user_means.get(user, global_mean)
    return pred

def predict_item_mean(item_means, test_matrix):
    """Predict ratings using item means."""
    pred = test_matrix.copy()
    for item in test_matrix.columns:
        pred[item] = item_means.get(item, global_mean)
    return pred

def calculate_rmse(actual, predicted):
    """Calculate RMSE between actual and predicted ratings, replacing NaNs with 0."""
    # Flatten the matrices
    actual_values = actual.to_numpy().flatten()
    predicted_values = predicted.to_numpy().flatten()
    
    # Replace NaN values with 0 for both actual and predicted values
    actual_values = np.nan_to_num(actual_values, nan=0)
    predicted_values = np.nan_to_num(predicted_values, nan=0)
    
    # Return RMSE for valid values
    return sqrt(mean_squared_error(actual_values, predicted_values))

# Run Results

In [10]:
# Global Mean
global_mean = train_matrix[train_matrix != 0].mean().mean()
global_pred = predict_global_mean(global_mean, test_matrix)

# User Mean
user_means = train_matrix.replace(0, np.nan).mean(axis=1)
user_pred = predict_user_mean(user_means, test_matrix)

# Item Mean
item_means = train_matrix.replace(0, np.nan).mean(axis=0)
item_pred = predict_item_mean(item_means, test_matrix)

# RMSE Calculation
global_rmse = calculate_rmse(test_matrix, global_pred)
user_rmse = calculate_rmse(test_matrix, user_pred)
item_rmse = calculate_rmse(test_matrix, item_pred)

# Results
print("Evaluation Results:")
print(f"Global Mean RMSE: {global_rmse}")
print(f"User Mean RMSE: {user_rmse}")
print(f"Item Mean RMSE: {item_rmse}")

Evaluation Results:
Global Mean RMSE: 0.03507656554843828
User Mean RMSE: 7.344924415308821
Item Mean RMSE: 7.413210835632333
