# Data Preprocessing

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os 
from sklearn.model_selection import train_test_split

In [2]:
# Resolve the three CSV files inside ./datasets, relative to the current
# working directory (the directory the notebook kernel was started from).
# os.path.join builds the paths with the platform's own separator.
path = os.getcwd()
book_path = os.path.join(path, "datasets", "Books.csv")
users_path = os.path.join(path, "datasets", "Users.csv")
ratings_path = os.path.join(path, "datasets", "Ratings.csv")

# Load the datasets.
# low_memory=False makes pandas read each file in a single pass instead of in
# chunks, so a column's dtype is inferred from the whole column at once.
books_dataset = pd.read_csv(book_path, low_memory=False)
users_dataset = pd.read_csv(users_path, low_memory=False)
ratings_dataset = pd.read_csv(ratings_path, low_memory=False)

# Print the first 5 rows of each dataset as a quick sanity check of the load
print("First 5 rows of books dataset")
print(books_dataset.head(5))

print("First 5 rows of users dataset")
print(users_dataset.head(5))

print("First 5 rows of ratings dataset")
print(ratings_dataset.head(5))


First 5 rows of books dataset
         ISBN                                         Book-Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  \
0    Mark P. O. Morford                2002     Oxford University Press   
1  Richard Bruce Wright                2001       HarperFlamingo Canada   
2          Carlo D'Este                1991             HarperPerennial   
3      Gina Bari Kolata                1999        Farrar Straus Giroux   
4       E. J. W. Barber                1999  W. W. Norton &amp; Company   

                                         Image-URL-S  \
0  http://images.amazon.com/images/P/0195153448.0...  

In [3]:
# Summary statistics (DataFrame.describe) for each of the three loaded tables.
# Every group is emitted as: heading line, describe() table, separator line.
print("Books Dataset Summary: ", books_dataset.describe(), "-----------------------", sep="\n")

print("Users Dataset Summary: ", users_dataset.describe(), "-----------------------", sep="\n")

print("Ratings Dataset Summary:", ratings_dataset.describe(), "-----------------------", sep="\n")

Books Dataset Summary: 
              ISBN      Book-Title      Book-Author Year-Of-Publication  \
count       271360          271360           271358              271360   
unique      271360          242135           102022                 118   
top     020130998X  Selected Poems  Agatha Christie                2002   
freq             1              27              632               17627   

        Publisher                                        Image-URL-S  \
count      271358                                             271360   
unique      16807                                             271044   
top     Harlequin  http://images.amazon.com/images/P/155936078X.0...   
freq         7535                                                  2   

                                              Image-URL-M  \
count                                              271360   
unique                                             271044   
top     http://images.amazon.com/images/P/155936078X.0..

In [4]:
# Attach every rating to the profile of the user who gave it.
# With no `how` argument, merge performs an inner join on the shared 'User-ID' key.
merged_data = pd.merge(users_dataset, ratings_dataset, on="User-ID")

# Per-column count of null entries in the merged table and in the books table
missing_merged = merged_data.isna().sum()
missing_books = books_dataset.isna().sum()

print("Missing values in merged dataset: ")
print(missing_merged)
print()

print("Missing values in books dataset: ")
print(missing_books)

# Discard book rows that lack an author or a publisher (modifies books_dataset in place)
books_dataset.dropna(subset=["Book-Author", "Publisher"], inplace=True)

Missing values in merged dataset: 
User-ID             0
Location            0
Age            309492
ISBN                0
Book-Rating         0
dtype: int64

Missing values in books dataset: 
ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64


# Split the Data and Compute Baseline Means

In [5]:
# Hold out 20% of the merged ratings as a test set; the fixed random_state
# makes the split reproducible across runs.
train_data, test_data = train_test_split(merged_data, random_state=42, test_size=0.2)

# Baseline statistics, computed on the training portion only:
# the overall mean rating, then the mean rating per user and per book (ISBN).
global_mean = train_data["Book-Rating"].mean()
user_means, item_means = (
    train_data.groupby(group_key)["Book-Rating"].mean()
    for group_key in ("User-ID", "ISBN")
)

# Show the three baselines
print("Global Mean:", global_mean, "\n")
print("User Mean:\n", user_means, "\n")
print("Item Mean:\n", item_means, "\n")

Global Mean: 2.8676801214145313 

User Mean:
 User-ID
7          0.000000
8          2.090909
9          2.000000
10         3.000000
12        10.000000
            ...    
278846     4.000000
278849     2.250000
278851     3.857143
278852     8.000000
278854     4.833333
Name: Book-Rating, Length: 92906, dtype: float64 

Item Mean:
 ISBN
 0330299891    3.0
 0586045007    0.0
 9022906116    3.5
 9032803328    0.0
 9044922572    0.0
              ... 
cn113107       0.0
ooo7156103     7.0
§423350229     0.0
´3499128624    8.0
Ô½crosoft      7.0
Name: Book-Rating, Length: 298432, dtype: float64 



# Collaborative Filtering Recommender

In [6]:
def collaborative_filtering(user_id, item_id, user_item_matrix, k=5):
    """Predict a rating from the ``k`` users most similar to ``user_id``.

    Similarity is the Pearson correlation between the target user's row and
    every other user's row of ``user_item_matrix`` (pandas computes it over
    the items for which both rows hold a value).

    Parameters
    ----------
    user_id
        Row label of the user to predict for.
    item_id
        Column label of the item to predict.
    user_item_matrix : pandas.DataFrame
        Ratings table whose index holds user labels and whose columns hold
        item labels; NaN marks a missing rating.
    k : int, default 5
        Number of most-similar users whose ratings are averaged.

    Returns
    -------
    float
        Mean rating the selected neighbours gave ``item_id``. Falls back to
        the module-level ``global_mean`` when the user or item is unknown,
        when no other user has a defined similarity, or when none of the
        selected neighbours rated the item.
    """
    if user_id not in user_item_matrix.index or item_id not in user_item_matrix.columns:
        return global_mean

    user_ratings = user_item_matrix.loc[user_id]

    # Remove the target user explicitly instead of assuming it sorts first:
    # another user with a perfect correlation could tie with it.
    other_users = user_item_matrix.drop(index=user_id)
    if other_users.empty:
        return global_mean

    # axis=1 correlates each *row* (user) with the target's rating vector, so
    # the result is indexed by user label. The default axis=0 would instead
    # correlate item columns against a Series indexed by item labels.
    similarities = other_users.corrwith(user_ratings, axis=1).dropna()
    similar_users = similarities.nlargest(k)

    # Keep only neighbours who actually rated the item; otherwise an all-NaN
    # selection would yield NaN rather than the fallback.
    similar_users_ratings = user_item_matrix.loc[similar_users.index, item_id].dropna()
    if similar_users_ratings.empty:
        return global_mean
    return similar_users_ratings.mean()

# Hybrid Weighted Recommender

In [7]:
def hybrid_recommender(user_id, item_id, user_means, item_means, user_item_matrix, weights=(0.3, 0.3, 0.4)):
    """Blend baseline means and collaborative filtering into one prediction.

    Parameters
    ----------
    user_id, item_id
        Labels of the user and item to predict for.
    user_means, item_means
        Mappings supporting ``.get(key, default)`` (e.g. a pandas Series or a
        dict) from user / item label to mean rating. Labels that are missing
        fall back to the module-level ``global_mean``.
    user_item_matrix
        Passed through unchanged to ``collaborative_filtering``.
    weights : sequence of 3 floats, default (0.3, 0.3, 0.4)
        Weights of the global mean, the user mean and the item mean, in that
        order. The collaborative-filtering prediction receives the remainder
        ``1 - sum(weights)``; with the default weights that remainder is 0.

    Returns
    -------
    float
        The weighted sum of the component predictions.
    """
    w_global, w_user, w_item = weights[0], weights[1], weights[2]
    cf_weight = 1 - sum(weights)

    global_pred = global_mean
    user_pred = user_means.get(user_id, global_mean)
    item_pred = item_means.get(item_id, global_mean)

    final_pred = w_global * global_pred + w_user * user_pred + w_item * item_pred

    # Only run collaborative filtering when it contributes to the result.
    # With a zero weight the call is wasted work, and a NaN coming back from
    # it would turn the whole sum into NaN (0 * NaN is NaN).
    if abs(cf_weight) > 1e-12:
        cf_pred = collaborative_filtering(user_id, item_id, user_item_matrix)
        final_pred += cf_weight * cf_pred
    return final_pred