In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## **Reading the CSVs**

In [2]:
# Load the three ';'-delimited source tables. Rows that cannot be parsed are
# silently skipped (on_bad_lines='skip') instead of raising an error.
books_df = pd.read_csv(
    'books.csv',
    sep=';',
    encoding='latin-1',
    quoting=1,  # integer value of csv.QUOTE_ALL
    on_bad_lines='skip',
    low_memory=False,
)

# Ratings and users use the same separator and bad-line handling as the books file.
rating_df = pd.read_csv('ratings.csv', sep=';', encoding='latin1', on_bad_lines='skip')

users_df = pd.read_csv('users.csv', sep=';', encoding='latin1', on_bad_lines='skip')

In [3]:
# Discard every book record that has at least one missing field.
books_df = books_df.dropna()

In [4]:
# Sanity check: per-column count of missing values remaining in the books table.
print(f"Books: {books_df.isna().sum()}\n")

Books: ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
dtype: int64



In [5]:
# Keep only the first occurrence of each book title.
books_df = books_df.drop_duplicates(subset=['Book-Title'])

# Row count vs. number of distinct titles; the two should match after de-duplication.
book_titles = books_df['Book-Title']
len(book_titles), len(book_titles.unique())

(242129, 242129)

In [6]:
# Keep only books with a plausible publication year.
# The raw column is coerced to numeric (unparseable values become NaN) and a row is
# kept only when 0 < year <= MAX_PUBLICATION_YEAR. NaN, zero/negative and later
# years are therefore all dropped -- not just years after the cutoff.
MAX_PUBLICATION_YEAR = 2004

# assign() returns a new frame rather than writing a column into books_df.
# books_df may itself be a filtered selection produced by an earlier cell, and
# writing into such a selection triggers pandas' SettingWithCopyWarning.
books_df = books_df.assign(**{
    'Year-Of-Publication': pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
})

publication_year = books_df['Year-Of-Publication']
# Comparisons against NaN evaluate to False, so coerced (unparseable) years are removed here.
books_df = books_df[(publication_year > 0) & (publication_year <= MAX_PUBLICATION_YEAR)]

print("Min year:", books_df['Year-Of-Publication'].min())
print("Max year:", books_df['Year-Of-Publication'].max())


Min year: 1376
Max year: 2004


In [7]:
# Inspect the record(s) carrying the earliest publication year reported above.
books_df.loc[books_df['Year-Of-Publication'].eq(1376)]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
253750,964442011X,Tasht-i khun,IsmaÂ°il Fasih,1376,Nashr-i Alburz,http://images.amazon.com/images/P/964442011X.0...,http://images.amazon.com/images/P/964442011X.0...,http://images.amazon.com/images/P/964442011X.0...


In [8]:
# Write each table back to disk without the index column.
# Only books_df has been cleaned in the cells above; the ratings and users tables
# are written out as they were loaded.
for frame, out_path in (
    (books_df, 'books_df_processed.csv'),
    (rating_df, 'ratings_df_processed.csv'),
    (users_df, 'users_df_processed.csv'),
):
    frame.to_csv(out_path, index=False)

### **Calculating User-Item Matrix**

In [9]:

import numpy as np

# Tunable thresholds for the collaborative-filtering input.
# Sparse users and rarely rated books carry too little signal to be useful.
MIN_USER_RATINGS = 75        # a user is kept only with MORE than this many ratings
                             # (other values were tried; 75 gave better results)
BOOK_COUNT_PERCENTILE = 80   # a book is kept only if its rating count reaches this percentile

# Ratings per user, counted over ALL rows of rating_df (zero ratings included).
users_r = rating_df['User-ID'].value_counts()

# Strict comparison: users with exactly MIN_USER_RATINGS ratings are dropped as well.
active_user_ids = users_r[users_r > MIN_USER_RATINGS].index
filter_rating = rating_df[rating_df['User-ID'].isin(active_user_ids)]

# A Book-Rating of 0 means "not rated"; keep explicit ratings only.
explicit_rating = filter_rating[filter_rating['Book-Rating'] != 0]

# Default (inner) merge: ratings whose ISBN is absent from books_df are dropped.
final_rating_books = explicit_rating.merge(books_df, on='ISBN')

# Number of explicit ratings each ISBN received from the retained users.
num_rating = (
    final_rating_books.groupby('ISBN')['Book-Rating']
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'rating_count'})
)

# The cutoff is a percentile of the per-book counts rather than a fixed number.
rating_threshold = np.percentile(num_rating['rating_count'], BOOK_COUNT_PERCENTILE)
print(f"Book rating threshold ({BOOK_COUNT_PERCENTILE}th percentile): {rating_threshold}")

# Attach each book's count to its rating rows and keep books at or above the cutoff.
final_rating_books = final_rating_books.merge(num_rating, on='ISBN')
final_rating = final_rating_books[final_rating_books['rating_count'] >= rating_threshold]

# pivot() raises on repeated (index, column) pairs, so keep only the first
# rating for each (user, title) combination.
final_rating = final_rating.drop_duplicates(subset=['User-ID', 'Book-Title'])

# Rows are users, columns are book titles, values are explicit ratings;
# user/title pairs with no rating are left as NaN.
user_item_m = final_rating.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')

Book rating threshold (80th percentile): 2.0


In [10]:
# Imported in this cell because joblib is used here before any earlier cell imports it;
# without this, a fresh "Restart & Run All" fails with NameError on the next line.
import joblib

# Persist the user-item matrix with compression (recommended for large objects).
joblib.dump(user_item_m, 'U_I_M_joblib.pkl', compress=3)

In [11]:
# Sanity check: no "not rated" (0) entries should remain in the filtered ratings.
zero_rating_count = final_rating['Book-Rating'].eq(0).sum()
print(f"Zero ratings in final_rating: {zero_rating_count}")

Zero ratings in final_rating: 0


In [13]:
import joblib

# Round-trip check: reload the persisted matrix and show its first ten User-ID labels.
# NOTE(review): joblib files are pickle-based -- only load files from a trusted source.
user_item_matrix = joblib.load('U_I_M_joblib.pkl')
first_user_ids = user_item_matrix.index[:10]
print(first_user_ids)

Index([183, 243, 254, 507, 643, 882, 1025, 1211, 1424, 1435], dtype='int64', name='User-ID')
