In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re

In [None]:
!pip install surprise

from surprise import Dataset, Reader, BaselineOnly
from surprise.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split
from surprise import accuracy

In [7]:
#Load data
ratings_data=pd.read_csv('Ratings.csv')
users_data=pd.read_csv('Users.csv')
books_data=pd.read_csv('Books.csv')

print(ratings_data.shape, users_data.shape, books_data.shape)

(1149780, 3) (278858, 3) (271360, 8)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
#pre-processing

'''
On handling nulls
For this dataset, since there are not many features while there are only 4 records with nulls,
I think it would be best to just drop the NA rows.
'''
#Remove image url columns
books=books_data.iloc[:,:5]
books=books.dropna()


#Publication year

#Convert publication year to int
books['Year-Of-Publication']=pd.to_numeric(books['Year-Of-Publication'],errors='coerce')

#Publication years that are null
print('Count of records with null publication years: ', len(books[books['Year-Of-Publication'].isna()]), 
     )
books.dropna(subset=['Year-Of-Publication'])
print(1, books.shape)


#and Publication years years that are in the future. These books and their reviews shouldn't be included
print('Count: ', len(books[books['Year-Of-Publication']>2023]),
      '\n',books[books['Year-Of-Publication']>2023]
     )
books=books[books['Year-Of-Publication']<2023]
print(2, books.shape)

#Book author
#There are 3 records where the value for author are PRESUMABLY publication years. The same 3 that had null publication years. We'll get rid of them.   
print(books[books['Book-Author'].str.contains('^\d{4}$')])
books=books[~books['Book-Author'].str.contains('^\d{4}$')]
print(3, books.shape)

#filter by year of publication
books=books[books['Year-Of-Publication']>1900]

#ISBN remove non alpha-numeric characters from ISBN
books['ISBN'] = books['ISBN'].replace('[^a-zA-Z0-9]', '', regex=True)
ratings_data['ISBN'] = ratings_data['ISBN'].replace('[^a-zA-Z0-9]', '', regex=True)


#Remove outlier age

users_data[(users_data['Age']>0)&(users_data['Age']<100)]
print(4, users_data.shape)

#Dropping null age since we dont have that many features to beging with 
#and having this much nulls in one of the features is bad
users_data=users_data.dropna()
print(5, users_data.shape)

Count of records with null publication years:  3
1 (271357, 5)
Count:  13 
               ISBN                                         Book-Title  \
37487   0671746103  MY TEACHER FRIED MY BRAINS (RACK SIZE) (MY TEA...   
55676   0671791990  MY TEACHER FLUNKED THE PLANET (RACK SIZE) (MY ...   
78168   0870449842                                   Crossing America   
80264   0140301690  Alice's Adventures in Wonderland and Through t...   
97826   0140201092      Outline of European Architecture (Pelican S.)   
116053  0394701658                       Three Plays of Eugene Oneill   
118294  3442436893        Das groÃ?Â?e BÃ?Â¶se- MÃ?Â¤dchen- Lesebuch.   
192993  0870446924  Field Guide to the Birds of North America, 3rd...   
228173  0671266500       FOREST PEOPLE (Touchstone Books (Hardcover))   
240169  0684718022            In Our Time: Stories (Scribner Classic)   
246842  0380000059                                              CLOUT   
255409  068471809X                              

In [9]:
#Duke's preprocessing

users=users_data.copy()

# add a new column, age group, to label users' age

bins= [0,2,4,13,20,110]
labels = ['Infant','Toddler','Kid','Teen','Adult']
users['AgeGroup'] = pd.cut(users['Age'], bins=bins, labels=labels, right=False)

# add a columns: Country,State,and City, to label users' country, state, and city.

users['Country'] = users['Location'].str.split(',').str[-1]
users['State'] = users['Location'].str.split(',').str[-2]
users['City'] = users['Location'].str.split(',').str[-3]

# Process the country column to only retain alphabetic letters+space+period as no country name contain numbers, special characters
users['Country'] = users['Country'].str.replace('[^a-zA-Z\.\ ]', '')
users['State'] = users['State'].str.replace('[^a-zA-Z\.\ ]', '')
users['City'] = users['City'].str.replace('[^a-zA-Z\.\ ]', '')

# strip off the leading and trailing white spaces
users['Country'] = users['Country'] .str.strip()
users['State'] = users['State'] .str.strip()
users['City'] = users['City'] .str.strip()


  users['Country'] = users['Country'].str.replace('[^a-zA-Z\.\ ]', '')
  users['State'] = users['State'].str.replace('[^a-zA-Z\.\ ]', '')
  users['City'] = users['City'].str.replace('[^a-zA-Z\.\ ]', '')


In [10]:
#selecting only usa data in attpemt to refine data
users=users[users.Country=='usa']
print(users.shape)

#remove redundancy
users=users.loc[:,['User-ID','AgeGroup','State','City']]


#selecting only users with more than 5 book ratings
ratings=ratings_data.copy()
ratings=ratings.groupby('User-ID').filter(lambda x: len(x) > 5)
print(ratings.shape)

(76495, 7)
(1013453, 3)


In [21]:
#joining data
joined_data=pd.merge(books, ratings, on='ISBN', how='inner')
joined_data=pd.merge(joined_data, users, on='User-ID', how='inner')
joined_data

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,AgeGroup,State,City
0,0060973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,2954,8,Adult,kansas,wichita
1,0440235502,October Sky: A Memoir,Homer Hickam,1999.0,Dell,2954,10,Adult,kansas,wichita
2,0380973499,War's End: An Eyewitness Account of America's ...,Charles W. Sweeney,1997.0,William Morrow &amp; Company,2954,7,Adult,kansas,wichita
3,0684867184,"Comrades : Brothers, Fathers, Heroes, Sons, Pals",Stephen E. Ambrose,1999.0,Simon &amp; Schuster,2954,0,Adult,kansas,wichita
4,031224116X,Murder on the Mauretania (George Porter Dillma...,Conrad Allen,2000.0,St. Martin's Minotaur,2954,8,Adult,kansas,wichita
...,...,...,...,...,...,...,...,...,...,...
510838,0761101861,"Beauty, The New Basics",Rona Berg,2001.0,Workman Publishing,251903,10,Adult,washington,seattle
510839,0882669850,"Perfumes, Splashes &amp; Colognes: Discovering...",Nancy M. Booth,1997.0,Storey Publishing,251903,10,Adult,washington,seattle
510840,0877739870,Personality Type (Jung on the Hudson Book Series),Lenore Thomson,1998.0,Shambhala,251903,8,Adult,washington,seattle
510841,0825431654,The Doctrines That Divide: A Fresh Look at the...,Erwin W. Lutzer,1998.0,Kregel Publications,251903,10,Adult,washington,seattle


In [18]:
#Modeling
# data=ratings.copy()

unique_user_ids = data['User-ID'].unique()
train_user_ids, test_user_ids = train_test_split(unique_user_ids, test_size=0.2, random_state=42)

train_set = data[data['User-ID'].isin(train_user_ids)]
test_set = data[data['User-ID'].isin(test_user_ids)]

common_values = train_set['User-ID'].isin(test_set['User-ID'])
if common_values.any():
    print("The two Series share some common values.")
else:
    print("The two Series do not share any common values.")

The two Series do not share any common values.


In [19]:
reader = Reader(rating_scale=(0,10))
train_data = Dataset.load_from_df(train_set[['User-ID', 'ISBN', 'Book-Rating']], reader)
test_data = Dataset.load_from_df(test_set[['User-ID', 'ISBN', 'Book-Rating']], reader)

algo = BaselineOnly()
cross_validate(algo, train_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# predictions = baseline_model.test()
# cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.3087  3.3089  3.3091  3.3156  3.3124  3.3110  0.0027  
MAE (testset)     2.6760  2.6767  2.6760  2.6802  2.6799  2.6777  0.0019  
Fit time          3.72    3.81    4.00    3.78    4.04    3.87    0.13    
Test time         1.05    0.65    1.07    1.03    1.07    0.98    0.16    


{'test_rmse': array([3.30868946, 3.30892981, 3.3090849 , 3.31564894, 3.31240834]),
 'test_mae': array([2.6759945 , 2.67672972, 2.67598772, 2.68015885, 2.67985104]),
 'fit_time': (3.717318058013916,
  3.811126708984375,
  3.9963297843933105,
  3.781867265701294,
  4.043710947036743),
 'test_time': (1.0548686981201172,
  0.6518993377685547,
  1.072000503540039,
  1.0333435535430908,
  1.0674190521240234)}

In [20]:
trainset = train_data.build_full_trainset()
testset = test_data.build_full_trainset().build_testset()

algo.fit(trainset)

predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
print(f'RMSE on test set: {rmse}')
print(f'MAE on test set: {mae}')

Estimating biases using als...
RMSE: 3.7317
MAE:  3.4396
RMSE on test set: 3.7316742494558115
MAE on test set: 3.4395701474889817


In [24]:
predictions

[Prediction(uid=276746, iid='0425115801', r_ui=0.0, est=3.1979494847852354, details={'was_impossible': False}),
 Prediction(uid=276746, iid='0449006522', r_ui=0.0, est=2.9900445492687986, details={'was_impossible': False}),
 Prediction(uid=276746, iid='0553561618', r_ui=0.0, est=2.582573062324375, details={'was_impossible': False}),
 Prediction(uid=276746, iid='055356451X', r_ui=0.0, est=2.8427554514340643, details={'was_impossible': False}),
 Prediction(uid=276746, iid='0786013990', r_ui=0.0, est=2.786577270078306, details={'was_impossible': False}),
 Prediction(uid=276746, iid='0786014512', r_ui=0.0, est=2.2158863856199806, details={'was_impossible': False}),
 Prediction(uid=276786, iid='2864322102', r_ui=6.0, est=2.686490452487624, details={'was_impossible': False}),
 Prediction(uid=276786, iid='8402065945', r_ui=0.0, est=2.686490452487624, details={'was_impossible': False}),
 Prediction(uid=276786, iid='8423314901', r_ui=0.0, est=2.5530620039523018, details={'was_impossible': False

In [78]:
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999.0,W. W. Norton &amp; Company
...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988.0,Random House Childrens Pub (Mm)
271356,0525447644,From One to One Hundred,Teri Sloat,1991.0,Dutton Books
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004.0,HarperSanFrancisco
271358,0192126040,Republic (World's Classics),Plato,1996.0,Oxford University Press


In [79]:
ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
10,276746,0425115801,0
11,276746,0449006522,0
12,276746,0553561618,0
13,276746,055356451X,0
14,276746,0786013990,0
...,...,...,...
1149771,276704,0743211383,7
1149772,276704,080410526X,0
1149773,276704,0806917695,5
1149774,276704,0876044011,0


In [25]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor

In [135]:
df=joined_data
df['User-ID'] = df['User-ID'].astype(int)
df['AgeGroup'] = df['AgeGroup'].astype(str)
df['Book-Rating'] = df['Book-Rating'].astype(int)
df['Year-Of-Publication'] = df['Year-Of-Publication'].apply(lambda x: x.split('.')[0])
df.dropna(how='any',inplace=True)

In [136]:
df.info()

# from sklearn.feature_extraction.text import CountVectorizer

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509159 entries, 0 to 510842
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 509159 non-null  object
 1   Book-Title           509159 non-null  object
 2   Book-Author          509159 non-null  object
 3   Year-Of-Publication  509159 non-null  object
 4   Publisher            509159 non-null  object
 5   User-ID              509159 non-null  int64 
 6   Book-Rating          509159 non-null  int64 
 7   AgeGroup             509159 non-null  object
 8   State                509159 non-null  object
 9   City                 509159 non-null  object
dtypes: int64(2), object(8)
memory usage: 42.7+ MB


In [142]:

# Define a custom transformer for feature engineering
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Create a feature representing the age of a book
#         X['AgeOfBook'] = pd.Timestamp.now().year - X['Year-Of-Publication'].astype(int)
        return X

# Define the features for collaborative filtering and content-based filtering
collaborative_features = ['User-ID', 'ISBN', 'Book-Rating']
content_based_features = ['Book-Title', 
                          'Book-Author', 
                          'Year-Of-Publication',
                         'Publisher','AgeGroup','State','City'
                         ]

# Create a Surprise Dataset for collaborative filtering
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[collaborative_features], reader)

# Train-test split for collaborative filtering
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# unique_user_ids = df['User-ID'].unique()
# train_user_ids, test_user_ids = train_test_split(unique_user_ids, test_size=0.2, random_state=42)
# trainset = data[data['User-ID'].isin(train_user_ids)]
# testset = data[data['User-ID'].isin(test_user_ids)]



In [143]:
# Collaborative filtering model
collaborative_algo = KNNBasic(sim_options={'user_based': True})
collaborative_algo.fit(trainset)



Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x145aa240f0a0>

In [170]:
# Content-based filtering model (using book titles, authors, and year of publication)
content_based_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineering()),
#     ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('tfidf_vectorizer', TfidfVectorizer(analyzer='word', stop_words='english'
#                                          , lowercase=False
                                        )),
# #     ('count_vectorizer', CountVectorizer(analyzer='word', stop_words='english')),
    ('scaler', StandardScaler(with_mean=False)),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])



# Fit content-based model
content_based_pipeline.fit(df[content_based_features], df['Book-Rating'])



ValueError: Found input variables with inconsistent numbers of samples: [7, 509159]

In [165]:
df[content_based_features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509159 entries, 0 to 510842
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Book-Title           509159 non-null  object
 1   Book-Author          509159 non-null  object
 2   Year-Of-Publication  509159 non-null  object
 3   Publisher            509159 non-null  object
 4   AgeGroup             509159 non-null  object
 5   State                509159 non-null  object
 6   City                 509159 non-null  object
dtypes: object(7)
memory usage: 31.1+ MB


In [139]:
content_based_pipeline.predict([['0440400988']])

<surprise.trainset.Trainset at 0x145af806d9a0>

In [147]:
# Combine collaborative and content-based predictions
def hybrid_predict(user_id, isbn):
    # Collaborative filtering prediction
    collaborative_prediction = collaborative_algo.predict(user_id, isbn).est

    # Content-based filtering prediction
    content_based_prediction = content_based_pipeline.predict([[isbn]])

    # Combine predictions (you can customize the weighting)
    hybrid_prediction = 0.7 * collaborative_prediction + 0.3 * content_based_prediction

    return hybrid_prediction


user_id_example = 278838
isbn_example = '0440400988'
prediction_example = hybrid_predict(user_id_example, isbn_example)
print(f'Hybrid Prediction for User {user_id_example}, ISBN {isbn_example}: {prediction_example}')

In [151]:
collaborative_prediction = collaborative_algo.predict(user_id_example, isbn_example).est

In [152]:
collaborative_prediction

2.464030128128015