In [1]:
!pip install surprise

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re

from surprise import Dataset, Reader, BaselineOnly, KNNBasic, accuracy
from surprise.model_selection import cross_validate, GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

from joblib import dump, load


Defaulting to user installation because normal site-packages is not writeable


In [34]:
#Load data
ratings_data=pd.read_csv('Ratings.csv')
users_data=pd.read_csv('Users.csv')
books_data=pd.read_csv('Books.csv')

print(ratings_data.shape, users_data.shape, books_data.shape)

(1149780, 3) (278858, 3) (271360, 8)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [35]:
#pre-processing

'''
On handling nulls
For this dataset, since there are not many features while there are only 4 records with nulls,
I think it would be best to just drop the NA rows.
'''
#Remove image url columns
books=books_data.iloc[:,:5]
books=books.dropna()


#Publication year

#Convert publication year to int
books['Year-Of-Publication']=pd.to_numeric(books['Year-Of-Publication'],errors='coerce')

#Publication years that are null
print('Count of records with null publication years: ', len(books[books['Year-Of-Publication'].isna()]), 
     )
books.dropna(subset=['Year-Of-Publication'])
print(1, books.shape)


#and Publication years years that are in the future. These books and their reviews shouldn't be included
print('Count: ', len(books[books['Year-Of-Publication']>2023]),
      '\n',books[books['Year-Of-Publication']>2023]
     )
books=books[books['Year-Of-Publication']<2023]
print(2, books.shape)

#Book author
#There are 3 records where the value for author are PRESUMABLY publication years. The same 3 that had null publication years. We'll get rid of them.   
print(books[books['Book-Author'].str.contains('^\d{4}$')])
books=books[~books['Book-Author'].str.contains('^\d{4}$')]
print(3, books.shape)

#filter by year of publication
books=books[books['Year-Of-Publication']>1900]

#ISBN remove non alpha-numeric characters from ISBN
books['ISBN'] = books['ISBN'].replace('[^a-zA-Z0-9]', '', regex=True)
ratings_data['ISBN'] = ratings_data['ISBN'].replace('[^a-zA-Z0-9]', '', regex=True)


#Remove outlier age

users_data[(users_data['Age']>0)&(users_data['Age']<100)]
print(4, users_data.shape)

#Dropping null age since we dont have that many features to beging with 
#and having this much nulls in one of the features is bad
users_data=users_data.dropna()
print(5, users_data.shape)

Count of records with null publication years:  3
1 (271357, 5)
Count:  13 
               ISBN                                         Book-Title  \
37487   0671746103  MY TEACHER FRIED MY BRAINS (RACK SIZE) (MY TEA...   
55676   0671791990  MY TEACHER FLUNKED THE PLANET (RACK SIZE) (MY ...   
78168   0870449842                                   Crossing America   
80264   0140301690  Alice's Adventures in Wonderland and Through t...   
97826   0140201092      Outline of European Architecture (Pelican S.)   
116053  0394701658                       Three Plays of Eugene Oneill   
118294  3442436893        Das groÃ?Â?e BÃ?Â¶se- MÃ?Â¤dchen- Lesebuch.   
192993  0870446924  Field Guide to the Birds of North America, 3rd...   
228173  0671266500       FOREST PEOPLE (Touchstone Books (Hardcover))   
240169  0684718022            In Our Time: Stories (Scribner Classic)   
246842  0380000059                                              CLOUT   
255409  068471809X                              

In [36]:
#Duke's preprocessing

users=users_data.copy()

# add a new column, age group, to label users' age

bins= [0,2,4,13,20,110]
labels = ['Infant','Toddler','Kid','Teen','Adult']
users['AgeGroup'] = pd.cut(users['Age'], bins=bins, labels=labels, right=False)

# add a columns: Country,State,and City, to label users' country, state, and city.

users['Country'] = users['Location'].str.split(',').str[-1]
users['State'] = users['Location'].str.split(',').str[-2]
users['City'] = users['Location'].str.split(',').str[-3]

# Process the country column to only retain alphabetic letters+space+period as no country name contain numbers, special characters
users['Country'] = users['Country'].str.replace('[^a-zA-Z\.\ ]', '')
users['State'] = users['State'].str.replace('[^a-zA-Z\.\ ]', '')
users['City'] = users['City'].str.replace('[^a-zA-Z\.\ ]', '')

# strip off the leading and trailing white spaces
users['Country'] = users['Country'] .str.strip()
users['State'] = users['State'] .str.strip()
users['City'] = users['City'] .str.strip()


  users['Country'] = users['Country'].str.replace('[^a-zA-Z\.\ ]', '')
  users['State'] = users['State'].str.replace('[^a-zA-Z\.\ ]', '')
  users['City'] = users['City'].str.replace('[^a-zA-Z\.\ ]', '')


In [37]:
#selecting only usa data in attpemt to refine data
users=users[users.Country=='usa']
print(users.shape)

#remove redundancy
users=users.loc[:,['User-ID','AgeGroup','State','City']]

#selecting only users with more than 5 book ratings
ratings=ratings_data.copy()
ratings=ratings.groupby('User-ID').filter(lambda x: len(x) > 5)
print(ratings.shape)

(76495, 7)
(1013453, 3)


In [38]:
#joining data
joined_data=pd.merge(books, ratings, on='ISBN', how='inner')
joined_data=pd.merge(joined_data, users, on='User-ID', how='inner')


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,AgeGroup,State,City
0,0060973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,2954,8,Adult,kansas,wichita
1,0440235502,October Sky: A Memoir,Homer Hickam,1999.0,Dell,2954,10,Adult,kansas,wichita
2,0380973499,War's End: An Eyewitness Account of America's ...,Charles W. Sweeney,1997.0,William Morrow &amp; Company,2954,7,Adult,kansas,wichita
3,0684867184,"Comrades : Brothers, Fathers, Heroes, Sons, Pals",Stephen E. Ambrose,1999.0,Simon &amp; Schuster,2954,0,Adult,kansas,wichita
4,031224116X,Murder on the Mauretania (George Porter Dillma...,Conrad Allen,2000.0,St. Martin's Minotaur,2954,8,Adult,kansas,wichita
...,...,...,...,...,...,...,...,...,...,...
510838,0761101861,"Beauty, The New Basics",Rona Berg,2001.0,Workman Publishing,251903,10,Adult,washington,seattle
510839,0882669850,"Perfumes, Splashes &amp; Colognes: Discovering...",Nancy M. Booth,1997.0,Storey Publishing,251903,10,Adult,washington,seattle
510840,0877739870,Personality Type (Jung on the Hudson Book Series),Lenore Thomson,1998.0,Shambhala,251903,8,Adult,washington,seattle
510841,0825431654,The Doctrines That Divide: A Fresh Look at the...,Erwin W. Lutzer,1998.0,Kregel Publications,251903,10,Adult,washington,seattle


In [39]:
df=joined_data
df['User-ID'] = df['User-ID'].astype(int)
df['AgeGroup'] = df['AgeGroup'].astype(str)
df['Book-Rating'] = df['Book-Rating'].astype(int)
# df['Year-Of-Publication'] = df['Year-Of-Publication'].astype(str).apply(lambda x: x.split('.')[0])
df['Year-Of-Publication'] = df['Year-Of-Publication'].astype(int)
df.dropna(how='any',inplace=True)

In [40]:
df.info()

# from sklearn.feature_extraction.text import CountVectorizer

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510843 entries, 0 to 510842
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 510843 non-null  object
 1   Book-Title           510843 non-null  object
 2   Book-Author          510843 non-null  object
 3   Year-Of-Publication  510843 non-null  int64 
 4   Publisher            510843 non-null  object
 5   User-ID              510843 non-null  int64 
 6   Book-Rating          510843 non-null  int64 
 7   AgeGroup             510843 non-null  object
 8   State                510843 non-null  object
 9   City                 510843 non-null  object
dtypes: int64(3), object(7)
memory usage: 42.9+ MB


In [41]:

# Define a custom transformer for feature engineering
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Create a feature representing the age of a book
#         X['AgeOfBook'] = pd.Timestamp.now().year - X['Year-Of-Publication'].astype(int)
        return X




In [42]:
# Create Dataset 
reader = Reader(rating_scale=(1, 10))
data = df.copy()


# To address leakage
unique_user_ids = data['User-ID'].unique()
train_user_ids, test_user_ids = train_test_split(unique_user_ids, test_size=0.2, random_state=42)
trainset = data[data['User-ID'].isin(train_user_ids)]
testset = data[data['User-ID'].isin(test_user_ids)]



In [43]:
# Collaborative filtering

# Define the features
collaborative_features = ['User-ID', 'ISBN', 'Book-Rating']

collab_train=Dataset.load_from_df(trainset[collaborative_features], reader)
collab_test=Dataset.load_from_df(testset[collaborative_features], reader)

baseline = BaselineOnly()
KNN = KNNBasic(sim_options={'user_based': True})

# cross_validate(baseline, collab_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)

collab_trainset=collab_train.build_full_trainset()
collab_testset=collab_test.build_full_trainset().build_testset()


baseline.fit(collab_trainset)
predictions = baseline.test(collab_testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
print(f'RMSE on test set: {rmse}')
print(f'MAE on test set: {mae}')
print('___________________________')


KNN.fit(collab_trainset)
predictions = KNN.test(collab_testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
print(f'RMSE on test set: {rmse}')
print(f'MAE on test set: {mae}')
print('___________________________')


Estimating biases using als...
RMSE: 3.6987
MAE:  3.3494
RMSE on test set: 3.698713221640033
MAE on test set: 3.3494474448901776
___________________________
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.7194
MAE:  3.3456
RMSE on test set: 3.7194109611329065
MAE on test set: 3.3456139855661795
___________________________


In [44]:
# Content-based filtering


content_based_features = ['Book-Title', 
                          'Book-Author', 
                          'Year-Of-Publication',
#                          'Publisher',
                          'AgeGroup',
#                           'State',
                          'City'
                         ]

trainset_X = trainset[content_based_features]
trainset_y = trainset['Book-Rating']
testset_X = testset[content_based_features]
testset_y = testset['Book-Rating']

# Encoding


categorical_attributes = list(trainset_X.select_dtypes(include=['object']).columns)
numerical_attributes = list(trainset_X.select_dtypes(include=['float64', 'int64']).columns)


num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('std_scaler', StandardScaler()),
                        ])
full_pipeline = ColumnTransformer([('num', num_pipeline, numerical_attributes),
                                   ('cat', OneHotEncoder(), categorical_attributes),
                                  ])



In [47]:
train = full_pipeline.fit_transform(trainset_X)
train_labels = trainset_y
train.shape ,trainset_X.shape

((411152, 188526), (411152, 5))

In [None]:
# Linear

lin_reg = LinearRegression()
lin_reg.fit(train, train_labels)

area_predictions = lin_reg.predict(train)
lin_mse = mean_squared_error(train_labels, area_predictions)
lin_rmse = np.sqrt(lin_mse)
print('linear_train_rmse', lin_rmse) 


scores = cross_val_score(lin_reg, train, train_labels, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-scores)

def explain_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
 
explain_scores(lin_rmse_scores)



linear_train_rmse 2.7520909647897973


In [None]:
# Saving model
dump(lin_reg, 'lin_reg.joblib') 
# model=load('lin_reg.joblib')

In [None]:
# KNN

knn_reg = KNeighborsRegressor()
knn_reg.fit(train, train_labels)

area_predictions = knn_reg.predict(train)
knn_mse = mean_squared_error(train_labels, area_predictions)
knn_rmse = np.sqrt(knn_mse)
print('knn_train_rmse', knn_rmse) #overfiiting

scores = cross_val_score(knn_reg, train, train_labels, scoring='neg_mean_squared_error', cv=10)
knn_rmse_scores = np.sqrt(-scores)
explain_scores(knn_rmse_scores)


In [None]:
# Saving model
dump(knn_reg, 'knn_reg.joblib') 
# model=load('knn_reg.joblib')

In [50]:
# SVM

svm_reg = SVR(kernel='linear')
svm_reg.fit(train, train_labels)

area_predictions = svm_reg.predict(train)
svm_mse = mean_squared_error(train_labels, area_predictions)
svm_rmse = np.sqrt(svm_mse)
print('svm_train_rmse', svm_rmse) #svm is generalizing well to crossvalidation set

scores = cross_val_score(svm_reg, train, train_labels, scoring='neg_mean_squared_error', cv=10)
svm_rmse_scores = np.sqrt(-scores)
explain_scores(svm_rmse_scores)


In [None]:
# Saving model
dump(svm_reg, 'svm_reg.joblib') 
# model=load('svm_reg.joblib')

In [92]:
# Decision Tree

tree_reg = DecisionTreeRegressor()
tree_reg.fit(train, train_labels)

area_predictions = tree_reg.predict(train)
tree_mse = mean_squared_error(train_labels, area_predictions)
tree_rmse = np.sqrt(tree_mse)
print('tree_train_rmse', tree_rmse)

scores = cross_val_score(tree_reg, train, train_labels, scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)
explain_scores(tree_rmse_scores)


tree_train_rmse 0.0031622776601683794


In [None]:
# Saving model
dump(tree_reg, 'tree_reg.joblib') 
# model=load('tree_reg.joblib')

In [19]:
# RF

rf_reg = RandomForestRegressor(n_jobs=-1, verbose=True)
rf_reg.fit(train, train_labels)

area_predictions = rf_reg.predict(train)
rf_mse = mean_squared_error(train_labels, area_predictions)
rf_rmse = np.sqrt(rf_mse)
print('rf_train_rmse', rf_rmse) 

scores = cross_val_score(rf_reg, train, train_labels, scoring='neg_mean_squared_error', cv=10)
rf_rmse_scores = np.sqrt(-scores)
explain_scores(rf_rmse_scores)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 16.0min finished


RandomForestRegressor(n_jobs=-1, verbose=True)

In [None]:
# Saving model
dump(rf_reg, 'rf_reg.joblib') 
# model=load('rf_reg.joblib')

In [21]:
# Combine collaborative and content-based predictions
def hybrid_predict(user_id, isbn):
    # Collaborative filtering prediction
    collaborative_prediction = --collaborative_algo--.predict(user_id, isbn).est

    # Content-based filtering prediction
    content_based_prediction = --content_based_pipeline--.predict([[----------]])

    # Combine predictions (you can customize the weighting)
    hybrid_prediction = 0.7 * collaborative_prediction + 0.3 * content_based_prediction

    return hybrid_prediction


user_id_example = 278838
isbn_example = '0440400988'
prediction_example = hybrid_predict(user_id_example, isbn_example)
print(f'Hybrid Prediction for User {user_id_example}, ISBN {isbn_example}: {prediction_example}')

ValueError: X has 1 features, but DecisionTreeRegressor is expecting 36959 features as input.

In [151]:
collaborative_prediction = collaborative_algo.predict(user_id_example, isbn_example).est

In [152]:
collaborative_prediction

2.464030128128015