In [12]:
import pandas as pd
from surprise.model_selection import train_test_split,cross_validate
from surprise import Dataset, Reader, SVD, accuracy, NMF, AlgoBase, BaselineOnly
from scipy.optimize import minimize
import numpy as np
import nltk
import string
import re
import surprise

In [13]:
all_books = pd.read_csv('data/Books.csv')
all_ratings = pd.read_csv('data/Ratings.csv')
all_users = pd.read_csv('data/Users.csv')
all_ratings.head()

  all_books = pd.read_csv('data/Books.csv')


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [14]:
# Checks if item is a valid ISBN
def is_valid_isbn(isbn):
     if len(isbn) != 10: return False
     if not isbn.isalnum(): return False

     return True

# Converts ISBN to numerical ID
def convert_isbn_to_id(isbn, isbn_to_id):
    return isbn_to_id.get(isbn, None)

data_list = []
isbn_to_id = {}
id_counter = 0

# Converts all ISBNs in data to numerical IDs
with open('data/Ratings.csv', 'r') as file:
    next(file)
    for line in file:
        # Skips misformatted items that would cause an error
        try:
            user_id, isbn, rating = line.strip().split(',')
        except:
            continue
        
        # Skips misformatted items (not valid ISBNs)
        if not is_valid_isbn(isbn):
            continue

        # Converts ISBN to numerical ID
        if isbn not in isbn_to_id:
            isbn_to_id[isbn] = id_counter
            id_counter += 1
        
        # Builds dictionary
        item_id = isbn_to_id[isbn]

        # Add data to list of tuples
        data_list.append((user_id, item_id, float(rating)))

# Creates inverted dictionary, use to convert numerical ID to ISBN for reporting results
id_to_isbn = {v: k for k, v in isbn_to_id.items()}

# Converts list to pandas data frame
df = pd.DataFrame(data_list, columns=['user_id', 'item_id', 'rating'])

# Creates Reader to extract data, uses it to load data
reader = Reader(line_format="user item rating", sep=',', rating_scale=(1,10))
data = Dataset.load_from_df(df, reader=reader)

# Splits the data into train_data and test_data
train_data, test_data = train_test_split(data, test_size=0.2)

In [15]:
# Custom recommendation system algorithm
class WeightedSum(AlgoBase):
    # Initializes algorithm using a list of models to serve as base
    def __init__(self, models):
        AlgoBase.__init__(self)

        self.models = models
        self.weights = [1/len(models)]*len(models)
    
    # Fits algorithm to data, optimizes weights
    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        
        # Splits trainset into train_data and optimize_data
        temp_df = pd.DataFrame(trainset.all_ratings(), columns=['user_id', 'item_id', 'rating'])
        temp_data = Dataset.load_from_df(temp_df, reader=reader)
        train_data, optimize_data = train_test_split(temp_data, test_size=0.4)

        # Fits base models to train_data
        for model in self.models:
            model.fit(train_data)
        
        # Optimizes weights using optimize_data through gradient descent
        change = 1
        threshold = 1e-4

        pred = self.test(optimize_data)
        acc = accuracy.rmse(pred)
        prev_acc = acc
        while change > threshold:
            # Copies current state of weights
            curr_weights = self.weights.copy()
            
            # Computes gradients
            gradients = []
            for i in range(len(self.weights)):
                weight = self.weights[i]

                self.weights[i] += 1e-5
                new_pred = self.test(optimize_data)
                new_acc = accuracy.rmse(new_pred)

                self.weights[i] = weight
                pred = self.test(optimize_data)
                acc = accuracy.rmse(pred)

                gradient = (new_acc - acc) / 1e-5
                gradients.append(gradient)

            # Restores state of weights
            self.weights = curr_weights

            # Modifies weights
            for i in range(len(self.weights)):
                self.weights[i] -= 0.01 * gradients[i]

            self.weights /= sum(self.weights)

            # Computes new accuracy
            pred = self.test(optimize_data)
            acc = accuracy.rmse(pred)
            change = abs(acc - prev_acc)
            prev_acc = acc
        
        return self

    # Estimates rating for given user and item
    def estimate(self, u , i):
        predictions = []
        for model in self.models:
            predictions.append(model.predict(u, i))

        final_prediction = 0
        for i in range(0, len(self.weights)):
            final_prediction += predictions[i].est*self.weights[i]

        return final_prediction

In [16]:
# Declares models for use in weighted sum
svd = SVD()
nmf = NMF()

colla_knn = surprise.KNNWithMeans()

models = [svd, nmf,colla_knn]

# Declares weighted sum, fits to train_data
weighted_sum = WeightedSum(models)
weighted_sum.fit(train_data)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.8568
RMSE: 3.8568
RMSE: 3.8568
RMSE: 3.8568
RMSE: 3.8568
RMSE: 3.8568
RMSE: 3.8568
RMSE: 3.8568


<__main__.WeightedSum at 0x2ba2a36d0>

In [6]:
baseline = BaselineOnly()
cross_validate(baseline, data, measures=['RMSE'], verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.4017  3.4113  3.4157  3.4090  3.4072  3.4090  0.0046  
Fit time          1.85    1.76    1.84    1.95    1.89    1.86    0.06    
Test time         0.87    0.87    0.35    0.35    0.85    0.66    0.25    


{'test_rmse': array([3.40174709, 3.41130826, 3.41568304, 3.40904479, 3.40718402]),
 'fit_time': (1.849841833114624,
  1.7617926597595215,
  1.8363456726074219,
  1.9479663372039795,
  1.8905467987060547),
 'test_time': (0.8651530742645264,
  0.8662881851196289,
  0.35010814666748047,
  0.3466312885284424,
  0.8547821044921875)}

In [17]:
# Reports accuracy
pred = weighted_sum.test(test_data)
accuracy.rmse(pred)

print("Optimal Weights: ", weighted_sum.weights)

RMSE: 3.5653
Optimal Weights:  [0.33326998 0.33336501 0.33336501]


In [33]:
#converts prediction vector to np.array
arr = np.array(pred)
pred_arr = np.array([i[:4] for i in arr])
#gets the values user id, item id, user's rating of item, and prediction score (can be expanded for more data)
df = pd.DataFrame(pred_arr, columns=['uid','iid','rating','score'])
#drop duplicate ISBN to prevent repeat predictions (currently most likely drops duplicate ISBNs without a rating first)
df = df.drop_duplicates('iid')
df["ISBN"] = df['iid'].apply(lambda x: id_to_isbn[x])
df = df.sort_values("score", ascending = False)
#combine all_books with all_ratings, with a left merge, drop duplicate ISBNs with priority based on book-rating
book_rating_df = all_ratings.merge(all_books, how="left", on="ISBN").sort_values("Book-Rating", ascending= False).drop_duplicates("ISBN")
rec_df = book_rating_df.loc[book_rating_df["ISBN"].isin(df["ISBN"][:5])][['ISBN','Book-Title','Book-Author','Book-Rating']]
rec_df


Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Rating
435558,0451411129,The Corset Diaries,Katie Macalister,10
433029,0679821481,Six by Seuss: A Treasury of Dr. Seuss Classics,Seuss,10
100562,039548930X,"The Return of the King (The Lord of the Rings,...",J. R. R. Tolkien,10
1040785,192946200X,Sluggy Freelance: When Holidays Attack! (Book 3),Peter Abrams,10
408177,0425179494,The Big Nap (Mommy-Track Mystery),Ayelet Waldman,9


In [16]:
#merge dataframes
books_ratings = all_books.merge(all_ratings, how="right", on="ISBN")
books_ratings = books_ratings.dropna().drop_duplicates("ISBN")
books_ratings = books_ratings[["ISBN", "Book-Author", "Book-Title","Year-Of-Publication", "Publisher","User-ID", "Book-Rating"]]
books_ratings


Unnamed: 0,ISBN,Book-Author,Book-Title,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,034545104X,M. J. Rose,Flesh Tones: A Novel,2002,Ballantine Books,276725,0
1,0155061224,Judith Rae,Rites of Passage,2001,Heinle,276726,5
2,0446520802,Nicholas Sparks,The Notebook,1996,Warner Books,276727,0
3,052165615X,Philip Prowse,Help!: Level 1,1999,Cambridge University Press,276729,3
4,0521795028,Sue Leather,The Amsterdam Connection : Level 4 (Cambridge ...,2001,Cambridge University Press,276729,6
...,...,...,...,...,...,...,...
1149701,0517145553,Douglas Adams,Mostly Harmless,1995,Random House Value Pub,276688,0
1149747,1575660792,Shirley Kennett,Gray Matter,1996,Kensington Publishing Corporation,276688,7
1149757,0590907301,Debbie Dadey,Triplet Trouble and the Class Trip (Triplet Tr...,1997,Apple,276690,0
1149770,0679752714,Judith Freeman,A Desert of Pure Feeling (Vintage Contemporaries),1997,Vintage Books USA,276704,0


In [17]:
spliter = surprise.model_selection.split.ShuffleSplit(n_splits=5,test_size=0.90)
for tmp_train_data, tmp_test_data in spliter.split(data):
    arr = np.array(tmp_train_data)
    items = []
    for x in tmp_train_data.all_items():
        items.append(id_to_isbn[x])
    testset_books = books_ratings.loc[books_ratings['ISBN'].isin(items)]
    break

testset_books = testset_books.drop_duplicates("Book-Title")

In [31]:
#This will use the exsiting training data set from above -> do not use currently
arr = np.array(train_data)
items = []
for x in train_data.all_items():
    items.append(id_to_isbn[x])
testset_books = books_ratings.loc[books_ratings['ISBN'].isin(items)]


In [18]:
def tokenize(field):
    stopwords = set(nltk.corpus.stopwords.words("english"))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    field = field.lower()
    field = re.sub(r"[{}]".format(string.punctuation), " ", field)
    # Removing numbers
    field = re.sub(r'\d+', ' ', field)
    # Removing special characters
    field = re.sub(r"[^a-zA-Z0-9]+", ' ', field)
    tokens = nltk.tokenize.word_tokenize(field)
    modified_tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if not word.lower() in stopwords]
    txt = " ".join(modified_tokens)
    return txt

df = testset_books[["Book-Author","Book-Title"]]
#df["all_text"] = df["Book-Author"] +" "+ df["Book-Title"] +" "+df["Publisher"] +" "+df["Year-Of-Publication"].astype(str)
df['all_text'] = testset_books[["Book-Author"]]
df.all_text = df.all_text.astype(str) 
df['tokenized'] = df.apply(lambda x: tokenize(x['all_text']), axis=1)
df = df[['tokenized','all_text','Book-Title']]
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_text'] = testset_books[["Book-Author"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.all_text = df.all_text.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokenized'] = df.apply(lambda x: tokenize(x['all_text']), axis=1)


Unnamed: 0,tokenized,all_text,Book-Title
0,j rose,M. J. Rose,Flesh Tones: A Novel
1,judith rae,Judith Rae,Rites of Passage
2,nicholas spark,Nicholas Sparks,The Notebook
3,philip prowse,Philip Prowse,Help!: Level 1
4,sue leather,Sue Leather,The Amsterdam Connection : Level 4 (Cambridge ...


In [19]:
import sklearn.feature_extraction.text
tfidf_maker = sklearn.feature_extraction.text.TfidfVectorizer()
df['tokenized'] = df['tokenized'].fillna("")
tfidf_matrix = tfidf_maker.fit_transform(df['all_text'])
tfidf_matrix.shape

(54039, 17573)

In [20]:
pd.DataFrame(
    tfidf_matrix.todense(),
    columns=tfidf_maker.get_feature_names_out(),
    index=df["Book-Title"]
).sample(15, axis=1).sample(10, axis=0)


Unnamed: 0_level_0,gina,gaunt,gold,lilla,winick,grady,heidelore,spoto,ruriko,posada,brizzi,antonella,wallraff,lou,marohn
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
The Velvet Promise,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Temporary Bride (Harlequin Presents, No 1238)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fireship,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Revolutionary Voices,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Before Night Falls,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Prodigal Summer: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Books of Blood, Vol. 1",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Shopping Guide for Caring Consumers 2002: A Guide to Products That Are Not Tested on Animals (Shopping Guide for Caring Consumers, 2002)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grimoire for the Apprentice Wizard,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Cat in the Hat Comes Back (I Can Read It All by Myself Beginner Books),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
from sklearn.metrics.pairwise import linear_kernel
cos_sim = linear_kernel(tfidf_matrix)
cos_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [22]:
# Create a dataframe from the cosine_sim variable with rows and columns in the form of book titles
cosine_sim_df = pd.DataFrame(cos_sim, index=df['Book-Title'], columns=df['Book-Title'])
print('Shape:', cosine_sim_df.shape)

# View the similarity matrix for each book title
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

Shape: (54039, 54039)


Book-Title,Matthew Arnold: A Literary Life (Literary Lives Series),Princess Diana: Her Life Story 1961-1997,The Disappearing Teacher,Writing on Both Sides of the Brain : Breakthrough Techniques for People Who Write,Circles Cycles in the Air
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"When You Come to a Fork in the Road, Take It! Inspiration and Wisdom from One of Baseball's Greatest Heroes",0.0,0.0,0.0,0.0,0.0
My Only Story,0.0,0.0,0.0,0.0,0.0
The Simple Art of Murder (Vintage Crime),0.0,0.0,0.0,0.0,0.0
"Stand - In Bride (Xmas) (Romance, 3681)",0.0,0.0,0.0,0.0,0.0
"It Worked for Me: From Thumb Sucking to Schoolyard Fights, Parents Reveal Their Secrets to Solving the Everyday Problems of Raising Kids",0.0,0.0,0.0,0.0,0.0
Miss Rumphius,0.0,0.0,0.0,0.0,0.0
"Richard Brautigan : A Confederate General from Big Sur, Dreaming of Babylon, and the Hawkline Monster (Three Books in the Manner of Their Original ed)",0.0,0.208198,0.0,0.0,0.0
"The Big Five-Oh: Facing, Fearing, and Fighting Fifty",0.0,0.0,0.0,0.0,0.0
Triple (Bookcassette(r) Edition),0.0,0.0,0.0,0.0,0.0
PÃ¡nico nuclear,0.0,0.0,0.0,0.0,0.0


In [23]:
book_title_test = "Charlotte's Web" # book title example

df[df['Book-Title'].eq(book_title_test)]

Unnamed: 0,tokenized,all_text,Book-Title
524,e b white,E. B. White,Charlotte's Web


In [24]:
def book_recommendation(book_title, similarity_data=cosine_sim_df, items=testset_books[['Book-Title','Book-Author']], k=5):
     # Retrieve data by using argpartition to partition indirectly along a given axis
     # Dataframe converted to numpy
     # Range(start, stop, step)
     index = similarity_data.loc[:,book_title].to_numpy().argpartition(range(-1, -k, -1))
     # Retrieve data with the greatest similarity from the existing index
     closest = similarity_data.columns[index[-1:-(k+2):-1]]
    
     # Drop book_title so that the name of the book you are looking for does not appear in the recommendation list
     closest = closest.drop(book_title, errors='ignore')
   

     return pd.DataFrame(closest).merge(items).drop_duplicates("Book-Title").head(k)

# Get recommendations for similar book titles
book_recommendation(book_title_test)

Unnamed: 0,Book-Title,Book-Author
0,Charlotte's Web (Trophy Newbery),E. B. White
1,The Sword in the Stone,T. H. White
2,The Pomegranate Tree Speaks from the Dictator'...,J.P. White
3,Charlottes Web Special Read Along Edition,E B White
4,The Trumpet of the Swan,E. B. White
