In [24]:
import pandas as pd
from surprise.model_selection import train_test_split
from surprise import Dataset, Reader, SVD, accuracy, NMF, AlgoBase 
from scipy.optimize import minimize
import numpy as np

In [25]:
all_books = pd.read_csv('data/Books.csv')
all_ratings = pd.read_csv('data/Ratings.csv')
all_users = pd.read_csv('data/Users.csv')
all_books.head()

  all_books = pd.read_csv('data/Books.csv')


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [26]:
# Checks if item is a valid ISBN
def is_valid_isbn(isbn):
     if len(isbn) != 10: return False
     if not isbn.isalnum(): return False

     return True

# Converts ISBN to numerical ID
def convert_isbn_to_id(isbn, isbn_to_id):
    return isbn_to_id.get(isbn, None)

data_list = []
isbn_to_id = {}
id_counter = 0

# Converts all ISBNs in data to numerical IDs
with open('data/Ratings.csv', 'r') as file:
    next(file)
    for line in file:
        # Skips misformatted items that would cause an error
        try:
            item_id, isbn, rating = line.strip().split(',')
        except:
            continue
        
        # Skips misformatted items (not valid ISBNs)
        if not is_valid_isbn(isbn):
            continue

        # Converts ISBN to numerical ID
        if isbn not in isbn_to_id:
            isbn_to_id[isbn] = id_counter
            id_counter += 1
        
        # Builds dictionary
        item_id = isbn_to_id[isbn]

        # Add data to list of tuples
        data_list.append((item_id, item_id, float(rating)))

# Creates inverted dictionary, use to convert numerical ID to ISBN for reporting results
id_to_isbn = {v: k for k, v in isbn_to_id.items()}

# Converts list to pandas data frame
df = pd.DataFrame(data_list, columns=['user_id', 'item_id', 'rating'])

# Creates Reader to extract data, uses it to load data
reader = Reader(line_format="user item rating", sep=',', rating_scale=(1,10))
data = Dataset.load_from_df(df, reader=reader)

# Splits the data into train_data and test_data
train_data, test_data = train_test_split(data, test_size=0.5)

In [27]:
# Custom weighted sum recommendation system algorithm
class WeightedSum(AlgoBase):
    # Initializes algorithm using a list of models to serve as base
    def __init__(self, models, learn_rate=0.01, threshold=1e-4, perturbation=1e-5):
        AlgoBase.__init__(self)

        self.models = models
        self.weights = [1/len(models)]*len(models)
        self.learn_rate = learn_rate
        self.threshold = threshold
        self.perturbation = perturbation
    
    # Fits algorithm to data, optimizes weights
    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        
        # Splits trainset into train_data and optimize_data
        temp_df = pd.DataFrame(trainset.all_ratings(), columns=['user_id', 'item_id', 'rating'])
        temp_data = Dataset.load_from_df(temp_df, reader=reader)
        train_data, optimize_data = train_test_split(temp_data, test_size=0.4)

        # Fits base models to train_data
        for model in self.models:
            model.fit(train_data)
        
        # Optimizes weights using optimize_data through gradient descent
        change = 1

        pred = self.test(optimize_data)
        acc = accuracy.rmse(pred)
        prev_acc = acc
        while change > self.threshold:
            # Copies current state of weights
            curr_weights = self.weights.copy()
            
            # Computes gradients
            gradients = []
            for i in range(len(self.weights)):
                weight = self.weights[i]

                self.weights[i] += self.perturbation
                new_pred = self.test(optimize_data)
                new_acc = accuracy.rmse(new_pred)

                self.weights[i] = weight
                pred = self.test(optimize_data)
                acc = accuracy.rmse(pred)

                gradient = (new_acc - acc) / self.perturbation
                gradients.append(gradient)

            # Restores state of weights
            self.weights = curr_weights

            # Modifies weights
            for i in range(len(self.weights)):
                self.weights[i] -= max(self.learn_rate * gradients[i], 0)

            # Ensures weights sum to 1
            self.weights /= sum(self.weights)

            # Computes new accuracy
            pred = self.test(optimize_data)
            acc = accuracy.rmse(pred)
            change = abs(acc - prev_acc)
            prev_acc = acc
        
        return self

    # Estimates rating for given user and item
    def estimate(self, u , i):
        predictions = []
        for model in self.models:
            predictions.append(model.predict(u, i))

        final_prediction = 0
        for i in range(0, len(self.weights)):
            final_prediction += predictions[i].est*self.weights[i]

        return final_prediction

In [28]:
class GlobalMean(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        self.mean_rating = trainset.global_mean

        return self

    def estimate(self, u, i):

        return self.mean_rating
    
class UserMean(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        self.means = {}
        for user_id in trainset.all_users():
            user_ratings = [0]
            for rating in trainset.ur[user_id]:
                _, score = rating
                if score != 0:
                    user_ratings.append(score)

            if len(user_ratings) == 1:
                user_mean = trainset.global_mean
            else:
                user_mean = sum(user_ratings) / (len(user_ratings)-1)         

            self.means[user_id] = user_mean

        return self

    def estimate(self, u, i):
        return self.means.get(u, self.trainset.global_mean)

class ItemMean(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        self.means = {}
        for item_id in trainset.all_users():
            item_ratings = [0]
            for rating in trainset.ir[item_id]:
                _, score = rating
                if score != 0:
                    item_ratings.append(score)

            if len(item_ratings) == 1:
                user_mean = trainset.global_mean
            else:
                user_mean = sum(item_ratings) / (len(item_ratings)-1)         

            self.means[item_id] = user_mean

        return self

    def estimate(self, u, i):
        return self.means.get(i, self.trainset.global_mean)

In [30]:
# Declares models for use in weighted sum (NOTE: this takes ~200min)
svd = SVD()
nmf = NMF()
gm = GlobalMean()
um = UserMean()
im = ItemMean()

models = [svd, nmf, gm, um, im]

# Declares weighted sum, fits to train_data
weighted_sum = WeightedSum(models)
weighted_sum.fit(train_data)

RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1309
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1223
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1136
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.1049
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0960
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0872
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783
RMSE: 4.0783

<__main__.WeightedSum at 0x1db07ea2990>

In [31]:
# Reports accuracy
predictions = weighted_sum.test(test_data)
accuracy.rmse(predictions)

print("Optimal Weights: ", weighted_sum.weights)

RMSE: 3.8456
Optimal Weights:  [ 0.29373487  0.1645562   0.61503766 -0.03666437 -0.03666437]


In [32]:
#converts prediction vector to np.array
arr = np.array(predictions)
pred_arr = np.array([i[:4] for i in arr])
#gets the values user id, item id, user's rating of item, and prediction score (can be expanded for more data)
df = pd.DataFrame(pred_arr, columns=['uid','iid','rating','score'])
#drop duplicate ISBN to prevent repeat predictions (currently most likely drops duplicate ISBNs without a rating first)
df = df.drop_duplicates('iid')
df["ISBN"] = df['iid'].apply(lambda x: id_to_isbn[x])
df = df.sort_values("score", ascending = False)
for isbn in df['ISBN'][:5]:
    recc_book = all_books.loc[all_books["ISBN"] == isbn]['Book-Title'].to_string(index = False)
    print(recc_book)

The Shrinking of Treehorn
The Lion, the Witch and the Wardrobe (rpkg) (Na...
Series([], )
Deep Thoughts
Maus 1. Mein Vater kotzt Geschichte aus. Die Ge...
