# Creating a Top 10 recommendations dataframe

Using the KNNBaseline algorithm, this notebook calculates all the predictions with the data for users with ratings 100-200

In [1]:
# imports
import numpy as np 
import pandas as pd 
import surprise as sp
from tqdm.notebook import tqdm
from collections import defaultdict
import pickle

In [2]:
# Load the data
books = pd.read_csv('../data/processed/books.csv')
ratings = pd.read_csv('../data/raw/ratings.csv')

# Load the genres
genres = pickle.load(open('../references/genres.sav', 'rb'))

# load the models
SVDpp = pickle.load(open('../models/SVDpp_100.sav', 'rb'))
KNNBaseline = pickle.load(open('../models/KNNBaseline_100.sav', 'rb'))

In [3]:
print(genres)

['art', 'biography', 'business', 'chick lit', "children's", 'christian', 'comics', 'contemporary', 'cookbooks', 'crime', 'fantasy', 'fiction', 'gay and lesbian', 'graphic novels', 'historical fiction', 'history', 'horror', 'humor and comedy', 'manga', 'memoir', 'music', 'mystery', 'nonfiction', 'paranormal', 'philosophy', 'poetry', 'psychology', 'religion', 'romance', 'science', 'science fiction', 'self help', 'suspense', 'spirituality', 'sports', 'thriller', 'travel', 'young adult', 'childrens', 'classic', 'dystopia', 'dystopian', 'graphic-novel', 'graphic-novels', 'harry-potter', 'historical', 'historical-fiction', 'humor', 'magic', 'mythology', 'non-fiction', 'plays', 'sci-fi', 'science-fiction', 'steampunk', 'urban-fantasy', 'vampires', 'writing', 'ya', 'young-adult']


In [4]:
%%time
def read_data_surprise (df, minstar=1, maxstar=5, col1='user_id', col2='book_id', col3='rating'):
    '''
    Produces a surpise library data object from original dataframe

    ---Parameters---

    df (Pandas DataFrame)
    minstar (int) minimum rating possible in dataset (default set to 1)
    maxstar (int) maximum rating possible in dataset (default set to 5)
    col1 (string) column name that MUST correspond the the users in the df
    col2 (string) column name that MUST corresponds the the items in the df
    col3 (string) column name that corresponds the the ratings of the items in the df

    ---Returns---
    surprise library data object to manipulate later

    '''
    # need to specify the rating_scale of stars (default 1-3 stars)
    reader = sp.Reader(rating_scale=(minstar, maxstar))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = sp.Dataset.load_from_df(df[[col1, col2, col3]], reader)

    return data

Wall time: 0 ns


In [5]:
%%time
def get_top(predictions, n=100):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    
    # Now we need to get the genres and only include books in the right genre
    # top_n is a defaultdict
    # we have a dict of tuples that have (book ID, pred_rating)
    # we can take the tuple[0] and pull the tag_1 tag_2 tag_3
    #print(len(top_n))
#     print(top_n)
          
#     for book_id, pred_rating in top_n.items(): 
#         book_genres=[]
#         book_ids_in_genre=[]
#         book = books[books.book_id == book_id]
#         book_genres.append(book.tag_1.values)
#         book_genres.append(book.tag_2.values)
#         book_genres.append(book.tag_3.values)
        
#         #print(type(book.tag_3))
#         #print(type(book.tag_3.values))
#         # print(book_genres)
        
#         if genre in book_genres: 
#             book_ids_in_genre.append(book_id)
#     for book_id in book_ids_in_genre: 
#         del top_n[book_id]
        
    #print(len(top_n))
    # Then sort the predictions for each user and retrieve the k highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

Wall time: 0 ns


In [6]:
%%time
def top_ten_df(df, algo_name='SVDpp'):
    '''
    inputs:
    df (Pandas DF) the dataframe that you would like to train on

    outputs:
    top_ten_df (DataFrame Pandas) returns a dataframe with the top ten predictions for every user in your original dataframe
    '''
    
    reader = sp.Reader(rating_scale=(1, 5))
    data = sp.Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)

    # First train an KNN Baseline algorithm on dataset
    trainset = data.build_full_trainset()
    
    if algo_name == 'SVDpp': 
        algo = SVDpp
    else: 
        algo = KNNBaseline
                    
    # algo = sp.KNNBaseline() # n_epochs= 18, lr_all= 0.01, reg_all= 0.175
    # algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset() # THIS TAKES THE MOST RAM
    predictions = algo.test(testset)
    sp.accuracy.rmse(predictions)

    # create a dictionary of predictions
#     for genre in tqdm(genres): 
#         top_n = get_top_n_by_genre(predictions, genre, n=10)

#         #Turn the dictionary into a df
#         top_ten_df = pd.DataFrame(top_n)
        
#         # save file in models
#         top_ten_df.to_csv('../models/by_genre/top_10_100_{}_{}.csv'.format(genre, algo_name))

    #create a dictionary of predictions
    top_n = get_top(predictions, n=100)

    #Turn the dictionary into a df
    top_ten_df = pd.DataFrame(top_n)

    return top_ten_df

Wall time: 0 ns


In [7]:
%%time
num_ratings = ratings.user_id.value_counts()
hundred_fifty_ids = num_ratings[num_ratings.values > 100].index
hundred_fifty = ratings[ratings.user_id.isin(hundred_fifty_ids)]

Wall time: 775 ms


In [8]:
%%time
def df_samp_unique_vals (df, percent, col1, col2=None):
    '''
    Takes a random sample of current dataframe while keeping a few column values unique to decrease matrix sparsity of sample

    ---Parameters---
    df (Pandas DataFrame)
    percent (float) enter a decimal of the percent sample you want
    col1 ("string") column name you want to keep retain unique values for (include quotation marks)
    col2 ("string") column name you want to keep retain unique values for (include quotation marks)

    ---Return---
    matrix stats of new df
    df_samp (Pandas DataFrame) as a percent sample of the original while keeping the columns entered unique
    '''
    x = df.book_id.nunique()
    y = df.user_id.nunique()
    print(f"Initial num of unique books: {x}")
    print(f"Initial num of unique users: {y}")
    print(f"Matrix size: {x*y}")
    print(f"Shape of df: {df.shape}")
    print(f"Density of matrix: {(df.shape[0])/(x*y)}")
    print("---------------------")

    # df.user_id.unique().sample(frac= percent) #(more efficient code to explore??)
    df_drop = df.drop_duplicates(subset=[col1])
    print (f"User drop: {len(df_drop)}")
    if col2:
        df_drop = df_drop.drop_duplicates(subset=[col2])
        print (f"Book drop: {len(df_drop)}")
    #take a sample of the unique values
    sample1 = df_drop.sample(frac= percent, random_state=101)#Random state = random seed for .sample
    print (f"length of entire sample w/ unique users & books: {len(sample1)}")

    #turn the unique routes & user names into a list to reference
    sample1= sample1.loc[:, [col1, col2]].values.T.ravel()
    lst1= sample1.tolist()

    #Filter out the original DF with only unique the unique values
    df_samp = df[(df[col1].isin(lst1)) & (df[col2].isin(lst1))]
    
    x = df_samp.book_id.nunique()
    y = df_samp.user_id.nunique()
    print(f"Final num of unique books: {x}")
    print(f"Final num of unique users: {y}")
    print(f"Matrix size: {x*y}")
    print(f"Shape of df: {df_samp.shape}")
    print(f"Density of matrix: {(df_samp.shape[0])/(x*y)}")

    return df_samp


Wall time: 0 ns


In [19]:
%%time
#create a 65% sample out of the dataframe
sample = df_samp_unique_vals(ratings, 0.65, "user_id", "book_id")

Initial num of unique books: 10000
Initial num of unique users: 53424
Matrix size: 534240000
Shape of df: (5976479, 3)
Density of matrix: 0.011186880428271938
---------------------
User drop: 53424
Book drop: 7456
length of entire sample w/ unique users & books: 4846
Final num of unique books: 5745
Final num of unique users: 8757
Matrix size: 50308965
Shape of df: (687480, 3)
Density of matrix: 0.013665158883709892
Wall time: 403 ms


In [20]:
%%time
#not needed for below cells
reader = sp.Reader(rating_scale=(1, 5))
sample_data = sp.Dataset.load_from_df(sample[['user_id','book_id','rating']], reader)

Wall time: 431 ms


In [21]:
%%time
top_10_100_ratings = top_ten_df(sample)

RMSE: 0.4945
Wall time: 38min 45s


In [22]:
top_10_100_ratings

Unnamed: 0,1,2,4,8,11,15,18,22,24,25,...,38075,52199,16102,5242,12791,42977,41773,34531,7553,27329
0,"(3491, 4.507298355237171)","(7947, 4.779225161297545)","(608, 5)","(7947, 4.779225161297545)","(5376, 4.749559965051951)","(7947, 4.779225161297545)","(3491, 4.525946856862543)","(5376, 4.598991846094548)","(7254, 4.782543502576779)","(7947, 4.779225161297545)",...,"(7947, 4.779225161297545)","(7947, 4.779225161297545)","(7947, 4.779225161297545)","(3275, 5)","(4, 5)","(7947, 4.51710675160214)","(7947, 4.779225161297545)","(7039, 5)","(608, 5)","(3753, 5)"
1,"(3248, 4.492302435402482)","(9076, 4.693240946649816)","(422, 4.971727189076072)","(9076, 4.693240946649816)","(3946, 4.644198855876651)","(9076, 4.693240946649816)","(4483, 4.52341885123358)","(5202, 4.592741024719189)","(5580, 4.747829800337142)","(9076, 4.693240946649816)",...,"(9076, 4.693240946649816)","(9076, 4.693240946649816)","(9076, 4.693240946649816)","(422, 5)","(545, 5)","(3491, 4.491218651635088)","(9076, 4.693240946649816)","(4483, 5)","(422, 4.995499816407454)","(964, 5)"
2,"(608, 4.459816506487486)","(5580, 4.688202276065179)","(988, 4.891408185259499)","(5580, 4.688202276065179)","(964, 4.636097983113755)","(5580, 4.688202276065179)","(5580, 4.507874363574531)","(7947, 4.539133098505784)","(6361, 4.736745260673515)","(5580, 4.688202276065179)",...,"(5580, 4.688202276065179)","(5580, 4.688202276065179)","(5580, 4.688202276065179)","(464, 5)","(5580, 5)","(610, 4.4083696564485075)","(5580, 4.688202276065179)","(6361, 5)","(9076, 4.896211878011564)","(7254, 5)"
3,"(7947, 4.450359516764647)","(862, 4.683199725119521)","(1877, 4.873834006226008)","(862, 4.683199725119521)","(1788, 4.6105198160697585)","(862, 4.683199725119521)","(8233, 4.49323404290062)","(6361, 4.489839546486775)","(4483, 4.725220237990525)","(862, 4.683199725119521)",...,"(862, 4.683199725119521)","(862, 4.683199725119521)","(862, 4.683199725119521)","(2101, 4.995699771100003)","(4653, 5)","(5580, 4.407493600942811)","(862, 4.683199725119521)","(5580, 5)","(25, 4.874870083324913)","(7639, 5)"
4,"(339, 4.433124528450508)","(6361, 4.676444928059796)","(3275, 4.860545815901677)","(6361, 4.676444928059796)","(1266, 4.571994523059461)","(6361, 4.676444928059796)","(1788, 4.470077687646744)","(7883, 4.476092440043502)","(1788, 4.700766827885539)","(6361, 4.676444928059796)",...,"(6361, 4.676444928059796)","(6361, 4.676444928059796)","(6361, 4.676444928059796)","(8663, 4.970837172964066)","(5376, 5)","(2767, 4.402131896102272)","(6361, 4.676444928059796)","(780, 5)","(3753, 4.871815695046115)","(8455, 5)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"(3396, 4.136507086333953)","(8495, 4.373739341956579)","(4301, 4.526428483504151)","(4301, 4.376425372222999)","(1650, 4.27220721449196)","(5422, 4.373967301393302)","(485, 4.1343284015884905)","(6902, 4.17835466906321)","(721, 4.341517416196756)","(5422, 4.373967301393302)",...,"(4301, 4.376425372222999)","(4301, 4.376425372222999)","(4301, 4.376425372222999)","(5181, 4.619808831114074)","(2240, 4.688504160771039)","(5789, 4.145542754206448)","(5422, 4.373967301393302)","(7894, 4.826368675628633)","(6892, 4.539410898159269)","(8159, 5)"
96,"(6228, 4.135711570663965)","(5991, 4.373026684639209)","(1047, 4.525937909689778)","(5422, 4.373967301393302)","(2174, 4.271765651824974)","(8495, 4.373739341956579)","(592, 4.132407028613393)","(8259, 4.177055820479226)","(862, 4.3410376079218365)","(8495, 4.373739341956579)",...,"(5422, 4.373967301393302)","(5422, 4.373967301393302)","(5422, 4.373967301393302)","(4061, 4.618993440582175)","(2068, 4.686940333364616)","(3234, 4.145263612182767)","(8495, 4.373739341956579)","(9004, 4.825750208464909)","(739, 4.53615831953296)","(7352, 5)"
97,"(2174, 4.1322744762440315)","(5202, 4.371504658406494)","(2648, 4.525932958200727)","(8495, 4.373739341956579)","(5202, 4.270480957602712)","(5991, 4.373026684639209)","(2151, 4.132348439075234)","(8393, 4.176581731142768)","(2151, 4.339874617631112)","(5991, 4.373026684639209)",...,"(8495, 4.373739341956579)","(8495, 4.373739341956579)","(8495, 4.373739341956579)","(1047, 4.616889081476855)","(1158, 4.686008630174047)","(2669, 4.143644336881434)","(5202, 4.371504658406494)","(1308, 4.82571447273409)","(8455, 4.535889541230775)","(5515, 5)"
98,"(1266, 4.1320345370846825)","(4325, 4.371190096982585)","(8333, 4.525845316191826)","(5991, 4.373026684639209)","(7364, 4.269782955931763)","(5202, 4.371504658406494)","(4301, 4.131647818942617)","(8870, 4.1763276059323955)","(4061, 4.339280739526069)","(5202, 4.371504658406494)",...,"(5991, 4.373026684639209)","(5991, 4.373026684639209)","(5991, 4.373026684639209)","(6659, 4.616134695939511)","(464, 4.683787320709736)","(9536, 4.143617922128082)","(4325, 4.371190096982585)","(444, 4.81764545224099)","(4688, 4.53458612742705)","(3189, 5)"


In [23]:
# top_10 = top_ten_df(sample)
# user_id = "589"
# print(type(top_10.columns[0]))
# if user_id not in top_10.columns: 
#         print("User id", user_id, "not in the database.")
# top = pd.DataFrame()
# book_list = []
# print("Top predicted books for user id", user_id)
# #print(top_10[user_id])
# for book_id, rating in top_10[int(user_id)]:
    
#     to_add = books[books.book_id == book_id]
#     #print(to_add)
#     to_add['predicted_rating'] = rating
#     #print(to_add)
#     top = pd.concat([top, to_add]) 
#     top = top.sort_values(['predicted_rating','bayesian_avg'], ascending=[False, False])

In [24]:
# top

In [25]:
# save the file
top_10_100_ratings.to_csv('../data/processed/top_100_100_SVDpp.csv')

In [26]:
%%time
top_10_100_ratings = top_ten_df(sample, algo_name='KNNBaseline')

RMSE: 0.4336
Wall time: 2h 18min 39s


In [27]:
top_10_100_ratings

Unnamed: 0,1,2,4,8,11,15,18,22,24,25,...,38075,52199,16102,5242,12791,42977,41773,34531,7553,27329
0,"(4676, 4.713498742416295)","(862, 4.777977045440078)","(4676, 4.936617383807282)","(862, 4.777977045440078)","(4676, 4.8650418607368495)","(862, 4.777977045440078)","(4676, 4.720429496663909)","(4676, 4.789614094448979)","(4676, 4.894389533601471)","(862, 4.777977045440078)",...,"(862, 4.777977045440078)","(862, 4.777977045440078)","(862, 4.777977045440078)","(4676, 5)","(8946, 5)","(4676, 4.79362850759539)","(862, 4.777977045440078)","(3753, 5)","(4676, 5)","(3753, 5)"
1,"(7947, 4.5268151122590625)","(1788, 4.748665552320921)","(9076, 4.694545663288844)","(1788, 4.748665552320921)","(7254, 4.587570602492985)","(1788, 4.748665552320921)","(8946, 4.546398709236097)","(9076, 4.61953787434699)","(862, 4.776996838915768)","(1788, 4.748665552320921)",...,"(1788, 4.748665552320921)","(1788, 4.748665552320921)","(1788, 4.748665552320921)","(7947, 5)","(3660, 5)","(9076, 4.61171529146489)","(1788, 4.748665552320921)","(8946, 5)","(7947, 4.849871664863939)","(103, 5)"
2,"(8946, 4.515804818820051)","(6361, 4.729073026644194)","(8946, 4.665666827126575)","(6361, 4.729073026644194)","(7947, 4.587527267822143)","(6361, 4.729073026644194)","(862, 4.511844257051887)","(5580, 4.565944264502711)","(7947, 4.63039878514021)","(6361, 4.729073026644194)",...,"(6361, 4.729073026644194)","(6361, 4.729073026644194)","(6361, 4.729073026644194)","(862, 4.9590492643402335)","(4676, 5)","(5580, 4.600505601082409)","(6361, 4.729073026644194)","(7039, 5)","(5580, 4.835607319836763)","(1380, 5)"
3,"(862, 4.494602301608409)","(4483, 4.72674770044625)","(7254, 4.623443910634503)","(4483, 4.72674770044625)","(8946, 4.5795492756168406)","(4483, 4.72674770044625)","(3491, 4.508507707536653)","(7881, 4.540034138561248)","(9076, 4.621546481071537)","(4483, 4.72674770044625)",...,"(4483, 4.72674770044625)","(4483, 4.72674770044625)","(4483, 4.72674770044625)","(9076, 4.947743525227287)","(7883, 5)","(862, 4.589578917649023)","(4483, 4.72674770044625)","(6361, 5)","(8946, 4.832600551109796)","(7254, 5)"
4,"(6902, 4.424350758367829)","(5580, 4.719719287144723)","(7947, 4.6173898519779515)","(5580, 4.719719287144723)","(8663, 4.555480691086405)","(5580, 4.719719287144723)","(5580, 4.503110157258827)","(862, 4.538243073332429)","(6361, 4.617337868002317)","(5580, 4.719719287144723)",...,"(5580, 4.719719287144723)","(5580, 4.719719287144723)","(5580, 4.719719287144723)","(3753, 4.927265934845718)","(7947, 5)","(8946, 4.587274561886334)","(5580, 4.719719287144723)","(9076, 5)","(1788, 4.796104336791346)","(5730, 5)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"(8333, 4.134611390236774)","(7364, 4.406716251368796)","(1029, 4.334040932134603)","(757, 4.414972604506524)","(4325, 4.27064542165708)","(4971, 4.4080060682212)","(757, 4.155679484779115)","(2507, 4.188603608977696)","(746, 4.309393957433111)","(757, 4.414972604506524)",...,"(757, 4.414972604506524)","(757, 4.414972604506524)","(2743, 4.412300297768892)","(8982, 4.591363086272802)","(4344, 4.697678643006842)","(3230, 4.215912395696075)","(757, 4.414972604506524)","(1905, 4.7960662967723895)","(8259, 4.500471673502994)","(8976, 5)"
96,"(7569, 4.1341176305415654)","(8455, 4.405136845396076)","(1895, 4.3332951620616615)","(2743, 4.412300297768892)","(1265, 4.269748602154365)","(7364, 4.406716251368796)","(4061, 4.153834267218752)","(2411, 4.18755274528385)","(2043, 4.307024905000745)","(2743, 4.412300297768892)",...,"(2743, 4.412300297768892)","(2743, 4.412300297768892)","(5202, 4.410390480426765)","(2272, 4.589982897889328)","(2450, 4.697422601209538)","(958, 4.215162419975946)","(2743, 4.412300297768892)","(9781, 4.793433645329916)","(1451, 4.497963313627701)","(870, 5)"
97,"(5275, 4.133761323193745)","(6298, 4.404973340344802)","(2889, 4.329558797248195)","(5202, 4.410390480426765)","(3472, 4.261249210814182)","(8455, 4.405136845396076)","(2064, 4.152881097829046)","(1577, 4.186883974909664)","(6991, 4.304790204757767)","(5202, 4.410390480426765)",...,"(5202, 4.410390480426765)","(5202, 4.410390480426765)","(4971, 4.4080060682212)","(7550, 4.584617610715743)","(161, 4.6961016658061245)","(4410, 4.209903964777696)","(5202, 4.410390480426765)","(6995, 4.7931478131006084)","(2937, 4.497863699253519)","(466, 5)"
98,"(4653, 4.133100833958236)","(466, 4.403980325232698)","(2386, 4.329153215230759)","(4971, 4.4080060682212)","(4061, 4.260873137871568)","(6298, 4.404973340344802)","(2889, 4.151864827887605)","(7368, 4.1859071421388006)","(4921, 4.304688255011996)","(4971, 4.4080060682212)",...,"(4971, 4.4080060682212)","(4971, 4.4080060682212)","(7364, 4.406716251368796)","(2937, 4.584051095800863)","(958, 4.696070260378476)","(2831, 4.209269862380026)","(4971, 4.4080060682212)","(3946, 4.792249150154248)","(4620, 4.49748474485379)","(4874, 5)"


In [28]:
# save the file
top_10_100_ratings.to_csv('../data/processed/top_100_100_genre_KNNBaseline.csv')