# Creating a Top 10 recommendations dataframe

Using the KNNBaseline algorithm, this notebook calculates all the predictions with the data for users with ratings 151-200

In [2]:
# imports
import numpy as np 
import pandas as pd 
import surprise as sp
from collections import defaultdict
import pickle

In [3]:
# Load the data
books = pd.read_csv('../data/processed/books.csv')
ratings = pd.read_csv('../data/raw/ratings.csv')

# load the models
SVDpp = pickle.load(open('../models/SVDpp_150.sav', 'rb'))
KNNBaseline = pickle.load(open('../models/KNNBaseline_150.sav', 'rb'))

In [4]:
%%time
def read_data_surprise (df, minstar=1, maxstar=3, col1='user_id', col2='route', col3='rating'):
    '''
    Produces a surpise library data object from original dataframe

    ---Parameters---

    df (Pandas DataFrame)
    minstar (int) minimum rating possible in dataset (default set to 1)
    maxstar (int) maximum rating possible in dataset (default set to 5)
    col1 (string) column name that MUST correspond the the users in the df
    col2 (string) column name that MUST corresponds the the items in the df
    col3 (string) column name that corresponds the the ratings of the items in the df

    ---Returns---
    surprise library data object to manipulate later

    '''
    # need to specify the rating_scale of stars (default 1-3 stars)
    reader = sp.Reader(rating_scale=(minstar, maxstar))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = sp.Dataset.load_from_df(df[[col1, col2, col3]], reader)

    return data

Wall time: 999 µs


In [5]:
%%time
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

Wall time: 0 ns


In [8]:
%%time
def top_ten_df (df, algo_name='SVDpp'):
    '''
    inputs:
    df (Pandas DF) the dataframe that you would like to train on

    outputs:
    top_ten_df (DataFrame Pandas) returns a dataframe with the top ten predictions for every user in your original dataframe
    '''
    
    reader = sp.Reader(rating_scale=(1, 5))
    data = sp.Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)

    # First train an KNN Baseline algorithm on dataset
    trainset = data.build_full_trainset()
    
    if algo_name == 'SVDpp': 
        algo = SVDpp
    else: 
        algo = KNNBaseline
                    
    # algo = sp.KNNBaseline() # n_epochs= 18, lr_all= 0.01, reg_all= 0.175
    # algo.fit(trainset)

    # Than predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()# THIS TAKES THE MOST RAM
    predictions = algo.test(testset)
    sp.accuracy.rmse(predictions)

    #create a dictionary of predictions
    top_n = get_top_n(predictions, n=10)

    #Turn the dictionary into a df
    top_ten_df = pd.DataFrame(top_n)

    return top_ten_df

Wall time: 0 ns


In [9]:
%%time
num_ratings = ratings.user_id.value_counts()
hundred_fifty_ids = num_ratings[num_ratings.values > 150].index
hundred_fifty = ratings[ratings.user_id.isin(hundred_fifty_ids)]

Wall time: 146 ms


In [10]:
%%time
def df_samp_unique_vals (df, percent, col1, col2=None):
    '''
    Takes a random sample of current dataframe while keeping a few column values unique to decrease matrix sparsity of sample

    ---Parameters---
    df (Pandas DataFrame)
    percent (float) enter a decimal of the percent sample you want
    col1 ("string") column name you want to keep retain unique values for (include quotation marks)
    col2 ("string") column name you want to keep retain unique values for (include quotation marks)

    ---Return---
    matrix stats of new df
    df_samp (Pandas DataFrame) as a percent sample of the original while keeping the columns entered unique
    '''
    x = df.book_id.nunique()
    y = df.user_id.nunique()
    print(f"Initial num of unique books: {x}")
    print(f"Initial num of unique users: {y}")
    print(f"Matrix size: {x*y}")
    print(f"Shape of df: {df.shape}")
    print(f"Density of matrix: {(df.shape[0])/(x*y)}")
    print("---------------------")

    # df.user_id.unique().sample(frac= percent) #(more efficient code to explore??)
    df_drop = df.drop_duplicates(subset=[col1])
    print (f"User drop: {len(df_drop)}")
    if col2:
        df_drop = df_drop.drop_duplicates(subset=[col2])
        print (f"Book drop: {len(df_drop)}")
    #take a sample of the unique values
    sample1 = df_drop.sample(frac= percent, random_state=101)#Random state = random seed for .sample
    print (f"length of entire sample w/ unique users & books: {len(sample1)}")

    #turn the unique routes & user names into a list to reference
    sample1= sample1.loc[:, [col1, col2]].values.T.ravel()
    lst1= sample1.tolist()

    #Filter out the original DF with only unique the unique values
    df_samp = df[(df[col1].isin(lst1)) & (df[col2].isin(lst1))]
    
    x = df_samp.book_id.nunique()
    y = df_samp.user_id.nunique()
    print(f"Final num of unique books: {x}")
    print(f"Final num of unique users: {y}")
    print(f"Matrix size: {x*y}")
    print(f"Shape of df: {df_samp.shape}")
    print(f"Density of matrix: {(df_samp.shape[0])/(x*y)}")

    return df_samp


Wall time: 0 ns


In [11]:
%%time
#create a 65% sample out of the dataframe
sample = df_samp_unique_vals(ratings, .65, "user_id", "book_id")

Initial num of unique books: 10000
Initial num of unique users: 53424
Matrix size: 534240000
Shape of df: (5976479, 3)
Density of matrix: 0.011186880428271938
---------------------
User drop: 53424
Book drop: 7456
length of entire sample w/ unique users & books: 4846
Final num of unique books: 5745
Final num of unique users: 8757
Matrix size: 50308965
Shape of df: (687480, 3)
Density of matrix: 0.013665158883709892
Wall time: 400 ms


In [12]:
%%time
#not needed for below cell
reader = sp.Reader(rating_scale=(1, 5))
sample_data = sp.Dataset.load_from_df(sample[['user_id','book_id','rating']], reader)

Wall time: 459 ms


In [13]:
%%time
top_10_150_ratings = top_ten_df(sample)

RMSE: 0.2981
Wall time: 11min 13s


In [16]:
# save the file
top_10_150_ratings.to_csv('../data/processed/top_10_SVDpp.csv')

In [17]:
%%time
top_10_150_ratings = top_ten_df(sample, algo_name='KNNBaseline')

RMSE: 0.2630
Wall time: 11min 54s


In [19]:
# save the file
top_10_150_ratings.to_csv('../data/processed/top_10_KNNBaseline.csv')