# Predictions

I decided to take the top 100 books recommended for the users and recommend the top 10 that matched at least 2 of the user's top 5 genres. 

In [1]:
# Import the necessary libraries 
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Read in the dataframes 
books = pd.read_csv('../data/processed/books.csv')
ratings = pd.read_csv('../data/processed/ratings.csv')
top_10_SVDpp= pd.read_csv('../data/processed/top_10_100_SVDpp.csv', index_col=0)
top_10_KNNBaseline = pd.read_csv('../data/processed/top_10_100_KNNBaseline.csv', index_col=0)

top_100_SVDpp= pd.read_csv('../data/processed/top_100_100_SVDpp.csv', index_col=0)
top_100_KNNBaseline = pd.read_csv('../data/processed/top_100_100_genre_KNNBaseline.csv', index_col=0)

In [3]:
def top_ten_prediction(user_id, top_10, books):
    if user_id not in top_10.columns: 
        print("User id", user_id, "not in the database.")
        
        # cold_start
        cold = pd.DataFrame(books.sort_values('bayesian_avg', ascending=False)[:10])
        cold['predicted_rating'] = cold['bayesian_avg']
        cold = cold.sort_values(['predicted_rating','bayesian_avg'], ascending=[False, False])
        #print(cold)
        return(cold)
    
    else:
        top = pd.DataFrame()
        book_list = []
        print("Top predicted books for user id", user_id)
        #print(top_10[user_id])
        for book_id in top_10[user_id]:
            to_add = books[books.book_id == eval(book_id)[0]]
            #print(to_add)
            to_add['predicted_rating'] = eval(book_id)[1]
            #print(to_add)
            top = pd.concat([top, to_add]) 
            top = top.sort_values(['predicted_rating','bayesian_avg'], ascending=[False, False])
        #print(top)
    return(top)

In [4]:
def top_ten_genre_prediction(user_id, top_10, books): 
    if user_id not in top_10.columns: 
        print("User id", user_id, "not in the database.")
        
        # cold_start
        cold = pd.DataFrame(books.sort_values('bayesian_avg', ascending=False)[:10])
        cold['predicted_rating'] = cold['bayesian_avg']
        cold = cold.sort_values(['predicted_rating','bayesian_avg'], ascending=[False, False])
        #print(cold)
        return(cold)
    
    else:
        genres = get_top_genres(user_id)
        top = pd.DataFrame()
        book_list = []
        # while(top.shape[0]>9): 
        for book_id in top_10[user_id]:
            to_add = books[books.book_id == eval(book_id)[0]]
            if to_add.tag_1.values not in genres and to_add.tag_2.values not in genres and to_add.tag_3.values not in genres: 
                #print("Not in genre")
                continue
            else:
                #print("In genres")
                #print(to_add)
                to_add['predicted_rating'] = eval(book_id)[1]
                #print(to_add)
                top = pd.concat([top, to_add]) 
                top = top.sort_values(['predicted_rating','bayesian_avg'], ascending=[False, False])
        print("Top predicted books for user id", user_id)
        return(top)

In [5]:
def top_ten_2genre_prediction(user_id, top_10, books): 
    if user_id not in top_10.columns: 
        print("User id", user_id, "not in the database.")
        
        # cold_start
        cold = pd.DataFrame(books.sort_values('bayesian_avg', ascending=False)[:10])
        cold['predicted_rating'] = cold['bayesian_avg']
        cold = cold.sort_values(['predicted_rating','bayesian_avg'], ascending=[False, False])
        #print(cold)
        return(cold)
    
    else:
        genres = get_top_genres(user_id)
        top = pd.DataFrame()
        book_list = []
        # while(top.shape[0]>9): 
        for book_id in top_10[user_id]:
            to_add = books[books.book_id == eval(book_id)[0]]
            book_tags = [x for x in [to_add.tag_1.values, to_add.tag_2.values, to_add.tag_3.values] if 'nan' not in x]
            if len(book_tags) > 1 and not pd.isnull(book_tags).any(): 
                if(len(np.intersect1d(book_tags, genres))> 1): 
                    to_add['predicted_rating'] = eval(book_id)[1]
                    top = pd.concat([top, to_add]) 
                    top = top.sort_values(['predicted_rating','bayesian_avg'], ascending=[False, False])
        return(top)

In [6]:
def get_top_genres(user_id): 
    user_id = int(user_id)
    rts = pd.merge(ratings[ratings.user_id == user_id], books, on='book_id')
    genres = {}
    for tag in rts.tag_1[:15].append(rts.tag_2[:15]).append(rts.tag_3[:15]): 
        if tag not in genres: 
            genres[tag] = 1
        else: 
            genres[tag]+=1
    top_genres = Counter(genres)
    top_five_genres = [x for x, y in top_genres.most_common(5)]
    #print("top genres for user", user_id, top_five_genres)
    return(top_five_genres)

### Predict for a user in the database

#### KNNBaseline

In [7]:
#Predict users top_ten books
top_books = top_ten_2genre_prediction("1234", top_100_KNNBaseline, books)
top_books.reset_index(inplace=True)
print([x for x in top_books.title][:10])
print("Top predicted books for user id", "1234")
top_books[['book_id', 'title','predicted_rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']].head(10)

['The Way of Kings, Part 1 (The Stormlight Archive #1.1)', 'A Court of Mist and Fury (A Court of Thorns and Roses, #2)', 'Words of Radiance (The Stormlight Archive, #2)', 'Harry Potter Boxset (Harry Potter, #1-7)', 'A Song of Ice and Fire (A Song of Ice and Fire, #1-4)', 'Mark of the Lion Trilogy', 'Collected Fictions', 'The Way of Kings (The Stormlight Archive, #1)', 'Harry Potter Collection (Harry Potter, #1-6)', 'BookRags Summary:  A Storm of Swords']
Top predicted books for user id 1234


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_add['predicted_rating'] = eval(book_id)[1]


Unnamed: 0,book_id,title,predicted_rating,bayesian_avg,tag_1,tag_2,tag_3
0,9141,"The Way of Kings, Part 1 (The Stormlight Archi...",4.141386,4.082211,fantasy,fiction,magic
1,1308,A Court of Mist and Fury (A Court of Thorns an...,4.129135,4.399902,magic,fiction,fantasy
2,862,"Words of Radiance (The Stormlight Archive, #2)",4.049994,4.46031,fantasy,fiction,magic
3,422,"Harry Potter Boxset (Harry Potter, #1-7)",4.038917,4.479617,fiction,fantasy,ya
4,2149,A Song of Ice and Fire (A Song of Ice and Fire...,3.987012,4.193842,fantasy,fiction,sci-fi
5,8854,Mark of the Lion Trilogy,3.933624,3.995821,historical-fiction,christian,fiction
6,5754,Collected Fictions,3.92955,4.07889,fiction,classic,fantasy
7,562,"The Way of Kings (The Stormlight Archive, #1)",3.929173,4.384686,fantasy,fiction,magic
8,3753,"Harry Potter Collection (Harry Potter, #1-6)",3.925811,4.126539,fantasy,fiction,harry-potter
9,4708,BookRags Summary: A Storm of Swords,3.882081,4.090086,fantasy,fiction,magic


#### SVDpp

In [8]:
#Predict users top_ten books
top_books = top_ten_2genre_prediction("1234", top_100_SVDpp, books)
top_books.reset_index(inplace=True)
print([x for x in top_books.title][:10])
print("Top predicted books for user id", "1234")
top_books[['book_id', 'title','predicted_rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_add['predicted_rating'] = eval(book_id)[1]


['Words of Radiance (The Stormlight Archive, #2)', 'Lonesome Dove', 'Homegoing', 'The Poisonwood Bible', 'The Complete Maus (Maus, #1-2)', 'The Harry Potter Collection 1-4 (Harry Potter, #1-4)', 'The Way of Kings, Part 1 (The Stormlight Archive #1.1)', 'Roots: The Saga of an American Family', "Fool's Quest  (The Fitz and The Fool, #2)", 'Musashi']
Top predicted books for user id 1234


Unnamed: 0,book_id,title,predicted_rating,bayesian_avg,tag_1,tag_2,tag_3
0,862,"Words of Radiance (The Stormlight Archive, #2)",4.048593,4.46031,fantasy,fiction,magic
1,757,Lonesome Dove,4.046664,4.301913,fiction,historical-fiction,classic
2,2840,Homegoing,3.979052,4.113482,historical-fiction,fiction,historical
3,100,The Poisonwood Bible,3.964556,3.977684,historical-fiction,fiction,classic
4,1380,"The Complete Maus (Maus, #1-2)",3.950189,4.274385,nonfiction,historical-fiction,fiction
5,2101,"The Harry Potter Collection 1-4 (Harry Potter,...",3.88278,4.242828,fantasy,harry-potter,fiction
6,9141,"The Way of Kings, Part 1 (The Stormlight Archi...",3.854665,4.082211,fantasy,fiction,magic
7,734,Roots: The Saga of an American Family,3.835154,4.247309,historical-fiction,fiction,classic
8,6228,"Fool's Quest (The Fitz and The Fool, #2)",3.824983,4.082711,fantasy,fiction,magic
9,7173,Musashi,3.79421,3.989245,fiction,historical-fiction,classic


### Real Ratings

In [9]:
real_ratings = ratings[ratings.user_id == 1234]
real_ratings = pd.merge(real_ratings, books, on='book_id')
real_ratings = real_ratings.sort_values(['rating','bayesian_avg'], ascending=[False, False])
print(get_top_genres(1234))
real_ratings[['book_id', 'title','rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']].head(10)

['fiction', 'science-fiction', 'sci-fi', 'fantasy', 'historical-fiction']


Unnamed: 0,book_id,title,rating,bayesian_avg,tag_1,tag_2,tag_3
69,129,One Flew Over the Cuckoo's Nest,5,4.118052,classic,fiction,classic
107,516,The Amazing Adventures of Kavalier & Clay,5,4.019645,fiction,historical-fiction,historical
121,32,Of Mice and Men,5,3.847676,classic,fiction,classic
70,168,The Stand,4,4.223891,horror,fiction,fantasy
110,67,A Thousand Splendid Suns,4,4.194937,fiction,historical-fiction,contemporary
87,545,The Velveteen Rabbit,4,4.181946,childrens,classic,fiction
114,10,Pride and Prejudice,4,4.174904,classic,fiction,romance
111,11,The Kite Runner,4,4.157428,fiction,historical-fiction,contemporary
49,265,A Tree Grows in Brooklyn,4,4.156616,classic,historical-fiction,fiction
91,409,Fried Green Tomatoes at the Whistle Stop Cafe,4,4.134777,fiction,historical-fiction,chick lit


Based on these recommendations, it seems that they are fairly similar. 

### Another user in database

In [10]:
#Predict users top_ten books
top_books = top_ten_2genre_prediction("15", top_100_KNNBaseline, books)
top_books.reset_index(inplace=True)
# print([x for x in top_books.title])
print("Top predicted books for user id", "15")
top_books[['book_id', 'title','predicted_rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_add['predicted_rating'] = eval(book_id)[1]


Top predicted books for user id 15


Unnamed: 0,book_id,title,predicted_rating,bayesian_avg,tag_1,tag_2,tag_3
0,862,"Words of Radiance (The Stormlight Archive, #2)",4.777977,4.46031,fantasy,fiction,magic
1,422,"Harry Potter Boxset (Harry Potter, #1-7)",4.653722,4.479617,fiction,fantasy,ya
2,1308,A Court of Mist and Fury (A Court of Thorns an...,4.640607,4.399902,magic,fiction,fantasy
3,9141,"The Way of Kings, Part 1 (The Stormlight Archi...",4.639382,4.082211,fantasy,fiction,magic
4,562,"The Way of Kings (The Stormlight Archive, #1)",4.626071,4.384686,fantasy,fiction,magic
5,1380,"The Complete Maus (Maus, #1-2)",4.624983,4.274385,nonfiction,historical-fiction,fiction
6,3753,"Harry Potter Collection (Harry Potter, #1-6)",4.601205,4.126539,fantasy,fiction,harry-potter
7,307,"The Wise Man's Fear (The Kingkiller Chronicle,...",4.570997,4.388037,fantasy,fiction,magic
8,3241,"Crooked Kingdom (Six of Crows, #2)",4.544693,4.168473,fantasy,young-adult,ya
9,4708,BookRags Summary: A Storm of Swords,4.542916,4.090086,fantasy,fiction,magic


In [11]:
#Predict users top_ten books
top_books = top_ten_2genre_prediction("15", top_10_SVDpp, books)
top_books.reset_index(inplace=True)
# print([x for x in top_books.title])
print("Top predicted books for user id", "15")
top_books[['book_id', 'title','predicted_rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_add['predicted_rating'] = eval(book_id)[1]


Top predicted books for user id 15


Unnamed: 0,book_id,title,predicted_rating,bayesian_avg,tag_1,tag_2,tag_3
0,862,"Words of Radiance (The Stormlight Archive, #2)",4.678422,4.46031,fantasy,fiction,magic
1,9141,"The Way of Kings, Part 1 (The Stormlight Archi...",4.673807,4.082211,fantasy,fiction,magic


In [12]:
real_ratings = ratings[ratings.user_id == 15]
real_ratings = pd.merge(real_ratings, books, on='book_id')
real_ratings = real_ratings.sort_values(['rating','bayesian_avg'], ascending=[False, False])
real_ratings[['book_id', 'title','rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']].head(10)

Unnamed: 0,book_id,title,rating,bayesian_avg,tag_1,tag_2,tag_3
90,25,Harry Potter and the Deathly Hallows (Harry Po...,5,4.502533,fantasy,young-adult,fiction
83,31,The Help,5,4.361369,fiction,historical-fiction,historical
29,337,The Ultimate Hitchhiker's Guide to the Galaxy,5,4.24307,science-fiction,sci-fi,fiction
25,87,Night (The Night Trilogy #1),5,4.23672,non-fiction,classic,history
16,10,Pride and Prejudice,5,4.174904,classic,fiction,romance
6,11,The Kite Runner,5,4.157428,fiction,historical-fiction,contemporary
21,709,Nine Stories,5,4.093035,contemporary,young-adult,
61,36,"The Giver (The Giver, #1)",5,4.058814,fiction,classic,dystopian
71,432,"I, Robot (Robot #0.1)",5,4.057556,sci-fi,fiction,classic
41,241,Number the Stars,5,4.026418,fiction,historical-fiction,classic


## Predict for a user not in the database

KNNBaseline and SVDpp should be the same, since they are showing a generic recommendation. 

#### KNNBaseline

In [13]:
#Predict users top_ten books
top_books = top_ten_genre_prediction("37640", top_10_SVDpp, books)
top_books.reset_index(inplace=True)
# print([x for x in top_books.title])
top_books[['book_id', 'title','predicted_rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']]

User id 37640 not in the database.


Unnamed: 0,book_id,title,predicted_rating,bayesian_avg,tag_1,tag_2,tag_3
0,25,Harry Potter and the Deathly Hallows (Harry Po...,4.502533,4.502533,fantasy,young-adult,fiction
1,422,"Harry Potter Boxset (Harry Potter, #1-7)",4.479617,4.479617,fiction,fantasy,ya
2,862,"Words of Radiance (The Stormlight Archive, #2)",4.46031,4.46031,fantasy,fiction,magic
3,780,Calvin and Hobbes,4.433867,4.433867,comics,humor,graphic-novels
4,27,Harry Potter and the Half-Blood Prince (Harry ...,4.422748,4.422748,fantasy,young-adult,fiction
5,192,The Name of the Wind (The Kingkiller Chronicle...,4.421306,4.421306,fantasy,fiction,magic
6,24,Harry Potter and the Goblet of Fire (Harry Pot...,4.411219,4.411219,fantasy,young-adult,fiction
7,1010,The Essential Calvin and Hobbes: A Calvin and ...,4.409991,4.409991,comics,humor,graphic-novels
8,18,Harry Potter and the Prisoner of Azkaban (Harr...,4.400003,4.400003,fantasy,young-adult,fiction
9,1308,A Court of Mist and Fury (A Court of Thorns an...,4.399902,4.399902,magic,fiction,fantasy


#### SVDpp

In [14]:
#Predict users top_ten books
top_books = top_ten_genre_prediction("37640", top_10_KNNBaseline, books)
top_books.reset_index(inplace=True)
# print([x for x in top_books.title])
top_books[['book_id', 'title','predicted_rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']]

User id 37640 not in the database.


Unnamed: 0,book_id,title,predicted_rating,bayesian_avg,tag_1,tag_2,tag_3
0,25,Harry Potter and the Deathly Hallows (Harry Po...,4.502533,4.502533,fantasy,young-adult,fiction
1,422,"Harry Potter Boxset (Harry Potter, #1-7)",4.479617,4.479617,fiction,fantasy,ya
2,862,"Words of Radiance (The Stormlight Archive, #2)",4.46031,4.46031,fantasy,fiction,magic
3,780,Calvin and Hobbes,4.433867,4.433867,comics,humor,graphic-novels
4,27,Harry Potter and the Half-Blood Prince (Harry ...,4.422748,4.422748,fantasy,young-adult,fiction
5,192,The Name of the Wind (The Kingkiller Chronicle...,4.421306,4.421306,fantasy,fiction,magic
6,24,Harry Potter and the Goblet of Fire (Harry Pot...,4.411219,4.411219,fantasy,young-adult,fiction
7,1010,The Essential Calvin and Hobbes: A Calvin and ...,4.409991,4.409991,comics,humor,graphic-novels
8,18,Harry Potter and the Prisoner of Azkaban (Harr...,4.400003,4.400003,fantasy,young-adult,fiction
9,1308,A Court of Mist and Fury (A Court of Thorns an...,4.399902,4.399902,magic,fiction,fantasy


### Real Ratings for that user

In [15]:
real_ratings = ratings[ratings.user_id == 37640]
real_ratings = pd.merge(real_ratings, books, on='book_id')
real_ratings = real_ratings.sort_values(['rating','bayesian_avg'], ascending=[False, False])
real_ratings[['book_id', 'title','rating', 'bayesian_avg', 'tag_1', 'tag_2', 'tag_3']].head(10)

Unnamed: 0,book_id,title,rating,bayesian_avg,tag_1,tag_2,tag_3
14,109,Les Misérables,5,4.095763,classic,fiction,historical-fiction
4,125,Hamlet,5,3.995079,classic,plays,fiction
8,2033,عزازيل,5,3.969736,fiction,historical-fiction,historical
1,7823,عائد إلى حيفا,5,3.947721,fiction,historical-fiction,classic
0,1887,Cyrano de Bergerac,5,3.944277,classic,plays,fiction
21,8748,رأيت رام الله,5,3.9263,biography,fiction,history
18,461,"The Good Earth (House of Earth, #1)",5,3.913542,fiction,young-adult,
5,353,Othello,5,3.90306,classic,plays,fiction
2,154,Macbeth,5,3.898857,plays,fiction,classic
15,778,The Hunchback of Notre-Dame,5,3.86871,classic,fiction,classic


### Rating statistics for the KNNBaseline model

In [16]:
%%time 
dataframe_lengths = []
for uid in top_10_KNNBaseline.columns: 
    dataframe_lengths.append(top_ten_2genre_prediction(uid, top_100_KNNBaseline, books).shape[0])
print(sum(dataframe_lengths)/len(dataframe_lengths))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_add['predicted_rating'] = eval(book_id)[1]


22.955464200068516
Wall time: 15min 54s


In [17]:
mean = sum(dataframe_lengths) / len(dataframe_lengths)
variance = sum([((x - mean) ** 2) for x in dataframe_lengths]) / len(dataframe_lengths)
res = variance ** 0.5

print("The maximum recommendations for a given user is {} and the minimum is {}".format(max(dataframe_lengths), min(dataframe_lengths)))
print("The average number of recommendations is {:.2f}".format(mean))
print("The standard deviation is {:.2f}".format(res))

The maximum recommendations for a given user is 70 and the minimum is 0
The average number of recommendations is 22.96
The standard deviation is 12.91


### Rating statistics for the SVDpp model

In [18]:
%%time 
dataframe_lengths_SVDpp=[]
for uid in top_10_KNNBaseline.columns: 
    dataframe_lengths_SVDpp.append(top_ten_2genre_prediction(uid, top_100_SVDpp, books).shape[0])
print(sum(dataframe_lengths)/len(dataframe_lengths))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_add['predicted_rating'] = eval(book_id)[1]


22.955464200068516
Wall time: 16min 40s


In [19]:
mean = sum(dataframe_lengths_SVDpp) / len(dataframe_lengths_SVDpp)
variance = sum([((x - mean) ** 2) for x in dataframe_lengths_SVDpp]) / len(dataframe_lengths_SVDpp)
res = variance ** 0.5

print("The maximum recommendations for a given user is {} and the minimum is {}".format(max(dataframe_lengths_SVDpp), min(dataframe_lengths_SVDpp)))
print("The average number of recommendations is {:.2f}".format(mean))
print("The standard deviation is {:.2f}".format(res))

The maximum recommendations for a given user is 75 and the minimum is 0
The average number of recommendations is 26.50
The standard deviation is 14.72
