In [1]:
import pandas as pd
import numpy as np
import os
import csv
import requests
import json
import scipy.sparse as sparse
from itertools import islice
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score

data_url = "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/"

In [3]:
def _download(url: str, dest_path: str):

    req = requests.get(url, stream=True)
    req.raise_for_status()

    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)


def get_data(string):

    dat_string =  string + ".csv"
    ratings_url = data_url + dat_string
    dest_path = "data/" + string + ".csv"

    if not os.path.exists("data"):
        os.makedirs("data")
        
        _download(ratings_url, dest_path)
    else:
        if not os.path.exists(dest_path):
            _download(ratings_url, dest_path)
    
    with open(dest_path, "r") as dat:
        dat = [x for x in csv.DictReader(dat)] 
        return dat


def get_ratings():

    return get_data("ratings")

def get_book_features():

    return get_data("books")

In [5]:
ratings, book_features = get_ratings(), get_book_features()

In [7]:
for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

{
    "user_id": "1",
    "book_id": "258",
    "rating": "5"
}
{
    "user_id": "2",
    "book_id": "4081",
    "rating": "4"
}


In [9]:
for line in islice(book_features, 1):
    print(json.dumps(line, indent=4))

{
    "book_id": "1",
    "goodreads_book_id": "2767052",
    "best_book_id": "2767052",
    "work_id": "2792775",
    "books_count": "272",
    "isbn": "439023483",
    "isbn13": "9.78043902348e+12",
    "authors": "Suzanne Collins",
    "original_publication_year": "2008.0",
    "original_title": "The Hunger Games",
    "title": "The Hunger Games (The Hunger Games, #1)",
    "language_code": "eng",
    "average_rating": "4.34",
    "ratings_count": "4780653",
    "work_ratings_count": "4942365",
    "work_text_reviews_count": "155254",
    "ratings_1": "66715",
    "ratings_2": "127936",
    "ratings_3": "560092",
    "ratings_4": "1481305",
    "ratings_5": "2706317",
    "image_url": "https://images.gr-assets.com/books/1447303603m/2767052.jpg",
    "small_image_url": "https://images.gr-assets.com/books/1447303603s/2767052.jpg"
}


In [131]:
title_list = {"book_id" : [x['book_id'] for x in book_features], 
              "title" : [x['title'] for x in book_features], 
              "img" : [x['image_url'] for x in book_features]}
title_frame = pd.DataFrame(data = title_list)

def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def path_to_image_html(path):
    return '<img src="'+ path + '" width="50" >'

In [171]:
def string_list(substring):
    substring = substring.lower()
    seen = []
    for item in title_frame.itertuples(index = False):
        if (item.title.lower()).find(substring) == -1:
            continue
        else: 
            seen.append(item)
            
    return pd.DataFrame(data = seen)

def ask_rating(user_id):
    while True:
        book_name = input("Enter book to rate (You could enter a substring):\n")
        
        if len(book_name) == 0:
            return None
            
        elif len(book_name) < 4:
            print("Please enter at least 4 characters!\n")
            continue
        
        search_list = string_list(book_name)
        
        if search_list.empty == True:
            print("...")
            continue

        display_data = search_list.loc[:, search_list.columns != 'book_id']
        display(display_data.head(10).style.format({"img": path_to_image_html}))
        
        while True:
            
            try:
                index_choice = int(input("Choose index of book to rate:\n"))

            except:
                print("Impossible Index!")
                continue

            row = search_list.iloc[int(index_choice)]
            book_id, title = row.loc['book_id'], row['title']
            
            print(f"{title}\n")
            rating = input("Enter rating as a number from 1-5\n")
                
            if rating.isdigit() == False:
                print("Enter an integer!\n")
                    
            elif int(rating) > 5 or int(rating) < 1:
                print("Please enter valid rating")
                rating = input("Enter rating as a number from 1-5\n")
                    
            return {'user_id': user_id, 'book_id': book_id, 'rating': int(rating)}

In [141]:
def scale_weights(weights, interactions):
    
    count_treat = interactions.sum(axis = 0)[0,]
    C = np.squeeze(np.asarray(count_treat))
    
    inv_pscore = np.power(C, -(0.45)*np.ones(len(C)))
    adj_score_mat = weights.multiply(sparse.csr_array(inv_pscore))
    mx_wgt = adj_score_mat.max()
    max_norm_inv = np.power(mx_wgt, -1)
    
    scaled_weights = adj_score_mat.tocoo()*(max_norm_inv*5)
    return scaled_weights

In [167]:
curr_ratings = []
dataset = Dataset()

def rating_store(curr_ratings):
    while True:
        print("Hit enter without text in the book-input to exit rating process!\n")
        user_input = ask_rating('-1')
        if user_input != None:
            curr_ratings.append(user_input)
        else:
            break

rating_store(curr_ratings)

if len(curr_ratings) == 0:
    print("Please enter some ratings! You have entered no ratings\n")
    val = input("Or please confirm you wish to not enter anymore ratings. Y/n:\n")
    if val.lower() == 'n':
        rating_store(curr_ratings)
        
dataset.fit((x['user_id'] for x in ratings+curr_ratings),
            (x['book_id'] for x in ratings+curr_ratings),
            (x['rating'] for x in ratings+curr_ratings))   

Hit enter without text in the book-input to exit rating process!



Enter book to rate (You could enter a substring):
 harry


Unnamed: 0,title,img
0,"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
1,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",
2,"Harry Potter and the Order of the Phoenix (Harry Potter, #5)",
3,"Harry Potter and the Chamber of Secrets (Harry Potter, #2)",
4,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",
5,"Harry Potter and the Deathly Hallows (Harry Potter, #7)",
6,"Harry Potter and the Half-Blood Prince (Harry Potter, #6)",
7,"Harry Potter and the Cursed Child - Parts One and Two (Harry Potter, #8)",
8,"Harry Potter Boxset (Harry Potter, #1-7)",
9,"The Lincoln Lawyer (Mickey Haller, #1; Harry Bosch Universe, #16)",


Choose index of book to rate:
 


Impossible Index!


Choose index of book to rate:
 5


Harry Potter and the Deathly Hallows (Harry Potter, #7)



Enter rating as a number from 1-5
 5


Hit enter without text in the book-input to exit rating process!



Enter book to rate (You could enter a substring):
 


In [168]:
curr_ratings

[{'user_id': '-1', 'book_id': '25', 'rating': 5}]

In [334]:
dataset.fit_partial(items=(x['book_id'] for x in book_features),
                    item_features=(x['authors'] for x in book_features))

In [337]:
item_features = dataset.build_item_features(((x['book_id']), [x['authors']])
                                              for x in book_features)

In [339]:
len(curr_ratings)

0

In [341]:
(interactions, weights) = dataset.build_interactions((x['user_id'], x['book_id'], int(x['rating'])) 
                                                      for x in ratings+curr_ratings)

In [343]:
weights = weights.tocsr().tocoo()
interactions = interactions.tocsr().tocoo()

In [345]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 53424, num_items 10000.


In [347]:
print(repr(interactions))

<53424x10000 sparse matrix of type '<class 'numpy.int32'>'
	with 5976479 stored elements in COOrdinate format>


In [349]:
scaled_weights = scale_weights(weights, interactions)

In [351]:
mapping = dataset.mapping()

def inv_dict(dicti):
    return dict(zip(dicti.values(), dicti.keys()))
    
inv_mapping = [inv_dict(x) for x in mapping]

In [353]:
model = LightFM(no_components = 20, loss='warp')
%time model.fit(interactions, item_features = item_features, sample_weight = scaled_weights, epochs = 25, num_threads = 7)

CPU times: user 4min 51s, sys: 2.86 s, total: 4min 54s
Wall time: 42.9 s


<lightfm.lightfm.LightFM at 0x7f57c9700a90>

In [355]:
def get_predictions(user_ids, model, weights, item_features):
    num_users, num_items = weights.shape

    weight = weights.tocsr()
        
    for user_id in user_ids:
        user_id = str(user_id)
        in_map = mapping[0][user_id]
    
    
        user_row = weights.tocsr()[in_map, 0:].toarray()[0,]
        scores = model.predict(mapping[0][user_id], np.arange(num_items))
        known_read = [(book_features[int(inv_mapping[2][x])-1]['title'], weight[in_map ,x]) for x in range(num_items) if weight[in_map ,x] > 0]
        top_items = [book_features[int(inv_mapping[2][x])-1]['title'] for x in np.argsort(-scores)] 

        known_read.sort(key=lambda tup: tup[1], reverse = True)
        known_reads = [x[0] for x in known_read]
        known_read_set = set(known_reads)
    
        print(f"User {user_id}")
        print("     Known positives:")

        for x, y in known_read[:7]:
            print(f"        {x}, rated: {y}")
    
        print("     Recommended:")

        count = 0
        for x in top_items:
            if x not in known_read_set:
                print(f"        {x}")
                count+=1
            if count == 6:
                break

In [375]:
get_predictions([9], model, weights, item_features)

User 9
     Known positives:
        The Shadow of the Wind (The Cemetery of Forgotten Books,  #1), rated: 5.0
        Harry Potter and the Prisoner of Azkaban (Harry Potter, #3), rated: 5.0
        The Great Gatsby, rated: 5.0
        1984, rated: 5.0
        Me Talk Pretty One Day, rated: 5.0
        Holidays on Ice, rated: 5.0
        Atonement, rated: 5.0
     Recommended:
        Ender's Game (Ender's Saga, #1)
        The Lion, the Witch, and the Wardrobe (Chronicles of Narnia, #1)
        Little Women (Little Women, #1)
        To Kill a Mockingbird
        Divergent (Divergent, #1)
        The Lightning Thief (Percy Jackson and the Olympians, #1)


In [59]:
for i in book_features[:10]:
    print(i['title'])

The Hunger Games (The Hunger Games, #1)
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
Twilight (Twilight, #1)
To Kill a Mockingbird
The Great Gatsby
The Fault in Our Stars
The Hobbit
The Catcher in the Rye
Angels & Demons  (Robert Langdon, #1)
Pride and Prejudice


In [135]:
train_auc = auc_score(model, interactions, item_features = item_features).mean()
print('AUC: train %.2f' % (train_auc))

AUC: train 0.96
