In [1]:
import pandas as pd
import numpy as np
import os
import zipfile
import csv
import requests
import json
from itertools import islice
from lightfm.data import Dataset
from lightfm import LightFM
data_url = "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/"
from lightfm.evaluation import auc_score

In [3]:
def _download(url: str, dest_path: str):

    req = requests.get(url, stream=True)
    req.raise_for_status()

    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)


def get_data(string):

    dat_string =  string + ".csv"
    ratings_url = data_url + dat_string
    dest_path = "data/" + string + ".csv"

    if not os.path.exists("data"):
        os.makedirs("data")
        
        _download(ratings_url, dest_path)
    else:
        if not os.path.exists(dest_path):
            _download(ratings_url, dest_path)
    
    with open(dest_path, "r") as dat:
        dat = [x for x in csv.DictReader(dat)] 
        return dat


def get_ratings():

    return get_data("ratings")

def get_book_features():

    return get_data("books")

In [5]:
ratings, book_features = get_ratings(), get_book_features()

In [7]:
for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

{
    "user_id": "1",
    "book_id": "258",
    "rating": "5"
}
{
    "user_id": "2",
    "book_id": "4081",
    "rating": "4"
}


In [9]:
for line in islice(book_features, 1):
    print(json.dumps(line, indent=4))

{
    "book_id": "1",
    "goodreads_book_id": "2767052",
    "best_book_id": "2767052",
    "work_id": "2792775",
    "books_count": "272",
    "isbn": "439023483",
    "isbn13": "9.78043902348e+12",
    "authors": "Suzanne Collins",
    "original_publication_year": "2008.0",
    "original_title": "The Hunger Games",
    "title": "The Hunger Games (The Hunger Games, #1)",
    "language_code": "eng",
    "average_rating": "4.34",
    "ratings_count": "4780653",
    "work_ratings_count": "4942365",
    "work_text_reviews_count": "155254",
    "ratings_1": "66715",
    "ratings_2": "127936",
    "ratings_3": "560092",
    "ratings_4": "1481305",
    "ratings_5": "2706317",
    "image_url": "https://images.gr-assets.com/books/1447303603m/2767052.jpg",
    "small_image_url": "https://images.gr-assets.com/books/1447303603s/2767052.jpg"
}


In [11]:
dataset = Dataset()
dataset.fit((x['user_id'] for x in ratings),
            (x['book_id'] for x in ratings),
            (x['rating'] for x in ratings))

In [13]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 53424, num_items 10000.


In [15]:
dataset.fit_partial(items=(x['book_id'] for x in book_features),
                    item_features=(x['authors'] for x in book_features))

In [17]:
(interactions, weights) = dataset.build_interactions((x['user_id'], x['book_id'], int(x['rating'])) 
                                                      for x in ratings)

In [19]:
print(repr(interactions))

<53424x10000 sparse matrix of type '<class 'numpy.int32'>'
	with 5976479 stored elements in COOrdinate format>


In [23]:
item_features = dataset.build_item_features(((x['book_id']), [x['authors']])
                                              for x in book_features)

In [25]:
print(repr(item_features))

<10000x14664 sparse matrix of type '<class 'numpy.float32'>'
	with 20000 stored elements in Compressed Sparse Row format>


In [27]:
mapping = dataset.mapping()

def inv_dict(dicti):
    return dict(zip(dicti.values(), dicti.keys()))
    
inv_mapping = [inv_dict(x) for x in mapping]

In [110]:
model = LightFM(no_components = 16, loss='warp')
model.fit(interactions, item_features = item_features, sample_weight = weights)

<lightfm.lightfm.LightFM at 0x7f2a00f58190>

In [112]:
train_auc = auc_score(model, interactions, item_features = item_features).mean()
print('AUC: train %.2f' % (train_auc))

AUC: train 0.92


In [114]:
def get_predictions(user_ids, model, weights, item_features):
    num_users, num_items = weights.shape

    weight = weights.tocsr()
        
    for user_id in user_ids:
        user_id = str(user_id)
        in_map = mapping[0][user_id]
    
    
        user_row = weights.tocsr()[in_map, 0:].toarray()[0,]
        scores = model.predict(mapping[0][user_id], np.arange(num_items))
        known_read = [book_features[int(inv_mapping[2][x])-1]['title'] for x in range(num_items) if weight[in_map ,x] > 0]
        top_items = [book_features[int(inv_mapping[2][x])-1]['title'] for x in np.argsort(-scores)] 

        known_read_set = set(known_read)
    
        print(f"User {user_id}")
        print("     Known positives:")

        for x in known_read[:5] :
            print(f"        {x}")
    
        print("     Recommended:")

        count = 0
        for x in top_items:
            if x not in known_read_set:
                print(f"        {x}")
                count+=1
            if count == 5:
                break

In [116]:
get_predictions([90, 92], model, weights, item_features)

User 90
     Known positives:
        Ender's Game (Ender's Saga, #1)
        The Sun Also Rises
        The Alchemist
        The Adventures of Huckleberry Finn
        The Catcher in the Rye
     Recommended:
        Little Women (Little Women, #1)
        The Great Gatsby
        Slaughterhouse-Five
        Animal Farm
        My Sister's Keeper
User 92
     Known positives:
        Ender's Game (Ender's Saga, #1)
        The Alchemist
        Slaughterhouse-Five
        Holy Cow: An Indian Adventure
        Speaker for the Dead (Ender's Saga, #2)
     Recommended:
        Brave New World
        Animal Farm
        Dune (Dune Chronicles #1)
        Good Omens: The Nice and Accurate Prophecies of Agnes Nutter, Witch
        A Game of Thrones (A Song of Ice and Fire, #1)


In [81]:
for i in book_features[:10]:
    print(i['title'])

The Hunger Games (The Hunger Games, #1)
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
Twilight (Twilight, #1)
To Kill a Mockingbird
The Great Gatsby
The Fault in Our Stars
The Hobbit
The Catcher in the Rye
Angels & Demons  (Robert Langdon, #1)
Pride and Prejudice
