# Example

In [1]:
import os
import zipfile
import csv

import requests


urls = {
    "books": "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/6dd165b555a7b47b2dd36743a425776e641ff50c/books.csv",
    "ratings": "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/6dd165b555a7b47b2dd36743a425776e641ff50c/ratings.csv",
    "book_tags": "https://github.com/zygmuntz/goodbooks-10k/blob/6dd165b555a7b47b2dd36743a425776e641ff50c/book_tags.csv",
    "tags": "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/6dd165b555a7b47b2dd36743a425776e641ff50c/tags.csv",
    "to_read": "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/6dd165b555a7b47b2dd36743a425776e641ff50c/to_read.csv",
}

def _download(url: str, dest_path: str):
    req = requests.get(url, stream=True)
    req.raise_for_status()

    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)


def get_data():
    if not os.path.exists("data"):
        os.makedirs("data")

        for name, url in urls.items():
            _download(url, f"data/{name}.csv")
        
    with open("data/ratings.csv", mode='r', encoding='utf-8') as fp_r:
        with open("data/books.csv", mode='r', encoding='utf-8') as fp_b:
                return (
            csv.DictReader(
                fp_r.readlines(), delimiter=",",
            ),
            csv.DictReader(
                fp_b.readlines(), delimiter=","
            ),
        )



def get_ratings():
    return get_data()[0]


def get_book_features():
    return get_data()[1]

In [2]:
import json
from itertools import islice

ratings, book_features = get_data()

for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

for line in islice(book_features, 1):
    print(json.dumps(line, indent=4))

{
    "user_id": "1",
    "book_id": "258",
    "rating": "5"
}
{
    "user_id": "2",
    "book_id": "4081",
    "rating": "4"
}
{
    "book_id": "1",
    "goodreads_book_id": "2767052",
    "best_book_id": "2767052",
    "work_id": "2792775",
    "books_count": "272",
    "isbn": "439023483",
    "isbn13": "9.78043902348e+12",
    "authors": "Suzanne Collins",
    "original_publication_year": "2008.0",
    "original_title": "The Hunger Games",
    "title": "The Hunger Games (The Hunger Games, #1)",
    "language_code": "eng",
    "average_rating": "4.34",
    "ratings_count": "4780653",
    "work_ratings_count": "4942365",
    "work_text_reviews_count": "155254",
    "ratings_1": "66715",
    "ratings_2": "127936",
    "ratings_3": "560092",
    "ratings_4": "1481305",
    "ratings_5": "2706317",
    "image_url": "https://images.gr-assets.com/books/1447303603m/2767052.jpg",
    "small_image_url": "https://images.gr-assets.com/books/1447303603s/2767052.jpg"
}


In [3]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x['user_id'] for x in get_ratings()),
            (x['book_id'] for x in get_ratings()))

dataset.fit_partial(items=(x['book_id'] for x in get_book_features()),
                    item_features=(x['authors'] for x in get_book_features()))

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 53424, num_items 10000.


In [4]:
(interactions, weights) = dataset.build_interactions(((x['user_id'], x['book_id'])
                                                      for x in get_ratings()))

print(repr(interactions))

<53424x10000 sparse matrix of type '<class 'numpy.int32'>'
	with 5976479 stored elements in COOrdinate format>


In [5]:
item_features = dataset.build_item_features(((x['book_id'], [x['authors']])
                                              for x in get_book_features()))
print(repr(item_features))

<10000x14664 sparse matrix of type '<class 'numpy.float32'>'
	with 20000 stored elements in Compressed Sparse Row format>


In [6]:
f = get_book_features()

In [7]:
print(next(f)['authors'])


Suzanne Collins


In [8]:
from lightfm import LightFM

model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features, num_threads=12)

<lightfm.lightfm.LightFM at 0x7f8fc04728f0>

In [9]:
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split

(train, test) = random_train_test_split(interactions)

train_auc = auc_score(model, train, item_features=item_features, num_threads=12).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

test_auc = auc_score(model, test, item_features=item_features, train_interactions=train, num_threads=12).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering train AUC: 0.82427704
Collaborative filtering test AUC: 0.8249212


# 1. Build LightFM Dataset