In [1]:
import os
import zipfile
import csv

import requests


def _download(url: str, dest_path: str):

    req = requests.get(url, stream=True)
    req.raise_for_status()

    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)


def get_data():

    ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")

    if not os.path.exists("data"):
        os.makedirs("data")

        _download(ratings_url, "data/data.zip")

    with zipfile.ZipFile("data/data.zip") as archive:
        return (
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
            ),
        )


def get_ratings():

    return get_data()[0]


def get_book_features():

    return get_data()[1]

In [2]:
import json
from itertools import islice

ratings, book_features = get_data()

In [3]:
for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

{
    "User-ID": "276725",
    "ISBN": "034545104X",
    "Book-Rating": "0"
}
{
    "User-ID": "276726",
    "ISBN": "0155061224",
    "Book-Rating": "5"
}


In [4]:
for line in islice(book_features, 1):
    print(json.dumps(line, indent=4))

{
    "ISBN": "0195153448",
    "Book-Title": "Classical Mythology",
    "Book-Author": "Mark P. O. Morford",
    "Year-Of-Publication": "2002",
    "Publisher": "Oxford University Press",
    "Image-URL-S": "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg",
    "Image-URL-M": "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg",
    "Image-URL-L": "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
}


In [12]:
df = [x for x in get_ratings()]
df[0]

OrderedDict([('User-ID', '276725'),
             ('ISBN', '034545104X'),
             ('Book-Rating', '0')])

In [13]:
df = [x for x in get_book_features()]
df[0]

OrderedDict([('ISBN', '0195153448'),
             ('Book-Title', 'Classical Mythology'),
             ('Book-Author', 'Mark P. O. Morford'),
             ('Year-Of-Publication', '2002'),
             ('Publisher', 'Oxford University Press'),
             ('Image-URL-S',
              'http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg'),
             ('Image-URL-M',
              'http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg'),
             ('Image-URL-L',
              'http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg')])

In [5]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))



In [16]:
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))

In [11]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 105283, num_items 340553.


In [14]:
(interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
                                                      for x in get_ratings()))

In [17]:
item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']])
                                              for x in get_book_features()))
print(repr(item_features))

<341762x443805 sparse matrix of type '<class 'numpy.float32'>'
	with 613141 stored elements in Compressed Sparse Row format>


In [22]:
len(set([x['ISBN'] for x in df]))

271379

# ARRRRRR

In [23]:
import numpy as np

from lightfm.datasets import fetch_stackexchange

data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

train = data['train']
test = data['test']

In [24]:
item_features = data['item_features']
tag_labels = data['item_feature_labels']

In [28]:
train.shape, test.shape, item_features.shape, tag_labels.shape

((3221, 72360), (3221, 72360), (72360, 1246), (1246,))

In [36]:
itf = item_features.toarray()
np.unique(itf)

array([0., 1.], dtype=float32)

In [40]:
tden = train.toarray()
np.unique(tden)

array([0., 1., 2., 3., 4., 5., 6., 8.], dtype=float32)

In [42]:
from lightfm import LightFM

NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Define a new model instance
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model = model.fit(train,
                item_features=item_features,
                epochs=NUM_EPOCHS,
                num_threads=NUM_THREADS)

In [44]:
from lightfm.evaluation import auc_score

In [45]:
# Don't forget the pass in the item features again!
train_auc = auc_score(model,
                      train,
                      item_features=item_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)

Hybrid training set AUC: 0.85741806


In [47]:
test_auc = auc_score(model,
                    test,
                    train_interactions=train,
                    item_features=item_features,
                    num_threads=NUM_THREADS, check_intersections=False).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid test set AUC: 0.71975696


# MYTASK

In [48]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
from scipy.sparse import csr_matrix

from lightfm.cross_validation import random_train_test_split
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

In [49]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
df_item = pd.read_csv('input/item-features.csv')
df_user = pd.read_csv('input/user-features.csv')
subm = pd.read_csv('input/sample-submission.csv')

In [50]:
df_item = df_item.drop(columns=['19','27','30','9'])
df_item = df_item.sort_values('item_id').reset_index(drop=True)

df_user = df_user.iloc[:, :2]
df_user = df_user.sort_values('user_id').reset_index(drop=True)

In [648]:
interactions = csr_matrix((train['like'].map({0:-1, 1:1}).values, (train['user_id'] , train['item_id'])))
interactions.shape

(497, 444)

In [135]:
train_inter, test_inter = random_train_test_split(interactions, test_percentage=0.2, 
                                                  random_state=np.random.RandomState(11))

In [593]:
item_features = csr_matrix((df_item.iloc[:, 1:]))
item_features.shape

(444, 28)

In [597]:
user_features = csr_matrix((df_user.iloc[:, [1]]))
user_features.shape

(497, 1)

In [598]:
from sklearn.preprocessing import MinMaxScaler

user_features = csr_matrix(MinMaxScaler().fit_transform(df_user.iloc[:, [0]]))

  return self.partial_fit(X, y)


In [589]:
NUM_THREADS = 4
NUM_COMPONENTS = 60
NUM_EPOCHS = 30
ITEM_ALPHA = 1e-3
USER_ALPHA = 1e-5

# Define a new model instance
model = LightFM(loss='warp',
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS, random_state=50)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model = model.fit(interactions,
#                 user_features=user_features,
#                 item_features=item_features,
                epochs=NUM_EPOCHS,
                num_threads=NUM_THREADS)

In [590]:
# Don't forget the pass in the item features again!
train_score = precision_at_k(model,
                          train_inter,
#                           user_features=user_features,
#                           item_features=item_features,
                          num_threads=NUM_THREADS, k=3).mean()
print('Hybrid precision_at_k: %s' % train_score)  # 0.16307846

Hybrid precision_at_k: 0.5103957


In [591]:
# Don't forget the pass in the item features again!
val_score = precision_at_k(model,
                           test_inter,
                           train_interactions=train_inter,
#                            user_features=user_features,
#                            item_features=item_features,
                           num_threads=NUM_THREADS, k=3).mean()
print('Hybrid precision_at_k: %s' % val_score)  # 0.049079753

Hybrid precision_at_k: 0.21813224


In [649]:
# item_features = csr_matrix((df_item.iloc[:, 1:]))
# item_features.shape

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
item_features = csr_matrix(scaler.fit_transform(df_item.iloc[:, 1:]))

In [651]:
NUM_THREADS = 4
NUM_COMPONENTS = 60
NUM_EPOCHS = 30
ITEM_ALPHA = 1e-3
USER_ALPHA = 3e-5
MAX_SAMPLED = 10

res_score = []
for i in range(10):
    train_inter, test_inter = random_train_test_split(interactions, test_percentage=0.2, 
                                                      random_state=np.random.RandomState(i))
    
    model = LightFM(loss='warp', user_alpha=USER_ALPHA, item_alpha=ITEM_ALPHA, max_sampled=MAX_SAMPLED,
                    no_components=NUM_COMPONENTS, random_state=50, learning_rate=0.03)

    model = model.fit(train_inter,
#                       user_features=user_features,
#                       item_features=item_features,
                      epochs=NUM_EPOCHS,
                      num_threads=NUM_THREADS)
    
    val_score = precision_at_k(model,
                               test_inter,
                               train_interactions=train_inter,
#                                user_features=user_features,
#                                item_features=item_features,
                               num_threads=NUM_THREADS, k=3).mean()
    res_score.append(val_score)
print('Hybrid precision_at_k: %s' % np.mean(res_score))

Hybrid precision_at_k: 0.05200963


NUM_THREADS = 4
NUM_COMPONENTS = 60
NUM_EPOCHS = 30
ITEM_ALPHA = 1e-3
USER_ALPHA = 3e-5
MAX_SAMPLED = 10
Hybrid precision_at_k: 0.05200963

+ user_feat 1
Hybrid precision_at_k: 0.0427072

+ user_feat 0
Hybrid precision_at_k: 0.045940086

+ item_feat
Hybrid precision_at_k: 0.030538598