In [292]:
import numpy as np
import scipy as sp
from typing import Optional

import os
import time
from functools import reduce
from tqdm import tqdm
from pathlib import Path

from lightfm import LightFM
import pandas as pd
from collections import defaultdict
from scipy.sparse import csr_matrix

In [10]:
os.environ['OPENBLAS_NUM_THREADS'] = "1"
datetime_col = 'last_watch_dt'
DATA_PATH = Path("../data/kion_train/")

In [11]:
%%time
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 2.2 s, sys: 325 ms, total: 2.53 s
Wall time: 2.55 s


In [13]:
interactions[datetime_col] = pd.to_datetime(interactions[datetime_col], format='%Y-%m-%d')
interactions.dropna(inplace=True)

In [16]:
interactions['watched'] = pd.cut(
    x=interactions['watched_pct'],
    bins=5,
    labels=[1, 2, 3, 4, 5]
)

In [197]:
items.fillna('Unknown', inplace=True)

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
items["actors"] = items["actors"].str.lower().str.replace(", ", ",", regex=False).str.split(",")

genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

actors_feature = items[["item_id", "actors"]].explode("actors")
actors_feature.columns = ["id", "value"]
actors_feature["feature"] = "actors"


content_feature = items.reindex(columns=['item_id', "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

country_feature = items.reindex(columns=['item_id', "countries"])
country_feature.columns = ["id", "value"]
country_feature["feature"] = "countries"

age_feature = items.reindex(columns=['item_id', "age_rating"])
age_feature.columns = ["id", "value"]
age_feature["feature"] = "age_feature"

studios_feature = items.reindex(columns=['item_id', "studios"])
studios_feature.columns = ["id", "value"]
studios_feature["feature"] = "studios"


genre_feature_bin = pd.get_dummies(genre_feature[['id', 'value']]).groupby('id', as_index=False).sum()
content_feature_bin = pd.get_dummies(content_feature[['id', 'value']]).groupby('id', as_index=False).sum()
studios_feature_bin = pd.get_dummies(studios_feature[['id', 'value']]).groupby('id', as_index=False).sum()
age_feature_bin = pd.get_dummies(age_feature[['id', 'value']]).groupby('id', as_index=False).sum()
country_feature_bin = pd.get_dummies(country_feature[['id', 'value']]).groupby('id', as_index=False).sum()

dfs = [genre_feature_bin, content_feature_bin, studios_feature_bin, age_feature_bin, country_feature_bin]
item_final_features = reduce(lambda left,right: pd.merge(left,right,on='id'), dfs)

# item_final_features.id = item_final_features.id.map(model.mapping['items_mapping'])
# item_final_features = item_final_features.sort_values('id').drop('id', axis=1)
# item_final_features_matrix_sparse = csr_matrix(item_final_features.values)

In [332]:
item_final_features

Unnamed: 0,value_18+,value_no_genre,value_анимация,value_аниме,value_артхаус,value_биография,value_блогер,value_боевики,value_вестерн,value_военные,...,value_Япония,"value_Япония, Великобритания","value_Япония, Канада","value_Япония, Китай","value_Япония, Китай, Республика Корея","value_Япония, Россия","value_Япония, СССР","value_Япония, США","value_Япония, США, Франция","value_Япония, Сингапур"
10329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10334,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7599,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2285,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10256,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [267]:
# Удаляем те юзеры и айтемы, которых нет в основных табличках
interactions = interactions.merge(users.user_id.drop_duplicates(), on='user_id')
interactions = interactions.merge(items.item_id.drop_duplicates(), on='item_id')

In [328]:
class LightFMWrapper:
    """
    Class for fit-perdict LightFM
    """

    def __init__(
        self,
        random_state: int = 42,
        learning_rate: float = 0.05,
        no_components: int = 10,
        item_alpha: float = 0,
        user_alpha: float = 0,
        loss: str = 'warp',
        num_threads: int = 8,
        epochs: int = 1,
        verbose: int = 1,
    ):
        self.loss=loss
        self.no_components=no_components
        self.user_alpha=user_alpha
        self.item_alpha=item_alpha
        self.epochs=epochs
        self.num_threads=num_threads

        self.verbose = verbose
        self.is_fitted = False

        self.mapping: Dict[str, Dict[int, int]] = defaultdict(dict)

        self.weights_matrix = None
        self.users_watched = None

    def get_mappings(self, users, items):
        self.mapping['users_inv_mapping'] = dict(
            enumerate(users['user_id'].unique())
        )
        self.mapping['users_mapping'] = dict({
            v: k for k, v in self.mapping['users_inv_mapping'].items()
        })

        self.mapping['items_inv_mapping'] = dict(
            enumerate(items['item_id'].unique())
        )
        self.mapping['items_mapping'] = dict({
            v: k for k, v in self.mapping['items_inv_mapping'].items()
        })

    def get_matrix(
        self, df: pd.DataFrame,
        user_col: str = 'user_id',
        item_col: str = 'item_id',
        weight_col: str = None,
    ):
        if weight_col:
            weights = df[weight_col].astype(np.float32)
        else:
            weights = np.ones(len(df), dtype=np.float32)

        if hasattr(self.mapping['users_mapping'], 'get') and \
                hasattr(self.mapping['items_mapping'], 'get'):
            interaction_matrix = sp.sparse.coo_matrix((
                weights,
                (
                    df[user_col].map(self.mapping['users_mapping'].get),
                    df[item_col].map(self.mapping['items_mapping'].get)
                )
            ))
        else:
            raise AttributeError

        self.users_watched = df.groupby(user_col).agg({item_col: list})
        return interaction_matrix

    def prepare_additional_features_users(self, features: pd.DataFrame):
        pass

    def prepare_additional_features_items(self, features: pd.DataFrame):
        items.fillna('Unknown', inplace=True)
        items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
        items["actors"] = items["actors"].str.lower().str.replace(", ", ",", regex=False).str.split(",")

        genre_feature = items[["item_id", "genre"]].explode("genre")
        genre_feature.columns = ["id", "value"]
        genre_feature["feature"] = "genre"

        actors_feature = items[["item_id", "actors"]].explode("actors")
        actors_feature.columns = ["id", "value"]
        actors_feature["feature"] = "actors"


        content_feature = items.reindex(columns=['item_id', "content_type"])
        content_feature.columns = ["id", "value"]
        content_feature["feature"] = "content_type"

        country_feature = items.reindex(columns=['item_id', "countries"])
        country_feature.columns = ["id", "value"]
        country_feature["feature"] = "countries"

        age_feature = items.reindex(columns=['item_id', "age_rating"])
        age_feature.columns = ["id", "value"]
        age_feature["feature"] = "age_feature"

        studios_feature = items.reindex(columns=['item_id', "studios"])
        studios_feature.columns = ["id", "value"]
        studios_feature["feature"] = "studios"


        genre_feature_bin = pd.get_dummies(genre_feature[['id', 'value']]).groupby('id', as_index=False).sum()
        content_feature_bin = pd.get_dummies(content_feature[['id', 'value']]).groupby('id', as_index=False).sum()
        studios_feature_bin = pd.get_dummies(studios_feature[['id', 'value']]).groupby('id', as_index=False).sum()
        age_feature_bin = pd.get_dummies(age_feature[['id', 'value']]).groupby('id', as_index=False).sum()
        country_feature_bin = pd.get_dummies(country_feature[['id', 'value']]).groupby('id', as_index=False).sum()

        dfs = [genre_feature_bin, content_feature_bin, studios_feature_bin, age_feature_bin, country_feature_bin]
        item_final_features = reduce(lambda left,right: pd.merge(left,right,on='id'), dfs)

        item_final_features.id = item_final_features.id.map(model.mapping['items_mapping'])
        item_final_features = item_final_features.sort_values('id').drop('id', axis=1)
        item_final_features_matrix_sparse = csr_matrix(item_final_features.values)

        return item_final_features_matrix_sparse

    def fit(
        self,
        train: pd.DataFrame,
        user_features: Optional[pd.DataFrame] = None, # если не none, то делаем фичи
        item_features: Optional[pd.DataFrame] = None, # если не none, то делаем фичи
    ):
        if user_features is not None:
            user_features = self.prepare_additional_features_users(user_features)

        if item_features is not None:
            item_features = self.prepare_additional_features_items(item_features)


        self.get_mappings(users, items)
        self.weights_matrix = self.get_matrix(train).tocsr()

        self.model = LightFM(
            loss=self.loss,
            no_components=self.no_components,
            user_alpha=self.user_alpha,
            item_alpha=self.item_alpha,
        )

        self.model.fit(
            self.weights_matrix,
            epochs=self.epochs,
            user_features=user_features, # csr_matrix of shape [n_users, n_user_features]
            item_features=item_features, # csr_matrix of shape [n_items, n_item_features]
            num_threads=self.num_threads,
            verbose=self.verbose > 0,
        )

        self.is_fitted = True


    def predict(self, user_id: int, n_recs: int = 10):

        if not self.is_fitted:
            raise ValueError("Fit model before predicting")

        if user_id not in self.mapping['users_mapping'].keys(): return []
        user_id = self.mapping['users_mapping'][user_id]


        scores = self.model.predict(user_id, np.arange(len(self.mapping['items_mapping']))) # LightFM
        top_items = np.argsort(-scores)[:n_recs]
        recos = [self.mapping['items_inv_mapping'][inv_reco_item] for inv_reco_item in top_items]

        return recos

In [329]:
model = LightFMWrapper()

In [330]:
model.fit(train=interactions, item_features=items)

Epoch: 100%|██████████| 1/1 [00:05<00:00,  5.99s/it]


In [331]:
model.predict(user_id=176549)

ValueError: The item feature matrix specifies more features than there are estimated feature embeddings: 831 vs 15963.

In [320]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [326]:
user_id = model.mapping['users_mapping'][176549]

scores = model.model.predict(user_id, np.arange(10))
top_items = np.argsort(-scores)[:10]

In [327]:
top_items

array([2, 1, 4, 7, 9, 5, 8, 6, 3, 0])