In [1]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13707 sha

In [2]:
import numpy as np
import pandas as pd
import tensorflow.keras.backend as K
from tqdm.auto import tqdm
import requests
import zipfile as zf

import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from random import randint
from sklearn.metrics.pairwise import euclidean_distances as ED
from tensorflow import keras

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

In [3]:
# Fetch the NLTK resources used by the text-feature cells below.
import nltk
# Punkt tokenizer data — presumably what word_tokenize relies on.
nltk.download('punkt')
# Stop-word corpora, read later through stopwords.words('russian').
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Выгрузка данных KION

In [4]:
# Download the KION dataset archive (pinned to a fixed commit) into kion.zip,
# streaming it in 1 MiB chunks with a progress bar.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# The context manager releases the connection even if the download fails midway;
# the timeout keeps a stalled connection from hanging the notebook forever.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on an HTTP error instead of saving an error page as kion.zip.
    req.raise_for_status()
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))

    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download', total=total_size_in_bytes,
        unit='iB', unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            progress_bar.update(len(chunk))
            fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [5]:
# Unpack the downloaded archive into the working directory. The `with` block
# closes the archive even when extraction raises.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()

In [6]:
# Load the three tables that were extracted from the archive.
users, items, interactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_original/users.csv',
        'data_original/items.csv',
        'data_original/interactions.csv',
    )
)

## Предобработка данных

In [7]:
# Give the interaction columns the names the following cells refer to.
column_aliases = {'last_watch_dt': 'datetime', 'watched_pct': 'weight'}
interactions.rename(columns=column_aliases, inplace=True)

In [8]:
# Remove every row whose date string is not exactly 10 characters long (the
# length of a '%Y-%m-%d' date), then parse what is left into pandas datetimes.
malformed_date_rows = interactions.index[interactions['datetime'].str.len() != 10]
interactions.drop(malformed_date_rows, inplace=True)
interactions['datetime'] = pd.to_datetime(interactions['datetime'], format='%Y-%m-%d')

In [9]:
# Restrict the user and item tables to ids that occur in the interaction log.
seen_user_ids = interactions['user_id']
seen_item_ids = interactions['item_id']
users = users.loc[users['user_id'].isin(seen_user_ids)]
items = items.loc[items['item_id'].isin(seen_item_ids)]

In [10]:
# One-hot encode the categorical user attributes; user_id stays as the first
# column and every dummy column is prefixed with its source feature name.
user_cat_feats = ["age", "income", "sex", "kids_flg"]
users_ohe_df = pd.concat(
    [users.user_id]
    + [pd.get_dummies(users[feature], prefix=feature) for feature in user_cat_feats],
    axis=1,
)

In [11]:
# One-hot encode the categorical item attributes; item_id stays as the first
# column and every dummy column is prefixed with its source feature name.
item_cat_feats = [
    'content_type', 'release_year',
    'for_kids', 'age_rating',
    'studios', 'countries', 'directors',
]

items_ohe_df = pd.concat(
    [items.item_id]
    + [pd.get_dummies(items[feature], prefix=feature) for feature in item_cat_feats],
    axis=1,
)

In [12]:
print(f"N users before: {interactions.user_id.nunique()}")
print(f"N items before: {interactions.item_id.nunique()}\n")

# Keep only interactions whose weight (the renamed watched_pct) exceeds 10.
interactions = interactions[interactions.weight > 10]

# Users with more than 5 of the remaining interactions.
c = Counter(interactions.user_id)
users_with_enough_interactions = [
    user_id for user_id, entries in c.most_common() if entries > 5
]

# Items with more than 10 of the remaining interactions (counted before the
# user filter below is applied).
c = Counter(interactions.item_id)
items_with_enough_interactions = [
    item_id for item_id, entries in c.most_common() if entries > 10
]

# NOTE(review): both thresholds are evaluated once, so after the item filter a
# user can be left with fewer interactions than the user threshold.
keep_user = interactions.user_id.isin(users_with_enough_interactions)
interactions = interactions[keep_user]
keep_item = interactions.item_id.isin(items_with_enough_interactions)
interactions = interactions[keep_item]

print(f"N users after: {interactions.user_id.nunique()}")
print(f"N items after: {interactions.item_id.nunique()}")


N users before: 962179
N items before: 15706

N users after: 170681
N items after: 6901


In [13]:
# Ids present both in the filtered interactions and in the feature tables.
active_users = set(interactions.user_id.unique()) & set(users_ohe_df.user_id.unique())
active_items = set(interactions.item_id.unique()) & set(items_ohe_df.item_id.unique())

print(len(active_users))
print(len(active_items))

# Keep only rows that refer to an active user and an active item.
interactions = interactions[
    interactions.item_id.isin(active_items) & interactions.user_id.isin(active_users)
]

items_ohe_df = items_ohe_df[items_ohe_df.item_id.isin(active_items)]
users_ohe_df = users_ohe_df[users_ohe_df.user_id.isin(active_users)]

139209
6901


In [14]:
# Replace raw ids with dense integer codes (pandas category codes):
# user_id -> uid, item_id -> iid.
for code_col, id_col in (("uid", "user_id"), ("iid", "item_id")):
    interactions[code_col] = interactions[id_col].astype("category").cat.codes

print(sorted(interactions.iid.unique())[:5])
print(sorted(interactions.uid.unique())[:5])
interactions.head()

[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


Unnamed: 0,user_id,item_id,datetime,total_dur,weight,uid,iid
0,176549,9506,2021-05-11,4250,72.0,22413,3945
1,699317,1659,2021-05-29,8317,100.0,88768,675
3,864613,7638,2021-07-05,14483,100.0,109925,3163
6,1016458,354,2021-08-14,1672,25.0,128826,139
7,884009,693,2021-08-04,703,14.0,112355,279


In [15]:
# Dense user x item matrix: row = uid, column = iid.
interactions_vec = np.zeros((interactions.uid.nunique(),
                             interactions.iid.nunique()))

# Count interactions per (uid, iid) pair in one vectorised call. Unlike plain
# fancy-index `+=`, np.add.at accumulates repeated index pairs.
np.add.at(
    interactions_vec,
    (interactions.uid.to_numpy(), interactions.iid.to_numpy()),
    1,
)

# Normalise every row to sum to 1 so it can be used as a sampling distribution.
# The division is done in place to avoid a second matrix of the same size.
res = interactions_vec.sum(axis=1)
interactions_vec /= res[:, np.newaxis]

In [16]:
# Remove item 11805 from the item feature table.
# NOTE(review): the notebook does not record why this id is excluded.
items_ohe_df = items_ohe_df.loc[items_ohe_df['item_id'] != 11805]

# Unique-id counts: items should match between the two tables, and so should users.
for id_column in (interactions.item_id, items_ohe_df.item_id,
                  interactions.user_id, users_ohe_df.user_id):
    print(id_column.nunique())

# Items that have features but no interactions.
print(set(items_ohe_df.item_id.unique()) - set(interactions.item_id.unique()))



6900
6900
139209
139209
set()


In [17]:
# Lookup tables between dense codes and raw ids, in both directions.
item_pairs = interactions[["iid", "item_id"]].drop_duplicates()
iid_to_item_id = item_pairs.set_index("iid").to_dict()["item_id"]
item_id_to_iid = item_pairs.set_index("item_id").to_dict()["iid"]

user_pairs = interactions[["uid", "user_id"]].drop_duplicates()
uid_to_user_id = user_pairs.set_index("uid").to_dict()["user_id"]
user_id_to_uid = user_pairs.set_index("user_id").to_dict()["uid"]

In [18]:
# Index both feature tables by their dense code. A raw id that is missing from
# the lookup table raises KeyError.
items_ohe_df["iid"] = items_ohe_df["item_id"].apply(item_id_to_iid.__getitem__)
items_ohe_df = items_ohe_df.set_index("iid")

users_ohe_df["uid"] = users_ohe_df["user_id"].apply(user_id_to_uid.__getitem__)
users_ohe_df = users_ohe_df.set_index("uid")

### Подготовка данных для temporal слоев

In [19]:
# For the temporal layers, keep each user's last 5 interactions (ordered by
# datetime) as a uint16 array of iids; the result is indexed by uid.
interactions_lists = (
    interactions
    .sort_values(by=["uid", "datetime"])
    .groupby("uid")
    .iid
    .apply(lambda user_iids: np.array(user_iids, dtype=np.uint16)[-5:])
    .to_frame()
)

In [20]:
# A mean below 5 means some users with fewer than 5 interactions slipped
# through; they have to be removed.
interactions_lists.iid.map(len).mean()

4.998785998031736

In [21]:
# Length of every user's stored history.
interactions_lists['len'] = interactions_lists.iid.map(len)

In [22]:
# Keep only users whose history has exactly 5 items.
interactions_lists = interactions_lists[interactions_lists['len'] == 5]

In [23]:
# uids (index labels) of the users that kept a full history.
uid_list_valid = interactions_lists.index.tolist()

In [24]:
# The helper column is no longer needed.
interactions_lists.drop(columns=['len'], inplace=True)

## Лосс

In [25]:
def triplet_loss(y_true, y_pred, n_dims=128, alpha=0.4):
    """Hinge-style triplet loss over three concatenated embeddings.

    `y_pred` is sliced along axis 1 into three consecutive chunks of `n_dims`
    columns: the user (anchor) vector, the positive item vector and the
    negative item vector. `y_true` is accepted to fit the loss signature but
    is never read.

    Returns, per row, max(d(anchor, positive) - d(anchor, negative) + alpha, 0),
    where d is the squared Euclidean distance.
    """
    user_vec, good_item_vec, bad_item_vec = (
        y_pred[:, start:start + n_dims] for start in (0, n_dims, n_dims * 2)
    )

    def squared_distance(item_vec):
        # Squared Euclidean distance from the user vector, one value per row.
        return K.sum(K.square(user_vec - item_vec), axis=1)

    margin_violation = (
        squared_distance(good_item_vec) - squared_distance(bad_item_vec) + alpha
    )
    # Clamp at zero: triplets that already satisfy the margin contribute nothing.
    return K.maximum(margin_violation, 0.0)

## Текстовые признаки

In [26]:
# Missing descriptions become empty strings so the text pipeline can run on every row.
items = items.assign(description=items['description'].fillna(''))

In [27]:
def normalize(text):
    """Tokenize `text`, trim punctuation from token edges and lowercase.

    Tokens come from `word_tokenize`. The characters listed in `punct` are
    stripped from both ends of every token, tokens that end up empty are
    dropped, and the rest are lowercased.

    Returns a list of str.
    """
    # Raw string: str.strip() takes a set of characters, not a regex, so the
    # backslashes are literal members of that set. Without the `r` prefix,
    # sequences such as `\+` or `\[` are invalid string escapes that newer
    # Python versions warn about.
    punct = r'!"#$%&()*\+,-\./:;<=>?@\[\]^_`{|}~„“«»†*\—/\-‘’–'
    stripped = (w.strip(punct) for w in word_tokenize(text))
    return [w.lower() for w in stripped if w != '']

In [28]:
# Tokenised, punctuation-trimmed, lowercased description of every item.
items['clear_description'] = items['description'].apply(normalize)

In [29]:
# Russian stop words as a frozenset: the only use in this notebook is the
# `w not in sw` membership test, which is O(1) on a set and O(n) on a list.
sw = frozenset(stopwords.words('russian'))

In [30]:
def filter_stopwords(text):
    """Return the tokens of `text` that are not in the module-level `sw`, in order."""
    kept = []
    for token in text:
        if token in sw:
            continue
        kept.append(token)
    return kept

In [31]:
# Drop stop words from every tokenised description.
items['clear_description'] = items['clear_description'].apply(filter_stopwords)

In [32]:
morph = MorphAnalyzer()

# Memo of word -> lemma, so a word that occurs more than once across the
# descriptions is sent through morph.parse only once.
_lemma_cache = {}


def _lemmatize(word):
    """Return the word of the normalized form of the first parse of `word`."""
    lemma = _lemma_cache.get(word)
    if lemma is None:
        lemma = _lemma_cache[word] = morph.parse(word)[0].normalized.word
    return lemma


# Replace every token with its lemma.
items['clear_description'] = items['clear_description'].apply(
    lambda words: [_lemmatize(w) for w in words]
)

In [33]:
# Back to one space-separated string per item, the input format CountVectorizer takes.
items['clear_description'] = items['clear_description'].apply(' '.join)

In [34]:
# Bag-of-words counts over the cleaned descriptions: at most 300 terms, each
# appearing in at least 3 documents.
vectorizer = CountVectorizer(min_df=3, max_features=300)
description_counts = vectorizer.fit_transform(items['clear_description'])
text_vectorized = pd.DataFrame(
    description_counts.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
text_vectorized.shape

(15706, 300)

In [35]:
# Attach raw item ids to the text features by position. text_vectorized was
# built row by row from `items` but carries a fresh RangeIndex, so a
# label-aligned assignment only lines up while `items` still has its default
# index.
text_vectorized['item_id'] = items['item_id'].to_numpy()

# merge() discards the left frame's index, which would lose the `iid` labels.
# Carry `iid` through as a column, restore it afterwards and sort by it, so
# that row position equals iid (as long as the iids present are contiguous
# from 0). The positional lookups further down (`.iloc[...]`, and
# `.to_numpy()` followed by argsort -> iid_to_item_id) depend on that.
items_ohe_df = (
    items_ohe_df
    .reset_index()
    .merge(text_vectorized, on='item_id', how='left')
    .fillna(0)
    .set_index('iid')
    .sort_index()
)

In [36]:
# Keep only the users with a full 5-item history: uid_list_valid holds the
# index labels (uids) of interactions_lists after its `len == 5` filter.
# After this, row position in users_ohe_df no longer equals uid.
users_ohe_df = users_ohe_df.loc[uid_list_valid]

## Подготовка загрузчика данных

In [37]:
def generator(items, users, interactions, interactions_lists, batch_size=16):
    """Endlessly yield random (user, positive item, negative item) batches.

    Parameters
    ----------
    items : pandas.DataFrame
        Item features; the row at position `i` is used as the features of the
        item in column `i` of `interactions`.
    users : pandas.DataFrame
        User features whose index labels are uids (row numbers of
        `interactions`).
    interactions : numpy.ndarray
        One row per uid, one column per item; each row is used as the
        probability distribution for sampling that user's positive item.
    interactions_lists : pandas.DataFrame
        Indexed by uid, with an `iid` column holding each user's equal-length
        sequence of recent item ids.
    batch_size : int
        Number of triplets per batch.

    Yields
    ------
    (inputs, targets)
        inputs  = [user_meta, user_interactions, user_temporal, pos_items, neg_items]
        targets = [user_meta, user_interactions, user_temporal]
    """
    n_users = len(users)
    n_items = interactions.shape[1]
    while True:
        uid_meta = []
        uid_interaction = []
        uid_temporal = []
        pos = []
        neg = []
        for _ in range(batch_size):
            # Sample a row of `users` and translate it to its uid label:
            # `users` may hold only a subset of the uids in `interactions`,
            # so row position and uid are not interchangeable.
            row = randint(0, n_users - 1)
            uid_i = users.index[row]
            # Positive item: drawn from the user's interaction distribution.
            pos_i = np.random.choice(n_items, p=interactions[uid_i])
            # Negative item: drawn uniformly (it may coincide with a positive one).
            neg_i = np.random.choice(n_items)
            # User features.
            uid_meta.append(users.iloc[row])
            # The user's interaction vector, taken from the `interactions`
            # argument rather than from a module-level variable.
            uid_interaction.append(interactions[uid_i])
            # The user's recent-item sequence, looked up by uid label.
            uid_temporal.append(interactions_lists.loc[uid_i].iid)
            # Features of the positive and of the negative item.
            pos.append(items.iloc[pos_i])
            neg.append(items.iloc[neg_i])

        # Add a trailing feature axis: (batch, sequence length, 1).
        temporal = np.reshape(np.array(uid_temporal), (batch_size, -1, 1))
        user_meta = np.array(uid_meta)
        user_interactions = np.array(uid_interaction)
        yield (
            [user_meta, user_interactions, temporal, np.array(pos), np.array(neg)],
            [user_meta, user_interactions, temporal],
        )



In [38]:
# Draw one batch from the generator and print the shape of each part.
gen = generator(
    interactions_lists=interactions_lists,
    interactions=interactions_vec,
    users=users_ohe_df.drop(["user_id"], axis=1),
    items=items_ohe_df.drop(["item_id"], axis=1),
)
ret = next(gen)

# ret[0]: model inputs.
print(f"вектор фичей юзера: {ret[0][0].shape}")
print(f"вектор взаимодействий юзера с айтемами: {ret[0][1].shape}")
print(f"последовательность взаимодействий юзера: {ret[0][2].shape}")
print(f"вектор 'хорошего' айтема: {ret[0][3].shape}")
print(f"вектор 'плохого' айтема: {ret[0][4].shape}")

# ret[1]: targets.
print()
print(f"вектор фичей юзера: {ret[1][0].shape}")
print(f"вектор взаимодействий юзера с айтемами: {ret[1][1].shape}")
print(f"последовательность взаимодействий юзера: {ret[1][2].shape}")

вектор фичей юзера: (16, 16)
вектор взаимодействий юзера с айтемами: (16, 6900)
последовательность взаимодействий юзера: (16, 5, 1)
вектор 'хорошего' айтема: (16, 8993)
вектор 'плохого' айтема: (16, 8993)

вектор фичей юзера: (16, 16)
вектор взаимодействий юзера с айтемами: (16, 6900)
последовательность взаимодействий юзера: (16, 5, 1)


In [39]:
# Embedding size shared by the user and item towers.
N_FACTORS = 128

# Input widths: every column except the raw id. Counting column labels avoids
# copying the feature frames just to read a shape.
ITEM_MODEL_SHAPE = (len(items_ohe_df.columns.drop("item_id")),)
USER_META_MODEL_SHAPE = (len(users_ohe_df.columns.drop("user_id")),)

USER_INTERACTION_MODEL_SHAPE = (interactions_vec.shape[1],)
# 5 time steps, one value per step for each column of interactions_lists.
USER_TEMPORAL_MODEL_SHAPE = (5, len(interactions_lists.columns))

print(f"N_FACTORS: {N_FACTORS}")
print(f"ITEM_MODEL_SHAPE: {ITEM_MODEL_SHAPE}")  # includes the text features
print(f"USER_META_MODEL_SHAPE: {USER_META_MODEL_SHAPE}")
print(f"USER_INTERACTION_MODEL_SHAPE: {USER_INTERACTION_MODEL_SHAPE}")
print(f"USER_TEMPORAL_MODEL_SHAPE: {USER_TEMPORAL_MODEL_SHAPE}")

N_FACTORS: 128
ITEM_MODEL_SHAPE: (8993,)
USER_META_MODEL_SHAPE: (16,)
USER_INTERACTION_MODEL_SHAPE: (6900,)
USER_TEMPORAL_MODEL_SHAPE: (5, 1)


## Архитектура нейронки

In [40]:
def item_model(n_factors=N_FACTORS):
    """Build the item tower: item features -> `n_factors`-dimensional embedding.

    Two ELU dense layers joined by a residual Add, followed by a linear
    projection. All dense layers are bias-free and L2-regularised (1e-6) on
    both kernel and activity. The input width comes from ITEM_MODEL_SHAPE.

    `n_factors` sets the width of every dense layer. NOTE: triplet_loss slices
    its input with its own `n_dims` (default 128); keep the two in sync when
    passing a different value here.
    """
    def dense(activation):
        # Width follows the `n_factors` argument, not the module constant.
        return keras.layers.Dense(n_factors, activation=activation, use_bias=False,
                                  kernel_regularizer=keras.regularizers.l2(1e-6),
                                  activity_regularizer=keras.regularizers.l2(l2=1e-6))

    inp = keras.layers.Input(shape=ITEM_MODEL_SHAPE)

    layer_1 = dense('elu')(inp)
    layer_2 = dense('elu')(layer_1)

    # Residual connection around the second dense layer.
    add = keras.layers.Add()([layer_1, layer_2])

    out = dense('linear')(add)

    return keras.models.Model(inp, out)

In [41]:
def user_model(n_factors=N_FACTORS):
    """Build the user tower: three user inputs -> `n_factors`-dimensional embedding.

    Inputs (shapes from the module-level *_MODEL_SHAPE constants):
      * meta features        -> two ELU dense layers with a residual Add
      * interaction vector   -> one ELU dense layer
      * temporal sequence    -> LSTM(64)
    The three branches are concatenated and projected by a linear dense layer.
    All dense layers are bias-free and L2-regularised (1e-6) on both kernel
    and activity.

    `n_factors` sets the width of every dense layer. NOTE: triplet_loss slices
    its input with its own `n_dims` (default 128); keep the two in sync when
    passing a different value here.
    """
    def dense(activation):
        # Width follows the `n_factors` argument, not the module constant.
        return keras.layers.Dense(n_factors, activation=activation, use_bias=False,
                                  kernel_regularizer=keras.regularizers.l2(1e-6),
                                  activity_regularizer=keras.regularizers.l2(l2=1e-6))

    inp_meta = keras.layers.Input(shape=USER_META_MODEL_SHAPE)
    inp_interaction = keras.layers.Input(shape=USER_INTERACTION_MODEL_SHAPE)
    inp_temporal = keras.layers.Input(shape=USER_TEMPORAL_MODEL_SHAPE)

    layer_1_meta = dense('elu')(inp_meta)
    layer_1_interaction = dense('elu')(inp_interaction)

    # Temporal branch.
    layer_1_temporal = keras.layers.LSTM(64)(inp_temporal)

    layer_2_meta = dense('elu')(layer_1_meta)

    # Residual connection around the second meta layer.
    add = keras.layers.Add()([layer_1_meta, layer_2_meta])

    concat_meta_interaction = keras.layers.Concatenate()(
        [add, layer_1_interaction, layer_1_temporal]
    )

    out = dense('linear')(concat_meta_interaction)

    return keras.models.Model([inp_meta, inp_interaction, inp_temporal], out)

In [42]:
# Instantiate the two towers: item -> vector and user -> vector.
i2v, u2v = item_model(), user_model()

In [43]:
# Training model: embeds one user (anchor) and two items, and outputs the three
# embeddings concatenated, which is the layout triplet_loss slices apart.

# Anchor (user) inputs, in the order u2v expects them.
ancor_meta_in = keras.layers.Input(shape=USER_META_MODEL_SHAPE)
ancor_interaction_in = keras.layers.Input(shape=USER_INTERACTION_MODEL_SHAPE)
ancor_temporal_in = keras.layers.Input(shape = USER_TEMPORAL_MODEL_SHAPE)

# Positive and negative item inputs.
pos_in = keras.layers.Input(shape=ITEM_MODEL_SHAPE)
neg_in = keras.layers.Input(shape=ITEM_MODEL_SHAPE)

# The same i2v instance embeds both items, so its weights are shared.
ancor = u2v([ancor_meta_in, ancor_interaction_in, ancor_temporal_in])
pos = i2v(pos_in)
neg = i2v(neg_in)

# Output layout: [anchor | positive | negative].
res = keras.layers.Concatenate(name="concat_ancor_pos_neg")([ancor, pos, neg])

model = keras.models.Model([ancor_meta_in, ancor_interaction_in, ancor_temporal_in, pos_in, neg_in], res)

In [46]:
model_name = 'recsys_resnet_linear'

# Multiply the learning rate by 0.8 after 2 epochs without improvement in the
# training loss; stop training after 4 such epochs.
decay = keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=2, factor=0.8, verbose=1)
early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=4)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss=triplet_loss, optimizer=opt, run_eagerly=True)



## Непосредственное обучение модели

In [47]:
# Endless stream of random triplet batches for training.
train_batches = generator(
    items=items_ohe_df.drop(["item_id"], axis=1),
    users=users_ohe_df.drop(["user_id"], axis=1),
    interactions=interactions_vec,
    interactions_lists=interactions_lists,
    batch_size=16,
)

# The generator never ends, so the epoch length is set by steps_per_epoch.
model.fit(
    train_batches,
    steps_per_epoch=25,
    epochs=5,
    initial_epoch=0,
    callbacks=[decay, early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb6615f33a0>

In [48]:
# Pick a random user by uid (the index labels of users_ohe_df).
rand_uid = np.random.choice(list(users_ohe_df.index))

# users_ohe_df and interactions_lists hold only the users with a full history,
# so their row positions do not equal uid: look both up by label.
# interactions_vec has one row per uid and is indexed directly.
user_meta_feats = users_ohe_df.drop(["user_id"], axis=1).loc[rand_uid]
user_interaction_vec = interactions_vec[rand_uid]
user_temporal_vec = interactions_lists.loc[rand_uid].iid

# Pick a random item by its index label and fetch its features by that label.
rand_iid = np.random.choice(list(items_ohe_df.index))
item_feats = items_ohe_df.drop(["item_id"], axis=1).loc[rand_iid]

# User embedding. The temporal input gets an explicit trailing feature axis to
# match USER_TEMPORAL_MODEL_SHAPE.
user_vec = u2v.predict([np.array(user_meta_feats).reshape(1, -1),
                        np.array(user_interaction_vec).reshape(1, -1),
                        np.array(user_temporal_vec).reshape(1, -1, 1)])

# Item embedding.
item_vec = i2v.predict(np.array(item_feats).reshape(1, -1))

# Euclidean distance between the user and the item embedding.
ED(user_vec, item_vec)



array([[2.8275933]], dtype=float32)

In [49]:
# Feature matrix of all items (raw id column removed).
items_feats = items_ohe_df.drop(columns=["item_id"]).to_numpy()
# Embeddings of all items.
items_vecs = i2v.predict(items_feats)

# Distances from the sampled user's embedding to every item embedding.
dists = ED(user_vec, items_vecs)



## Вот тут кончаются ресурсы в коллабе (((

In [50]:
# Features of the users that have a full 5-item history, indexed by uid.
users_meta_feats = users_ohe_df.drop(["user_id"], axis=1)
valid_uids = users_meta_feats.index.to_numpy()

# interactions_vec has one row per uid, including users that were removed from
# users_ohe_df, so select the rows that match users_meta_feats.
# NOTE: this fancy indexing copies the selected rows, which is a large
# allocation for the full matrix.
users_interaction_vec = interactions_vec[valid_uids]

# Stack the per-user sequences into one (n_users, sequence length, 1) array in
# the same uid order, instead of passing a Series of separate arrays.
user_temporal_vec = np.stack(interactions_lists.loc[valid_uids, "iid"].to_numpy())
user_temporal_vec = user_temporal_vec.reshape(len(valid_uids), -1, 1)

In [None]:
# Embed all users at once. np.asarray returns an existing ndarray as is, where
# np.array would make another full copy of the large interaction matrix.
users_vec = u2v.predict([np.asarray(users_meta_feats),
                         np.asarray(users_interaction_vec),
                         np.asarray(user_temporal_vec)])

In [None]:
# Distance from every user embedding (rows) to every item embedding (columns).
dists = ED(X=users_vec, Y=items_vecs)

In [None]:
# Column positions of the 10 nearest items for every user, nearest first.
top10_iids = dists.argsort(axis=1)[:, :10]

In [None]:
# Translate the flattened iids back to raw item ids.
top10_iids_item = list(map(iid_to_item_id.__getitem__, top10_iids.ravel()))

In [None]:
# Restore the (n_users, 10) layout.
top10_iids_item = np.asarray(top10_iids_item).reshape(top10_iids.shape)

In [None]:
# Row k of the recommendations belongs to row k of users_meta_feats, whose index
# holds the uid. The frame contains only users with a full history, so the row
# number itself is not a uid.
assert len(users_meta_feats) == top10_iids_item.shape[0], \
    "recommendation rows do not match users_meta_feats rows"
df_dssm = pd.DataFrame({'user_id': [uid_to_user_id[uid] for uid in users_meta_feats.index]})

In [None]:
# One array of 10 recommended item ids per user row.
df_dssm['item_id'] = [user_items for user_items in top10_iids_item]

In [None]:
# Long format: one row per (user, recommended item).
df_dssm = df_dssm.explode('item_id')
# 1-based position of each item within its user's rows.
df_dssm['rank'] = df_dssm.groupby('user_id').cumcount() + 1
# Collapse back to one row per user with the item ids gathered into a list.
# NOTE(review): only 'item_id' is aggregated, so the 'rank' column computed
# above is not carried into the result — confirm this is intended.
df_dssm = df_dssm.groupby('user_id').agg({'item_id': list}).reset_index()

In [None]:
# Preview the first rows: one row per user_id with its list of recommended item ids.
df_dssm.head()