In [3]:
! pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

Collecting polara
  Cloning https://github.com/evfro/polara.git (to revision develop) to /tmp/pip-install-252azr9i/polara_b0cee97c23df4d9bac9cbb836a3b6ecf
  Running command git clone -q https://github.com/evfro/polara.git /tmp/pip-install-252azr9i/polara_b0cee97c23df4d9bac9cbb836a3b6ecf
  Running command git checkout -b develop --track origin/develop
  Switched to a new branch 'develop'
  Branch 'develop' set up to track remote branch 'develop' from 'origin'.
Building wheels for collected packages: polara
  Building wheel for polara (setup.py) ... [?25l[?25hdone
  Created wheel for polara: filename=polara-0.7.2.dev0-py3-none-any.whl size=87177 sha256=b2d5dbcd8117558e0fe8d9be2c78e9fe37a9d57e26aece8eb369ae14a283ae72
  Stored in directory: /tmp/pip-ephem-wheel-cache-5mlbd3to/wheels/19/88/81/920c4189a6b0b92f19b02f18fcb44ded22ae852f7b09ab2e28
Successfully built polara
Installing collected packages: polara
Successfully installed polara-0.7.2.dev0


In [4]:
from polara import SVDModel
from polara.recommender.data import RecommenderData
import numpy as np
import pandas as pd
from tqdm import tqdm

## Data preprocessing and analysis 

In [6]:
memes = pd.read_csv('memes.csv', names=['meme_id', 'file_id', 'author_id', 'timestamp', 
                                        'filetype', 'unique_id', 'caption' ], header=None)
users_memes = pd.read_csv('users_memes.csv', names=['id__', 'chat_id', 'meme_id', 'reaction',
                                        'timestamp', 'message_id', 'timestamp_reaction'], header=None)
users = pd.read_csv('users.csv', names=['chat_id', 'name', '?', 'username',
                                        'date_add', 'is_fresh', '??'], header=None)

In [7]:
users.head()

Unnamed: 0,chat_id,name,?,username,date_add,is_fresh,??
0,259156066,Аринка-карантинка,259156066,chameleon_lizard,2021-12-10 22:22:39.000000,True,
1,577915728,Спиридон,577915728,Blackaddder,2021-12-11 05:57:23.000000,True,
2,439924549,oleg,439924549,ol3gka,2021-12-11 06:21:47.000000,True,
3,411123163,MX,411123163,mx_bz,2022-01-19 07:31:46.000000,False,
4,401426372,Аня,401426372,anvanche,2021-12-11 07:50:22.000000,True,


In [8]:
users_memes.head()

Unnamed: 0,id__,chat_id,meme_id,reaction,timestamp,message_id,timestamp_reaction
0,1008,354637850,570,1002,2022-01-18 21:53:30.000000,7669,
1,1018,354637850,597,1002,2022-01-18 23:36:17.000000,7689,
2,1028,354637850,130,1002,2022-01-18 23:56:20.000000,7715,
3,1038,354637850,2619,1002,2022-01-19 00:46:39.000000,7745,
4,1048,354637850,576,1002,2022-01-19 01:10:00.000000,7769,


In [9]:
np.sum(~memes.caption.isna())

2907

In [10]:
memes.head()

Unnamed: 0,meme_id,file_id,author_id,timestamp,filetype,unique_id,caption
0,2811,AgACAgQAAxkBAAJJWGHqZIz4f3WSyvH0XIFxhvPjwqWxAA...,2106431824,2022-01-21 07:45:16.000000,photo,,
1,2826,AgACAgIAAxkBAAKIG2H3n_djkRg9MMUShOzCMHRcIm6mAA...,354637850,2022-01-31 08:38:15.000000,photo,AQADfL0xG4l3uUt9,
2,2827,AgACAgQAAxkBAAL2pGIEAAH2wjbF8DWHGiNewBn_UnrE6A...,354637850,2022-02-09 17:59:18.000000,photo,AQADyqoxGw8rrVF-,
3,2876,AgACAgQAAxkBAAL-aWIEBLnt94wyXCXhygRdYajavWbvAA...,2106431824,2022-02-09 18:15:21.000000,photo,AQADBqsxG3SVDFB-,"kvin, where is your corona"
4,2925,AgACAgQAAxkBAAEBAiBiBAetqQPwhJcXSKpCGXgI4lgbIw...,2106431824,2022-02-09 18:27:57.000000,photo,AQADUa0xGzmh5VJ9,Why do you keep coming back?


In [11]:
data = users_memes[['chat_id', 'meme_id', 'reaction', 'timestamp']].copy()
data.sort_values(by='timestamp', inplace=True) # sort data by time

REACTION2VALUE = {1001: -1, 107: -1, 106: -1, 105: -1, 104: -1,
                  103: 1, 102: 1, 101: 1, 100: 1, 1000: 1, 1002 : None} # 1002 means empty

data.reaction = users_memes.reaction.map(REACTION2VALUE)
data = data.dropna(axis='index', how='any', subset=['reaction'])

data.head()

Unnamed: 0,chat_id,meme_id,reaction,timestamp
882,354637850,398,-1.0,2021-12-08 20:35:19.000000
883,354637850,222,-1.0,2021-12-08 20:35:35.000000
884,481807223,387,1.0,2021-12-08 20:37:15.000000
885,354886383,61,-1.0,2021-12-08 20:39:05.000000
886,354886383,182,-1.0,2021-12-08 20:39:11.000000


In [12]:
train_data = data.copy()
train_data.drop_duplicates(subset=['chat_id', 'meme_id'], inplace=True)

In [13]:
train_data.reaction.value_counts()

-1.0    37292
 1.0    33166
Name: reaction, dtype: int64

In [14]:
print("num of unique users: ", len(train_data.chat_id.unique()))
print("num of unique memes: ", len(train_data.meme_id.unique()))
print("num of users that have at least 10 reactions: ", np.sum(train_data.chat_id.value_counts().values >= 10))

# let's leave only those users that have at least 10 reactions on memes
n_reactions = 10
train_data = train_data.loc[
    train_data
    .groupby('chat_id')['chat_id']
    .transform('count') >= n_reactions
]

num of unique users:  1060
num of unique memes:  6240
num of users that have at least 10 reactions:  873


In [15]:
data_model_1 = RecommenderData(train_data, 'chat_id', 'meme_id', 'reaction')
data_model_1.warm_start = False
data_model_1.holdout_size = 1
data_model_1.test_ratio = 0.2

train_val = data_model_1.training
holdout_test = data_model_1.test[1] #holdout
data_indexing_tvt = data_model_1.index


data_model_2 = RecommenderData(train_val, 'chat_id', 'meme_id', 'reaction')
data_model_2.warm_start = False
data_model_2.holdout_size = 1
data_model_2.test_ratio = 0.2

train = data_model_2.training
holdout_val = data_model_2.test[1] #holdout
data_indexing_tv = data_model_2.index



Preparing data...
Users are not uniformly ordered! Unable to split test set reliably.
6 unique meme_id entities within 6 holdout interactions were filtered. Reason: not in the training data.
Done.
There are 69486 events in the training and 169 events in the holdout.
Preparing data...
Users are not uniformly ordered! Unable to split test set reliably.
4 unique meme_id entities within 4 holdout interactions were filtered. Reason: not in the training data.
Done.
There are 69311 events in the training and 171 events in the holdout.


In [16]:
print(data_model_1._random_holdout, data_model_1._negative_prediction)

False False


In [17]:
print(f'train size = {train.shape[0]}, val size = {holdout_val.shape[0]}, test size = {holdout_test.shape[0]}')

train size = 69311, val size = 171, test size = 169


## Build Pure SVD Model with best rank

In [18]:
for rank in [10, 15, 30, 50, 100, 200, 300, 400, 500,  600]:
  print(f'-----RANK={rank}------')
  svd = SVDModel(data_model_2)
  svd.rank = rank
  svd.build()
  print(svd.evaluate('main'))

-----RANK=10------
PureSVD training time: 0.071s
[Relevance(hr=0.07602339181286549), Ranking(arhr=0.03222639933166249, mrr=0.032226399331662485)]
-----RANK=15------
PureSVD training time: 0.085s
[Relevance(hr=0.08187134502923976), Ranking(arhr=0.03233314768402487, mrr=0.03233314768402488)]
-----RANK=30------
PureSVD training time: 0.132s
[Relevance(hr=0.08771929824561403), Ranking(arhr=0.03558201058201058, mrr=0.03558201058201058)]
-----RANK=50------
PureSVD training time: 0.124s
[Relevance(hr=0.05847953216374269), Ranking(arhr=0.026949317738791424, mrr=0.026949317738791424)]
-----RANK=100------
PureSVD training time: 0.276s
[Relevance(hr=0.07017543859649122), Ranking(arhr=0.011791051703332405, mrr=0.011791051703332403)]
-----RANK=200------
PureSVD training time: 0.749s
[Relevance(hr=0.029239766081871343), Ranking(arhr=0.005198180636777129, mrr=0.005198180636777128)]
-----RANK=300------
PureSVD training time: 1.161s
[Relevance(hr=0.03508771929824561), Ranking(arhr=0.0142369813422445, m

In [19]:
print('-----training on train+val and evaluaeted on test------')
svd = SVDModel(data_model_1)
svd.rank = 30
svd._topk = 10
svd.build()
print(svd.evaluate('main'))


-----training on train+val and evaluaeted on test------
PureSVD training time: 0.118s
[Relevance(hr=0.08875739644970414), Ranking(arhr=0.0350662158354466, mrr=0.035066215835446606)]


## Gradient Boosting

In [22]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from lightgbm import LGBMClassifier,plot_importance
from matplotlib import pyplot as plt
from itertools import product

from evaluation import topn_recommendations, model_evaluate, downvote_seen_items

In [23]:
data_description = dict(
    users = train['chat_id'].name,
    items = train['meme_id'].name,
    feedback = 'reaction',
    n_users = len(train['chat_id']),
    n_items = len(train['meme_id']),
    test_users = holdout_val[train['chat_id'].name].values
)

testset = data_description['test_users']
seen_data = train[train.chat_id.isin(testset)]

In [25]:
svd = SVDModel(data_model_2)
svd.rank = 30
svd.build()

item_factors = svd.factors['meme_id']
user_factors = svd.factors['chat_id']

PureSVD training time: 0.174s


In [26]:
def get_vectorized_data(df, user_factors, item_factors):
  X = []
  y_true = []
  for i in tqdm(range(df.shape[0])):
    user_features = user_factors[int(df.iloc[i]['chat_id']), :]
    item_features = item_factors[int(df.iloc[i]['meme_id']), :]
    features = np.concatenate((user_features,item_features), axis=0)
    y = (int(df.iloc[i]['reaction']) + 1) / 2 # here 1 stand for like and 0 for dislike
    y_true.append(y)
    X.append(features)
  X = np.array(X)

  return X, y_true


def boosting_scoring(model, holdout, user_factors, item_factors):
  scores = []
  for id_ in tqdm(holdout['chat_id'].values):
    user_features = user_factors[id_, :]
    user_matrix = item_factors.shape[0] * [user_features]
    user_matrix = np.array(user_matrix)
    features = np.concatenate((user_matrix, item_factors), axis=1)
    score = model.predict_proba(features)[:, 1]
    scores.append(score)
  return scores

In [27]:
X_train, y_train = get_vectorized_data(train, user_factors, item_factors)
X_val, y_val = get_vectorized_data(holdout_val, user_factors, item_factors)

100%|██████████| 69311/69311 [00:41<00:00, 1676.16it/s]
100%|██████████| 171/171 [00:00<00:00, 525.40it/s]


In [28]:
light = LGBMClassifier(n_estimators=200, learning_rate=0.05,
                      min_child_samples=30, num_leaves=127)
light.fit(X_train, y_train)


LGBMClassifier(learning_rate=0.05, min_child_samples=30, n_estimators=200,
               num_leaves=127)

In [29]:
scores = boosting_scoring(light, holdout_val, user_factors, item_factors)


100%|██████████| 171/171 [00:13<00:00, 13.10it/s]


In [30]:
downvote_seen_items(np.array(scores), seen_data, data_description)
recs = topn_recommendations(np.array(scores), topn=20)
print('HR={:.3}, MRR={:.3}, COV={:.3}'.format(*model_evaluate(recs, holdout_val, data_description,  topn=10)))

HR=0.0351, MRR=0.00743, COV=0.0123


## Search for best parameters for LightGBM

In [459]:
ranks = [10, 15, 30, 50, 100, 200, 300, 400, 500,  600]
nestims = [50, 100,  200, 300, 400, 500]
learning_rates = [0.0001, 0.001,  0.01, 0.1]
max_depths = [3, 5, 7, 9]

params = product(ranks, nestims, learning_rates, max_depths)

In [None]:
results = []
for param in params:
  svd = SVDModel(data_model_2)
  svd.rank = param[0]
  svd.build()

  item_factors = svd.factors['meme_id']
  user_factors = svd.factors['chat_id']

  X_train, y_train = get_vectorized_data(train, user_factors, item_factors)

  light = LGBMClassifier(n_estimators=param[1], learning_rate=param[2], max_depth=param[3])
  light.fit(X_train, y_train)

  scores = boosting_scoring(light, holdout_val, user_factors, item_factors)

  downvote_seen_items(np.array(scores), seen_data, data_description)
  recs = topn_recommendations(np.array(scores), topn=10)
  print(param)
  hr, mrr, cov = model_evaluate(recs, holdout_val, data_description,  topn=10)
  print('HR={:.3}, MRR={:.3}, COV={:.3}'.format(*model_evaluate(recs, holdout_val, data_description,  topn=10)))

  results.append([*param, hr, mrr])



100%|██████████| 171/171 [00:09<00:00, 18.08it/s]


(30, 200, 0.1, 9)
HR=0.0468, MRR=0.0101, COV=0.00717
PureSVD training time: 0.080s


 17%|█▋        | 11919/69311 [00:05<00:25, 2267.96it/s]


KeyboardInterrupt: ignored

In [17]:
ranks = [10, 50, 100, 300]
nestims = [200, 300, 400]
learning_rates = [0.01]
max_depths = [3, 5, 7]

params = product(ranks, nestims, learning_rates, max_depths)

In [18]:
results = []
for param in params:
  svd = SVDModel(data_model_2)
  svd.rank = param[0]
  svd.build()

  item_factors = svd.factors['meme_id']
  user_factors = svd.factors['chat_id']

  X_train, y_train = get_vectorized_data(train, user_factors, item_factors)

  light = LGBMClassifier(n_estimators=param[1], learning_rate=param[2], max_depth=param[3])
  light.fit(X_train, y_train)

  scores = boosting_scoring(light, holdout_val, user_factors, item_factors)

  downvote_seen_items(np.array(scores), seen_data, data_description)
  recs = topn_recommendations(np.array(scores), topn=10)
  print(param)
  hr, mrr, cov = model_evaluate(recs, holdout_val, data_description,  topn=10)
  print('HR={:.3}, MRR={:.3}, COV={:.3}'.format(*model_evaluate(recs, holdout_val, data_description,  topn=10)))

  results.append([*param, hr, mrr])

PureSVD training time: 0.056s


100%|██████████| 69311/69311 [01:05<00:00, 1062.65it/s]
100%|██████████| 171/171 [00:08<00:00, 21.26it/s]


(10, 200, 0.01, 3)
HR=0.0819, MRR=0.0292, COV=0.000606
PureSVD training time: 0.091s


100%|██████████| 69311/69311 [00:50<00:00, 1383.76it/s]
100%|██████████| 171/171 [00:09<00:00, 17.64it/s]


(10, 200, 0.01, 5)
HR=0.0643, MRR=0.0189, COV=0.00234
PureSVD training time: 0.301s


100%|██████████| 69311/69311 [00:48<00:00, 1424.42it/s]
100%|██████████| 171/171 [00:11<00:00, 14.51it/s]


(10, 200, 0.01, 7)
HR=0.0702, MRR=0.0281, COV=0.00287
PureSVD training time: 0.060s


100%|██████████| 69311/69311 [00:30<00:00, 2268.89it/s]
100%|██████████| 171/171 [00:07<00:00, 24.30it/s]


(10, 300, 0.01, 3)
HR=0.0526, MRR=0.0194, COV=0.00149
PureSVD training time: 0.037s


100%|██████████| 69311/69311 [00:29<00:00, 2363.33it/s]
100%|██████████| 171/171 [00:10<00:00, 15.95it/s]


(10, 300, 0.01, 5)
HR=0.0526, MRR=0.0137, COV=0.0027
PureSVD training time: 0.041s


100%|██████████| 69311/69311 [00:31<00:00, 2227.40it/s]
100%|██████████| 171/171 [00:11<00:00, 14.59it/s]


(10, 300, 0.01, 7)
HR=0.076, MRR=0.0275, COV=0.00378
PureSVD training time: 0.043s


100%|██████████| 69311/69311 [00:53<00:00, 1289.20it/s]
100%|██████████| 171/171 [00:18<00:00,  9.41it/s]


(10, 400, 0.01, 3)
HR=0.076, MRR=0.0238, COV=0.00173
PureSVD training time: 0.081s


100%|██████████| 69311/69311 [00:36<00:00, 1874.07it/s]
100%|██████████| 171/171 [00:14<00:00, 12.02it/s]


(10, 400, 0.01, 5)
HR=0.0643, MRR=0.0193, COV=0.0029
PureSVD training time: 0.046s


100%|██████████| 69311/69311 [00:30<00:00, 2308.88it/s]
100%|██████████| 171/171 [00:16<00:00, 10.09it/s]


(10, 400, 0.01, 7)
HR=0.0643, MRR=0.021, COV=0.0042
PureSVD training time: 0.116s


100%|██████████| 69311/69311 [00:30<00:00, 2289.20it/s]
100%|██████████| 171/171 [00:05<00:00, 30.73it/s]


(50, 200, 0.01, 3)
HR=0.0585, MRR=0.0187, COV=0.00101
PureSVD training time: 0.117s


100%|██████████| 69311/69311 [00:29<00:00, 2327.24it/s]
100%|██████████| 171/171 [00:07<00:00, 21.68it/s]


(50, 200, 0.01, 5)
HR=0.076, MRR=0.0183, COV=0.00309
PureSVD training time: 0.123s


100%|██████████| 69311/69311 [00:30<00:00, 2289.01it/s]
100%|██████████| 171/171 [00:08<00:00, 19.72it/s]


(50, 200, 0.01, 7)
HR=0.0643, MRR=0.0193, COV=0.00349
PureSVD training time: 0.113s


 58%|█████▊    | 40358/69311 [00:16<00:12, 2374.95it/s]


KeyboardInterrupt: ignored

In [31]:
data_description = dict(
    users = train_val['chat_id'].name,
    items = train_val['meme_id'].name,
    feedback = 'reaction',
    n_users = len(train_val['chat_id']),
    n_items = len(train_val['meme_id']),
    test_users = holdout_test[train_val['chat_id'].name].values
)

testset = data_description['test_users']
seen_data = train_val[train_val.chat_id.isin(testset)]

In [36]:
best_params = {'rank' : 10,
               'n_estimators' : 500,
               'learning_rate' : 0.01,
               'max_depth' : 3}

svd = SVDModel(data_model_1)
svd.rank = best_params['rank']
svd.build()

item_factors = svd.factors['meme_id']
user_factors = svd.factors['chat_id']

X_train, y_train = get_vectorized_data(train_val, user_factors, item_factors)

light = LGBMClassifier(n_estimators=best_params['n_estimators'], 
                       learning_rate=best_params['learning_rate'],
                       max_depth=best_params['max_depth'])
light.fit(X_train, y_train)

scores = boosting_scoring(light, holdout_test, user_factors, item_factors)

downvote_seen_items(np.array(scores), seen_data, data_description)
recs = topn_recommendations(np.array(scores), topn=10)

hr, mrr, cov = model_evaluate(recs, holdout_test, data_description,  topn=10)
print('--------test--------')
print('HR={:.3}, MRR={:.3}, COV={:.3}'.format(*model_evaluate(recs, holdout_test, data_description,  topn=10)))

PureSVD training time: 0.044s


100%|██████████| 69486/69486 [00:31<00:00, 2216.32it/s]
100%|██████████| 169/169 [00:12<00:00, 13.13it/s]

--------test--------
HR=0.0936, MRR=0.0342, COV=0.0017





## TF

In [77]:
from polara.lib.tensor import hooi

In [78]:
def tf_model_build(config, data, data_description):
    userid = data_description["users"]
    itemid = data_description["items"]
    feedback = data_description["feedback"]
    train_users = data_description['train_users']

    idx = data[data.chat_id.isin(train_users)][[userid, itemid, feedback]].values
    idx[:, -1] = idx[:, -1] - data_description['min_rating'] # works only for integer ratings!
    val = np.ones(idx.shape[0], dtype='f8')
    
    n_users = data_description["n_users"]
    n_items = data_description["n_items"]
    n_ratings = data_description["n_ratings"]
    shape = (n_users, n_items, n_ratings)
    
    core_shape = config['mlrank']
    num_iters = config["num_iters"]
    
    u0, u1, u2, g = hooi(
        idx, val, shape, core_shape,
        return_core=False, num_iters=num_iters,
        parallel_ttm=False, growth_tol=0.01,
    )
    return u0, u1, u2


def tf_scoring(params, data, data_description):
    user_factors, item_factors, feedback_factors = params
    item_factors = item_factors[:, 1:]
    userid = data_description["users"]
    itemid = data_description["items"]
    feedback = data_description["feedback"]
    test_users = data_description['test_users']

    data = data[data.userid.isin(test_users)].sort_values(userid)
    useridx = data[userid].values
    itemidx = data[itemid].values
    ratings = data[feedback].values
    ratings = ratings - data_description['min_rating'] # works only for integer ratings!
    
    tensor_outer = tensor_outer_at('cpu')
    # use the fact that test data is sorted by users for reduction:
    scores = tensor_outer(
        1.0,
        item_factors,
        feedback_factors,
        itemidx,
        ratings
    )
    scores = np.add.reduceat(scores, np.r_[0, np.where(np.diff(useridx))[0]+1])
    scores = np.tensordot(
        scores,
        feedback_factors[-2:, :].sum(axis=0),
        axes=(2, 0)
    ).dot(item_factors.T)
    return scores

In [79]:
data_description = dict(
    users = train['chat_id'].name,
    items = train['meme_id'].name,
    feedback = 'reaction',
    n_users = len(train['chat_id']),
    n_items = len(train['meme_id']),
    n_ratings = train.reaction.nunique(),
    test_users = holdout_val[train['chat_id'].name].values,
    train_users = np.unique(train.chat_id),
    min_rating = -1
)

testset = data_description['test_users']
seen_data = train[train.chat_id.isin(testset)]

In [80]:
config = {
    'mlrank': (30, 200, 5),
    "num_iters": 5,
}

In [None]:
tf_params = tf_model_build(config, train, data_description)
tf_scores = tf_scoring(tf_params, holdout_val, data_description)