# RecTools with features - VACANCIES

### import libraries

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel
from rectools.models import LightFMWrapperModel, ImplicitItemKNNWrapperModel, RandomModel, PopularModel

from numpy import genfromtxt



In [2]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP, MCC, MRR, IntraListDiversity, F1Beta, Accuracy, NDCG, IntraListDiversity
from rectools.model_selection import TimeRangeSplitter, cross_validate

In [3]:
from rectools.metrics.distances import PairwiseHammingDistanceCalculator

### load data

In [12]:
vacancy = pd.read_csv(
    "вакансии_и_компании_2024-09-06T14_50_18.537362Z.csv",
    engine="python",  # Because of 2-chars separators
    usecols=['ID Вакансии', 'Название вакансии']
    )
vacancy.columns = ['ID', 'Title']

In [13]:
vacancyViewsCH = pd.read_csv(
    "просмотры_вакансии__clickhouse__2024-09-06T14_47_10.910563Z.csv",
    engine="python",  # Because of 2-chars separators
    header=0,
    names=[Columns.Datetime, Columns.User, Columns.Item]
    )
vacancyViewsCH['weight'] = np.full(vacancyViewsCH.shape[0], 1)
len(vacancyViewsCH)

289

In [14]:
vacancyViews = pd.read_csv(
    "просмотры_вакансии_2024-09-06T14_47_50.817721Z.csv",
    engine="python",  # Because of 2-chars separators
    header=0,
    names=[Columns.Datetime, Columns.User, Columns.Item]
    )
vacancyViews['weight'] = np.full(vacancyViews.shape[0], 1)
len(vacancyViews)

779

In [15]:
vacancyViews = vacancyViews.loc[(vacancyViews["datetime"] < min(vacancyViewsCH['datetime']))]
len(vacancyViews)
#views = pd.concat([vacancyViews, vacancyViewsCH], ignore_index=True)
#views = views.sort_values('datetime').reset_index(drop=True)
#-------
#views = vacancyViewsCH
#views
#-------

236

In [16]:
responses = pd.read_csv(
    "отклики_на_вакансии_2024-09-06T14_48_19.070423Z.csv",
    engine="python",  # Because of 2-chars separators
    #usecols=['CreatedAt', 'VacancyId', 'UserId'],
    header=0,
    names=[Columns.Datetime, Columns.Item, Columns.User, Columns.Weight]
    )
responses['weight'] = np.full(responses.shape[0], 2)
responses = responses[['datetime', 'user_id', 'item_id', 'weight']]
responses

Unnamed: 0,datetime,user_id,item_id,weight
0,"2022-2-7, 14:00",3436,220,2
1,"2022-2-20, 14:02",3577,216,2
2,"2022-3-29, 17:28",3642,220,2
3,"2022-6-18, 07:22",6815,216,2
4,"2022-7-1, 21:21",12792,216,2
5,"2022-5-23, 07:57",9650,216,2
6,"2023-3-23, 22:36",9519,216,2
7,"2023-5-5, 09:47",9707,225,2
8,"2023-5-10, 01:01",18496,225,2
9,"2023-7-17, 11:38",3655,225,2


In [17]:
data = pd.concat([vacancyViews, vacancyViewsCH, responses], axis=0)
data = data.sort_values('datetime').reset_index(drop=True)
data['datetime'] = pd.to_datetime(data['datetime'])
data.set_index('datetime', inplace=True)
data = data.groupby(['user_id', 'item_id', data.index.date]).head(1)
data.reset_index(inplace=True)
data

Unnamed: 0,datetime,user_id,item_id,weight
0,2021-12-15 09:03:00,914,216,1
1,2021-12-22 16:40:00,3359,219,1
2,2021-12-23 06:12:00,2749,219,1
3,2021-12-23 12:29:00,3361,216,1
4,2021-12-23 12:31:00,3361,219,1
...,...,...,...,...
385,2024-08-27 14:11:00,16156,246,2
386,2024-08-28 14:30:00,16156,246,1
387,2024-08-31 18:08:00,16156,246,1
388,2024-09-01 17:48:00,16364,231,1


### train & test

In [20]:
split_index = int(len(data) * 0.6)

train = data.iloc[:split_index]
test = data.iloc[split_index:]
print(f"Всего при разделении Train: {len(train)}, Test: {len(test)}")

train_users = set(train['user_id'])

test = test[test['user_id'].isin(train_users)]

missing_users = set(test['user_id']) - train_users

if missing_users:
    missing_data = test[test['user_id'].isin(missing_users)]
    train = pd.concat([train, missing_data])
    test = test[~test['user_id'].isin(missing_users)]

print(f"При проверке Train: {len(train)}, Test: {len(test)}")

Всего при разделении Train: 234, Test: 156
При проверке Train: 234, Test: 22


### load features

In [26]:
users = pd.read_csv(
    "все_студенты__без_пд__2024-09-06T14_56_40.946035Z.csv",
    engine="python",  # Because of 2-chars separators
    header=0,
    usecols=["ID ИОТ", "Курс", "Специальность", "Уровень образования"]
    )
users = users.rename(columns={"ID ИОТ": "user_id", "Курс": "course", "Специальность": "spec", "Уровень образования": "graduation" })

In [27]:
users = users.loc[users["user_id"].isin(train["user_id"])].copy()
len(users)

153

In [28]:
user_features_frames = []
for feature in ["course", "spec", "graduation"]:
    feature_frame = users.reindex(columns=["user_id", feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.sort_values("id")

Unnamed: 0,id,value,feature
0,1305,2,course
14702,1305,Программная инженерия,spec
0,1305,Аспирантура,graduation
14702,1305,2,course
14702,1305,Магистратура,graduation
...,...,...,...
7769,19139,1,course
7769,19139,Магистратура,graduation
10593,22947,1,course
10593,22947,Аспирантура,graduation


In [29]:
def select_one_value(group):
    return group.iloc[-1]

user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)

user_features_cleaned

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


Unnamed: 0,id,value,feature
0,1305,4,course
1,1305,Бакалавриат,graduation
2,1305,Программная инженерия,spec
3,1306,4,course
4,1306,Бакалавриат,graduation
...,...,...,...
316,19139,Магистратура,graduation
317,19139,Электроэнергетика и электротехника,spec
318,22947,1,course
319,22947,Аспирантура,graduation


### create dataset

In [30]:
sparse_features_dataset = Dataset.construct(
    train,
    user_features_df=user_features_cleaned,
    cat_user_features=["graduation", "spec", "course"],
    make_dense_user_features=False
)

### TESTING ZONE

In [17]:
dataset = Dataset.construct(df_train)

In [18]:
model = ImplicitALSWrapperModel(AlternatingLeastSquares(10, num_threads=1))
model.fit(dataset)
recos = model.recommend(
    users=df_train[Columns.User].unique(),
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

  check_blas_config()


In [19]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,3577,239,0.089959,1
1,3577,238,0.084164,2
2,3577,241,0.084156,3
3,3577,232,0.079766,4
4,3577,230,0.055226,5
...,...,...,...,...
895,11021,241,0.007284,6
896,11021,238,0.007283,7
897,11021,226,0.004657,8
898,11021,220,0.001832,9


### model create

In [31]:
model = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(
        factors=20,             
        regularization=0.1,
        iterations=200,
        num_threads=32
    )
)
model.fit(
    dataset=sparse_features_dataset
)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  check_blas_config()


  0%|          | 0/1 [00:00<?, ?it/s]

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x2baa52e6990>

### get recomendations

In [33]:
recos = model.recommend(
    users=test[Columns.User].unique(),
    dataset=sparse_features_dataset,
    k=10,
    filter_viewed=True
)

In [34]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,11438,229,0.400437,1
1,11438,216,0.240213,2
2,11438,224,0.119248,3
3,11438,219,0.118452,4
4,11438,230,0.103761,5
...,...,...,...,...
95,1305,212,0.061439,6
96,1305,219,0.049373,7
97,1305,229,0.046860,8
98,1305,232,0.043929,9


### metrics

In [38]:
merged = pd.merge(recos, test, on=['user_id', 'item_id'], how='left', indicator=True)
merged

Unnamed: 0,user_id,item_id,score,rank,datetime,weight,_merge
0,11438,229,0.400437,1,NaT,,left_only
1,11438,216,0.240213,2,NaT,,left_only
2,11438,224,0.119248,3,NaT,,left_only
3,11438,219,0.118452,4,NaT,,left_only
4,11438,230,0.103761,5,NaT,,left_only
...,...,...,...,...,...,...,...
95,1305,212,0.061439,6,NaT,,left_only
96,1305,219,0.049373,7,NaT,,left_only
97,1305,229,0.046860,8,NaT,,left_only
98,1305,232,0.043929,9,NaT,,left_only


In [40]:
# MAP
def average_precision(recs, test_items):
    hits = 0
    sum_precisions = 0
    for i, item in enumerate(recs['item_id'], start=1):
        if item in test_items['item_id'].values:
            hits += 1
            precision_at_i = hits / i
            sum_precisions += precision_at_i
    if hits == 0:
        return 0
    return sum_precisions / hits

def serendipity_for_user(user_recs, user_relevant, user_history):
    serendipity_count = 0
    for rec_item in user_recs['item_id']:
        is_relevant = rec_item in user_relevant['item_id'].values
        is_unexpected = rec_item not in user_history['item_id'].values
        if is_relevant and is_unexpected:
            serendipity_count += 1
    return serendipity_count / len(user_recs) if len(user_recs) > 0 else 0

def temporal_diversity(user_recs, item_timestamps):
    rec_timestamps = user_recs['item_id'].map(item_timestamps)
    if rec_timestamps.isnull().all():
        return 0
    return np.std(rec_timestamps.dropna())

def calculate_popularity_bias(recommendations, all_data, k):
    from collections import Counter
    
    all_items = [item for sublist in all_data for item in sublist]
    item_counts = Counter(all_items)
    total_items = len(all_items)
    
    recommended_items = [item for sublist in recommendations for item in sublist[:k]]
    recommended_item_counts = Counter(recommended_items)
    
    recommended_item_popularity = [
        item_counts[item] / total_items if item in item_counts else 0
        for item in recommended_items
    ]
    
    avg_popularity = sum(recommended_item_popularity) / len(recommended_item_popularity)
    
    return avg_popularity

def calculate_metrics(recommendations, all_data, test_data, beta=1, k=10):
    merged = pd.merge(recommendations, test_data, on=['user_id', 'item_id'], how='left', indicator=True)

    TP = merged['_merge'].value_counts().get('both', 0)
    FP = merged['_merge'].value_counts().get('left_only', 0)
    FN = test_data.shape[0] - TP
    
    # precision
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / (TP + FP)

    print("precision:", precision)
    
    # recall
    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / (TP + FN)

    print("recall:", recall)
    
    # F1-score
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
    
    # F1-beta
    if precision + recall == 0:
        f1_beta = 0
    else:
        f1_beta = (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
    
    print("f1_beta:", f1_beta)

    users = recommendations['user_id'].unique()
    ap_sum = 0
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user]
        user_test = test_data[test_data['user_id'] == user]
        ap_sum += average_precision(user_recs, user_test)
    
    map_score = ap_sum / len(users)
    print("MAP:", map_score)

    # DCG@k
    def dcg_at_k(recs, test_items, k):
        dcg = 0.0
        for i, item in enumerate(recs['item_id'][:k], start=1):
            if item in test_items['item_id'].values:
                dcg += 1 / np.log2(i + 1) 
        return dcg
    
    # IDCG@k
    def idcg_at_k(test_items, k):
        ideal_rel_count = min(k, len(test_items)) 
        idcg = sum([1 / np.log2(i + 1) for i in range(1, ideal_rel_count + 1)])
        return idcg

    # NDCG@k
    ndcg_sum = 0
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user]
        user_test = test_data[test_data['user_id'] == user]
        dcg = dcg_at_k(user_recs, user_test, k)
        idcg = idcg_at_k(user_test, k)
        if idcg > 0:
            ndcg = dcg / idcg
        else:
            ndcg = 0
        ndcg_sum += ndcg

    ndcg_at_k = ndcg_sum / len(users)
    print(f"NDCG@{k}:", ndcg_at_k)

    # MRR
    def reciprocal_rank(recs, test_items):
        for i, item in enumerate(recs['item_id'], start=1):
            if item in test_items['item_id'].values:
                return 1 / i
        return 0

    mrr_sum = 0
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user]
        user_test = test_data[test_data['user_id'] == user]
        mrr_sum += reciprocal_rank(user_recs, user_test)

    mrr_score = mrr_sum / len(users)
    print("MRR:", mrr_score)

    # Novelty
    item_popularity = all_data['item_id'].value_counts()
    max_popularity = item_popularity.max()

    def novelty(item, max_popularity, item_popularity):
        popularity = item_popularity.get(item, 0)
        return 1 - (popularity / max_popularity)

    novelty_sum = 0
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user]
        novelty_user_sum = user_recs['item_id'].apply(lambda item: novelty(item, max_popularity, item_popularity)).sum()
        novelty_sum += novelty_user_sum / len(user_recs)

    novelty_score = novelty_sum / len(users)
    print("Novelty:", novelty_score)

    serendipity_sum = 0
    temporal_diversity_sum = 0
    users = recommendations['user_id'].unique()
    
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user].head(k)
        user_relevant = test_data[test_data['user_id'] == user]
        user_history = all_data[(all_data['user_id'] == user) & (all_data['item_id'].isin(test_data['item_id']) == False)]
        serendipity_sum += serendipity_for_user(user_recs, user_relevant, user_history)
        
    
    serendipity_score = serendipity_sum / len(users) if len(users) > 0 else 0
    print("Serendipity:", serendipity_score)

    # item coverage
    all_items = all_data['item_id'].unique()
    recommended_items = recommendations['item_id'].unique()
    item_coverage = len(recommended_items) / len(all_items) if len(all_items) > 0 else 0
    print("Item Coverage:", item_coverage)

    # User coverage
    # total_users = all_data['user_id'].unique()
    # recommended_users = recommendations['user_id'].unique()
    # user_coverage = len(recommended_users) / len(total_users) if len(total_users) > 0 else 0
    # print("User Coverage:", user_coverage)
    
    popularity_bias = calculate_popularity_bias(recommendations, all_data, k)
    print("popularity_bias:", popularity_bias)
    
    pass

In [41]:
calculate_metrics(recos, data, test)

precision: 0.05
recall: 0.22727272727272727
f1_beta: 0.08196721311475409
MAP: 0.12178571428571427
NDCG@10: 0.133253133932843
MRR: 0.15178571428571427
Novelty: 0.7372413793103447
Serendipity: 0.05
Item Coverage: 0.75
popularity_bias: 0.08229813664596275


In [35]:
precision = Precision(k=10)
recall = Recall(k=10)
f1 = F1Beta(k=10)

In [36]:
precision_value = precision.calc(reco=recos, interactions=test)
print(f"precision: {precision_value}")

recall_value = recall.calc(reco=recos, interactions=test)
print(f"recall: {recall_value}")


precision: 0.05
recall: 0.2833333333333333


In [102]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,3577,229,0.537619,1
1,3577,234,0.422106,2
2,3577,224,0.293691,3
3,3577,225,0.258978,4
4,3577,230,0.256801,5
5,3577,232,0.172771,6
6,3577,233,0.121096,7
7,3577,239,0.110655,8
8,3577,226,0.10573,9
9,3577,241,0.102491,10
