# RecTools with features - PROJECTS

### import libraries

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel
from rectools.models import LightFMWrapperModel, ImplicitItemKNNWrapperModel, RandomModel, PopularModel

from numpy import genfromtxt



In [2]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP, MCC, MRR, IntraListDiversity, F1Beta, Accuracy, NDCG, IntraListDiversity
from rectools.model_selection import TimeRangeSplitter, cross_validate

In [3]:
from rectools.metrics.distances import PairwiseHammingDistanceCalculator

### load data

In [4]:
projects = pd.read_csv(
    "проекты_и_компании_2024-09-13T14_31_07.415586Z.csv",
    engine="c",
    usecols=['ID Проекта', 'Название']
    )
projects.columns = ['ID', 'Title']

In [5]:
oldViews = pd.read_csv(
    "просмотры_проектов_2024-09-20T10_22_58.285588Z.csv",
    engine="python",  # Because of 2-chars separators
    header=0,
    names=[Columns.Datetime, Columns.User, Columns.Item]
    )
oldViews['weight'] = np.full(oldViews.shape[0], 1)
len(oldViews)

5680

In [6]:
views = pd.read_csv(
    "просмотры_проектов__clickhouse__2024-09-13T14_24_44.471163Z.csv",
    engine="python",  # Because of 2-chars separators
    header=0,
    names=[Columns.Datetime, Columns.User, Columns.Item]
    )
views['weight'] = np.full(views.shape[0], 1)
len(views)

11963

In [7]:
responses = pd.read_csv(
    "участие_в_проектах_2024-09-13T14_49_23.189163Z.csv",
    engine="python",
    header=0,
    usecols=['Дата', 'ID Проекта', 'ID Пользователя'],
    )
responses['weight'] = np.full(responses.shape[0], 2)
responses = responses.rename(columns={'Дата': Columns.Datetime, 'ID Проекта': Columns.Item, 'ID Пользователя': Columns.User, 'weight': Columns.Weight})
responses = responses[['datetime', 'user_id', 'item_id', 'weight']]

In [8]:
oldViews = oldViews.loc[(oldViews["datetime"] < min(views['datetime']))]
len(oldViews)

311

In [9]:
data = pd.concat([oldViews, views, responses], axis=0)
data = data.sort_values('datetime').reset_index(drop=True)
data['datetime'] = pd.to_datetime(data['datetime'])
data.set_index('datetime', inplace=True)
data = data.groupby(['user_id', 'item_id', data.index.date]).head(1)
data.reset_index(inplace=True)
data

Unnamed: 0,datetime,user_id,item_id,weight
0,2022-01-12 06:15:00,802,1,1
1,2022-01-19 09:44:00,1350,1,1
2,2022-01-21 17:00:00,914,1,1
3,2022-01-25 17:35:00,914,1,1
4,2022-01-26 19:17:00,1350,1,1
...,...,...,...,...
3712,2024-09-05 15:11:00,16447,131,1
3713,2024-09-05 17:18:00,23917,131,1
3714,2024-09-05 17:18:00,23917,129,1
3715,2024-09-06 11:45:00,10948,129,1


### train & test

In [10]:
split_index = int(len(data) * 0.9)

train = data.iloc[:split_index]
test = data.iloc[split_index:]
print(f"Всего при разделении Train: {len(train)}, Test: {len(test)}")

train_users = set(train['user_id'])

test = test[test['user_id'].isin(train_users)]

missing_users = set(test['user_id']) - train_users

if missing_users:
    missing_data = test[test['user_id'].isin(missing_users)]
    train = pd.concat([train, missing_data])
    test = test[~test['user_id'].isin(missing_users)]

print(f"При проверке Train: {len(train)}, Test: {len(test)}")

Всего при разделении Train: 3345, Test: 372
При проверке Train: 3345, Test: 244


### load features

In [27]:
users = pd.read_csv(
    "все_студенты__без_пд__2024-09-06T14_56_40.946035Z.csv",
    engine="python",  # Because of 2-chars separators
    header=0,
    usecols=["ID ИОТ", "Курс", "Специальность", "Уровень образования"]
    )
users = users.rename(columns={"ID ИОТ": "user_id", "Курс": "course", "Специальность": "spec", "Уровень образования": "graduation" })

In [28]:
users = users.loc[users["user_id"].isin(train["user_id"])].copy()
len(users)

374

In [29]:
user_features_frames = []
for feature in ["course", "spec", "graduation"]:
    feature_frame = users.reindex(columns=["user_id", feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.sort_values("id")

Unnamed: 0,id,value,feature
0,1305,2,course
14702,1305,2,course
17992,1305,Бакалавриат,graduation
17992,1305,4,course
0,1305,Аспирантура,graduation
...,...,...,...
10486,22815,1,course
10486,22815,Информатика и вычислительная техника,spec
10555,22884,Магистратура,graduation
10555,22884,1,course


In [30]:
def select_one_value(group):
    return group.iloc[-1]

user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)

user_features_cleaned

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


Unnamed: 0,id,value,feature
0,1305,4,course
1,1305,Бакалавриат,graduation
2,1305,Программная инженерия,spec
3,1307,4,course
4,1307,Бакалавриат,graduation
...,...,...,...
940,22815,Магистратура,graduation
941,22815,Информатика и вычислительная техника,spec
942,22884,1,course
943,22884,Магистратура,graduation


In [15]:
item_features = pd.read_csv(
    "Cosine_Similarity_new.csv",
    engine="python",  # Because of 2-chars separators
    )
print(item_features.shape)

(134, 134)


In [16]:
my_data = genfromtxt("Cosine_Similarity_new.csv", delimiter=',')

In [17]:
projectsM = pd.read_csv(
    "проекты_и_компании_2024-09-13T14_31_07.415586Z.csv",
    engine="c",
    usecols=['ID Проекта']
    )
projectsM.columns = ['ID']

my_data_f = my_data[1:]
projectsM['CosSimularity'] = ''

i = 0
for md in my_data_f:
    projectsM['CosSimularity'][i] = my_data_f[i]
    i+=1

cossimularity_feature = projectsM[["ID", "CosSimularity"]].explode("CosSimularity")
cossimularity_feature.columns = ["id", "value"]

cossimularity_feature['feature'] = ''

for i in range(len(my_data[0])):
    cossimularity_feature.at[i, "feature"] = my_data[0]

md_list = my_data[0].tolist()


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  projectsM['CosSimularity'][i] = my_data_f[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  projectsM['CosSimu

In [18]:
cossimularity_feature_train = cossimularity_feature.loc[cossimularity_feature["id"].isin(train["item_id"])].copy()
cossimularity_feature_train.head()

Unnamed: 0,id,value,feature
0,1,1.0,0.0
0,1,0.36081,1.0
0,1,0.372803,2.0
0,1,0.887113,3.0
0,1,0.857938,4.0


### create dataset

In [19]:
sparse_features_dataset = Dataset.construct(
    train,
    user_features_df=user_features_cleaned,
    cat_user_features=["graduation", "spec", "course"],
    make_dense_user_features=False
)

dataset_full_features = Dataset.construct(
    train,
    user_features_df=user_features,
    cat_user_features=["graduation", "spec", "course"],
    make_dense_user_features=False,
    item_features_df=cossimularity_feature_train,
    cat_item_features=md_list
)

### model create

In [20]:
model = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(
        factors=20,             
        regularization=0.1,
        iterations=200,
        num_threads=32
    )
)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  check_blas_config()


In [21]:
model.fit(
    dataset=dataset_full_features
)



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x2e4484f6550>

### get recomendations

In [22]:
recos = model.recommend(
    users=test[Columns.User].unique(),
    dataset=dataset_full_features,
    k=10,
    filter_viewed=True
)

In [23]:
closedProjects = pd.read_csv(
    "время_жизни_проектов_2024-09-15T14_24_25.319296Z.csv"
)
closedProjects['createdAt'] = pd.to_datetime(closedProjects['createdAt'], format='%Y-%m-%d, %H:%M')
closedProjects = closedProjects.loc[closedProjects.groupby('id')['createdAt'].idxmax()]
closedProjects = closedProjects.loc[(closedProjects["status"] == 'active')]
clearRecos = recos.loc[recos['item_id'].isin(closedProjects['id'])]
clearRecos

Unnamed: 0,user_id,item_id,score,rank
0,16599,113,0.801880,1
1,16599,110,0.677278,2
2,16599,96,0.489506,3
3,16599,126,0.476577,4
4,16599,86,0.466310,5
...,...,...,...,...
291,10726,97,0.808479,2
292,10726,85,0.791667,3
293,10726,87,0.753628,4
295,10726,82,0.595871,6


In [24]:
precision = Precision(k=10)
recall = Recall(k=10)
f1 = F1Beta(k=10)

precision_value = precision.calc(reco=recos, interactions=test)
print(f"precision: {precision_value}")

recall_value = recall.calc(reco=recos, interactions=test)
print(f"recall: {recall_value}")

precision: 0.07666666666666667
recall: 0.07194885361552027


### My Metrics

In [25]:
merged = pd.merge(recos, test, on=['user_id', 'item_id'], how='left', indicator=True)
merged

Unnamed: 0,user_id,item_id,score,rank,datetime,weight,_merge
0,16599,113,0.801880,1,NaT,,left_only
1,16599,110,0.677278,2,NaT,,left_only
2,16599,96,0.489506,3,NaT,,left_only
3,16599,126,0.476577,4,NaT,,left_only
4,16599,86,0.466310,5,NaT,,left_only
...,...,...,...,...,...,...,...
299,10726,82,0.595871,6,NaT,,left_only
300,10726,84,0.591661,7,NaT,,left_only
301,10726,104,0.420670,8,NaT,,left_only
302,10726,8,0.351159,9,NaT,,left_only


In [26]:
# MAP
def average_precision(recs, test_items):
    hits = 0
    sum_precisions = 0
    for i, item in enumerate(recs['item_id'], start=1):
        if item in test_items['item_id'].values:
            hits += 1
            precision_at_i = hits / i
            sum_precisions += precision_at_i
    if hits == 0:
        return 0
    return sum_precisions / hits

In [27]:
def serendipity_for_user(user_recs, user_relevant, user_history):
    serendipity_count = 0
    for rec_item in user_recs['item_id']:
        is_relevant = rec_item in user_relevant['item_id'].values
        is_unexpected = rec_item not in user_history['item_id'].values
        if is_relevant and is_unexpected:
            serendipity_count += 1
    return serendipity_count / len(user_recs) if len(user_recs) > 0 else 0

In [28]:
def calculate_popularity_bias(recommendations, all_data, k):
    from collections import Counter
    
    all_items = [item for sublist in all_data for item in sublist]
    item_counts = Counter(all_items)
    total_items = len(all_items)
    
    recommended_items = [item for sublist in recommendations for item in sublist[:k]]
    recommended_item_counts = Counter(recommended_items)
    
    recommended_item_popularity = [
        item_counts[item] / total_items if item in item_counts else 0
        for item in recommended_items
    ]
    
    avg_popularity = sum(recommended_item_popularity) / len(recommended_item_popularity)
    
    return avg_popularity


In [29]:
def calculate_metrics(recommendations, all_data, test_data, beta=1, k=10):
    merged = pd.merge(recommendations, test_data, on=['user_id', 'item_id'], how='left', indicator=True)

    TP = merged['_merge'].value_counts().get('both', 0)
    FP = merged['_merge'].value_counts().get('left_only', 0)
    FN = test_data.shape[0] - TP
    
    # precision
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / (TP + FP)

    print("precision:", precision)
    
    # recall
    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / (TP + FN)

    print("recall:", recall)
    
    # F1-score
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
    
    # F1-beta
    if precision + recall == 0:
        f1_beta = 0
    else:
        f1_beta = (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
    
    print("f1_beta:", f1_beta)

    users = recommendations['user_id'].unique()
    ap_sum = 0
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user]
        user_test = test_data[test_data['user_id'] == user]
        ap_sum += average_precision(user_recs, user_test)
    
    map_score = ap_sum / len(users)
    print("MAP:", map_score)

    # DCG@k
    def dcg_at_k(recs, test_items, k):
        dcg = 0.0
        for i, item in enumerate(recs['item_id'][:k], start=1):
            if item in test_items['item_id'].values:
                dcg += 1 / np.log2(i + 1) 
        return dcg
    
    # IDCG@k
    def idcg_at_k(test_items, k):
        ideal_rel_count = min(k, len(test_items)) 
        idcg = sum([1 / np.log2(i + 1) for i in range(1, ideal_rel_count + 1)])
        return idcg

    # NDCG@k
    ndcg_sum = 0
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user]
        user_test = test_data[test_data['user_id'] == user]
        dcg = dcg_at_k(user_recs, user_test, k)
        idcg = idcg_at_k(user_test, k)
        if idcg > 0:
            ndcg = dcg / idcg
        else:
            ndcg = 0
        ndcg_sum += ndcg

    ndcg_at_k = ndcg_sum / len(users)
    print(f"NDCG@{k}:", ndcg_at_k)

    # MRR
    def reciprocal_rank(recs, test_items):
        for i, item in enumerate(recs['item_id'], start=1):
            if item in test_items['item_id'].values:
                return 1 / i
        return 0

    mrr_sum = 0
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user]
        user_test = test_data[test_data['user_id'] == user]
        mrr_sum += reciprocal_rank(user_recs, user_test)

    mrr_score = mrr_sum / len(users)
    print("MRR:", mrr_score)

    # Novelty
    item_popularity = all_data['item_id'].value_counts()
    max_popularity = item_popularity.max()

    def novelty(item, max_popularity, item_popularity):
        popularity = item_popularity.get(item, 0)
        return 1 - (popularity / max_popularity)

    novelty_sum = 0
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user]
        novelty_user_sum = user_recs['item_id'].apply(lambda item: novelty(item, max_popularity, item_popularity)).sum()
        novelty_sum += novelty_user_sum / len(user_recs)

    novelty_score = novelty_sum / len(users)
    print("Novelty:", novelty_score)

    serendipity_sum = 0
    temporal_diversity_sum = 0
    users = recommendations['user_id'].unique()
    
    for user in users:
        user_recs = recommendations[recommendations['user_id'] == user].head(k)
        user_relevant = test_data[test_data['user_id'] == user]
        user_history = all_data[(all_data['user_id'] == user) & (all_data['item_id'].isin(test_data['item_id']) == False)]
        serendipity_sum += serendipity_for_user(user_recs, user_relevant, user_history)
        
    
    serendipity_score = serendipity_sum / len(users) if len(users) > 0 else 0
    print("Serendipity:", serendipity_score)

    # item coverage
    all_items = all_data['item_id'].unique()
    recommended_items = recommendations['item_id'].unique()
    item_coverage = len(recommended_items) / len(all_items) if len(all_items) > 0 else 0
    print("Item Coverage:", item_coverage)

    # User coverage
    # total_users = all_data['user_id'].unique()
    # recommended_users = recommendations['user_id'].unique()
    # user_coverage = len(recommended_users) / len(total_users) if len(total_users) > 0 else 0
    # print("User Coverage:", user_coverage)
    
    popularity_bias = calculate_popularity_bias(recommendations, all_data, k)
    print("popularity_bias:", popularity_bias)
    
    pass

In [30]:
calculate_metrics(recos, data, test)

precision: 0.0756578947368421
recall: 0.0942622950819672
f1_beta: 0.08394160583941607
MAP: 0.10507495590828925
NDCG@10: 0.06905027057623207
MRR: 0.11865079365079365
Novelty: 0.6634561403508772
Serendipity: 0.06333333333333334
Item Coverage: 0.6036036036036037
popularity_bias: 0.08229813664596275


### RecTools Metrics

In [81]:
n_splits = 3

splitter = TimeRangeSplitter(
    test_size="45D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [82]:
splitter.get_test_fold_borders(dataset_full_features.interactions) #dataset_full_features

[(Timestamp('2024-02-06 00:00:00'), Timestamp('2024-03-22 00:00:00')),
 (Timestamp('2024-03-22 00:00:00'), Timestamp('2024-05-06 00:00:00')),
 (Timestamp('2024-05-06 00:00:00'), Timestamp('2024-06-20 00:00:00'))]

In [83]:
# Take few simple models to compare
models = {
    "random": RandomModel(random_state=42),
    "popular": PopularModel(),
    "most_raited": PopularModel(popularity="sum_weight"),
    "tfidf_k=5": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=5)),
    "tfidf_k=10": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=10)),
    "bm25_k=5_k1=0.05_b=0.1": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=5, K1=0.05, B=0.1)),
    "Implicit ALS": ImplicitALSWrapperModel(AlternatingLeastSquares(10, num_threads=32))
}

In [84]:
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=10),
    'Serendipity@1': Serendipity(k=1),
    'Serendipity@5': Serendipity(k=5),
    'Serendipity@10': Serendipity(k=10),
    'MIUF@1': MeanInvUserFreq(k=1),
    'MIUF@5': MeanInvUserFreq(k=5),
    'MIUF@10': MeanInvUserFreq(k=10),
    'MAP@1': MAP(k=1),
    'MAP@5': MAP(k=5),
    'MAP@10': MAP(k=10),
    "recall": Recall(k=10),
    "MCC": MCC(k=10),
    "MRR": MRR(k=10),
    "F1Beta": F1Beta(k=10)
}

K_RECS = 10

In [85]:
%%time

# For each fold generate train and test part of dataset
# Then fit every model, generate recommendations and calculate metrics

cv_results = cross_validate(
    dataset=dataset_full_features, #sparse_features_dataset
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: total: 9min 29s
Wall time: 1min 9s


In [86]:
pivot_results = (
    pd.DataFrame(cv_results["metrics"])
    .drop(columns="i_split")
    .groupby(["model"], sort=False)
    .agg(["mean", "std"])
)
mean_metric_subset = [(metric, "mean") for metric in pivot_results.columns.levels[0]]
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,prec@1,prec@1,prec@10,prec@10,recall,recall,MCC,MCC,F1Beta,F1Beta,MRR,MRR,MAP@1,MAP@1,MAP@5,MAP@5,MAP@10,MAP@10,MIUF@1,MIUF@1,MIUF@5,MIUF@5,MIUF@10,MIUF@10,Serendipity@1,Serendipity@1,Serendipity@5,Serendipity@5,Serendipity@10,Serendipity@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
random,0.296537,0.453167,0.070617,0.081389,0.093625,0.027742,-0.002932,0.055437,0.061613,0.05882,0.130964,0.112687,0.022068,0.036111,0.036399,0.030565,0.0409,0.035293,6.036442,0.720489,5.620882,0.567543,5.646531,0.402837,0.01678,0.016168,0.012822,0.008985,0.009942,0.007762
popular,0.162879,0.184754,0.109686,0.108213,0.264547,0.186421,0.075925,0.117843,0.112023,0.075052,0.209298,0.165563,0.049676,0.06633,0.081921,0.07304,0.105563,0.094998,2.058405,0.099824,2.557499,0.198343,2.845187,0.396627,0.001426,0.001468,0.002925,0.002332,0.002566,0.002038
most_raited,0.060606,0.104973,0.117641,0.130029,0.246592,0.148471,0.074475,0.11351,0.110898,0.07872,0.145954,0.130342,0.002635,0.004564,0.050845,0.045806,0.075629,0.056136,2.271741,0.2784,2.583101,0.199998,2.887215,0.420142,0.000326,0.000564,0.000766,0.001102,0.00122,0.001099
tfidf_k=5,0.095238,0.164957,0.12895,0.038516,0.377574,0.206036,0.124255,0.076044,0.143122,0.036629,0.207721,0.042786,0.005285,0.009154,0.084028,0.03571,0.120113,0.046835,4.624132,1.609672,4.710961,1.425555,4.631237,1.522039,0.012983,0.022487,0.028308,0.024622,0.029904,0.02301
tfidf_k=10,0.197511,0.081498,0.129383,0.038866,0.323481,0.166742,0.108301,0.05829,0.133011,0.013428,0.224486,0.057635,0.049587,0.06543,0.101849,0.078748,0.137319,0.087124,4.496633,1.624961,4.612741,1.578665,4.650831,1.474509,0.013757,0.021817,0.020307,0.015932,0.019059,0.013498
bm25_k=5_k1=0.05_b=0.1,0.459957,0.344161,0.101245,0.011943,0.259986,0.190121,0.072411,0.084278,0.111122,0.031948,0.333378,0.10091,0.069722,0.052353,0.128632,0.105108,0.144251,0.10969,3.590691,1.534996,4.008786,1.614586,4.079742,1.615409,0.046824,0.039146,0.022412,0.020398,0.023399,0.018833
Implicit ALS,0.143939,0.129233,0.209091,0.113909,0.394748,0.172369,0.187108,0.079666,0.191713,0.058682,0.270107,0.1413,0.047313,0.06781,0.104473,0.060773,0.154436,0.081389,3.072142,0.113229,3.568491,0.550056,3.748742,0.69772,0.02509,0.034565,0.02555,0.007935,0.019361,0.008401


In [87]:
pd.DataFrame(cv_results["splits"])

Unnamed: 0,i_split,start,end,train,train_users,train_items,test,test_users,test_items
0,0,2024-02-06,2024-03-22,487,166,79,101,14,22
1,1,2024-03-22,2024-05-06,2645,289,93,93,11,30
2,2,2024-05-06,2024-06-20,2860,296,93,14,8,14


### ❗❗❗ TESTING ZONE ❗❗❗

In [50]:
def calculate_precision(recos, df_test):
    test_data_grouped = df_test.groupby('user_id')['item_id'].apply(set).to_dict()
    
    precisions = []
    
    for user_id, group in recos.groupby('user_id'):
        recommended_items = set(group['item_id'].head(15))
        true_items = test_data_grouped.get(user_id, set())
        correct_recommendations = len(recommended_items & true_items)
        precision = correct_recommendations / len(recommended_items) if recommended_items else 0
        precisions.append(precision)
    
    return sum(precisions) / len(precisions) if precisions else 0

In [51]:
calculate_precision(clearRecos, test)

0.07345238095238096

In [48]:
n_splits = 3

splitter = TimeRangeSplitter(
    test_size="70D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

splits = splitter.get_test_fold_borders(sparse_features_dataset.interactions)
splits

[(Timestamp('2023-12-16 00:00:00'), Timestamp('2024-02-24 00:00:00')),
 (Timestamp('2024-02-24 00:00:00'), Timestamp('2024-05-04 00:00:00')),
 (Timestamp('2024-05-04 00:00:00'), Timestamp('2024-07-13 00:00:00'))]

In [None]:
part_of_data = data[(pd.to_datetime(data['datetime']) >= splts[0][0]) & (pd.to_datetime(data['datetime']) <= sp[1])]
split_index = int(len(part_of_data) * 0.7)

train = part_of_data.iloc[:split_index]
test = part_of_data.iloc[split_index:]
print(f"Всего при разделении Train: {len(train)}, Test: {len(test)}")

In [26]:
def select_one_value(group):
    return group.iloc[-1]

def time_window_evaluation(data, splts, users):
    for sp in splts:
        # train & test split
        part_of_data = data[(pd.to_datetime(data['datetime']) >= splts[0][0]) & (pd.to_datetime(data['datetime']) <= sp[1])]
        split_index = int(len(part_of_data) * 0.7)
    
        train = part_of_data.iloc[:split_index]
        test = part_of_data.iloc[split_index:]
        print(f"Всего при разделении Train: {len(train)}, Test: {len(test)}")
        
        train_users = set(train['user_id'])
        test = test[test['user_id'].isin(train_users)]
        missing_users = set(test['user_id']) - train_users
        
        if missing_users:
            missing_data = test[test['user_id'].isin(missing_users)]
            train = pd.concat([train, missing_data])
            test = test[~test['user_id'].isin(missing_users)]
        
        print(f"При проверке Train: {len(train)}, Test: {len(test)}")

        # get user_features
        users = users.loc[users["user_id"].isin(train["user_id"])].copy()
        user_features_frames = []
        for feature in ["course", "spec", "graduation"]:
            feature_frame = users.reindex(columns=["user_id", feature])
            feature_frame.columns = ["id", "value"]
            feature_frame["feature"] = feature
            user_features_frames.append(feature_frame)
        user_features = pd.concat(user_features_frames)
        user_features.sort_values("id")
        
        user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)
        
        # create dataset
        sparse_features_dataset = Dataset.construct(
            train,
            user_features_df=user_features_cleaned,
            cat_user_features=["graduation", "spec", "course"],
            make_dense_user_features=False
        )

        # model biulding
        model = ImplicitALSWrapperModel(
            model=AlternatingLeastSquares(
                factors=20,             
                regularization=0.1,
                iterations=200,
                num_threads=32
            )
        )
        model.fit(
            dataset=sparse_features_dataset
        )

        # get recos
        recos = model.recommend(
            users=test[Columns.User].unique(),
            dataset=sparse_features_dataset,
            k=10,
            filter_viewed=True
        )

        # metric
        precision = Precision(k=10)
        recall = Recall(k=10)
        f1 = F1Beta(k=10)
        
        precision_value = precision.calc(reco=recos, interactions=test)
        print(f"precision: {precision_value}")
        
        recall_value = recall.calc(reco=recos, interactions=test)
        print(f"recall: {recall_value}")


def number_window_evaluation(data, users):
    koef = 0.5
    data_index = round(len(data) * koef) 
    data_index
    
    while(koef < 1):
        data_index = round(len(data) * koef)
        train = data.iloc[:data_index]
        test = data.iloc[data_index:(round(len(data) * (koef + 0.05)))]
        koef+=0.05
        koef = round(koef, 2)
        print(len(train), len(test), koef)
        
        train_users = set(train['user_id'])
        test = test[test['user_id'].isin(train_users)]
        missing_users = set(test['user_id']) - train_users
        
        if missing_users:
            missing_data = test[test['user_id'].isin(missing_users)]
            train = pd.concat([train, missing_data])
            test = test[~test['user_id'].isin(missing_users)]
        
        print(f"При проверке Train: {len(train)}, Test: {len(test)}", '\n')

        # get user_features
        users = users.loc[users["user_id"].isin(train["user_id"])].copy()
        user_features_frames = []
        for feature in ["course", "spec", "graduation"]:
            feature_frame = users.reindex(columns=["user_id", feature])
            feature_frame.columns = ["id", "value"]
            feature_frame["feature"] = feature
            user_features_frames.append(feature_frame)
        user_features = pd.concat(user_features_frames)
        user_features.sort_values("id")
        
        user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)
        
        # create dataset
        sparse_features_dataset = Dataset.construct(
            train,
            user_features_df=user_features_cleaned,
            cat_user_features=["graduation", "spec", "course"],
            make_dense_user_features=False
        )

        # model biulding
        model = ImplicitALSWrapperModel(
            model=AlternatingLeastSquares(
                factors=20,             
                regularization=0.1,
                iterations=200,
                num_threads=32
            )
        )
        model.fit(
            dataset=sparse_features_dataset
        )

        # get recos
        recos = model.recommend(
            users=test[Columns.User].unique(),
            dataset=sparse_features_dataset,
            k=10,
            filter_viewed=True
        )

        # metric
        precision = Precision(k=10)
        recall = Recall(k=10)
        f1 = F1Beta(k=10)
        
        precision_value = precision.calc(reco=recos, interactions=test)
        print(f"precision: {precision_value}")
        
        recall_value = recall.calc(reco=recos, interactions=test)
        print(f"recall: {recall_value}")

In [50]:
time_window_evaluation(data, splits, users)

Всего при разделении Train: 1189, Test: 510
При проверке Train: 1189, Test: 473


  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.15384615384615383
recall: 0.15961565802547925
Всего при разделении Train: 1703, Test: 730
При проверке Train: 1703, Test: 423




  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.1
recall: 0.08773454063284768
Всего при разделении Train: 2195, Test: 942
При проверке Train: 2195, Test: 196




  0%|          | 0/1 [00:00<?, ?it/s]

precision: 0.05500000000000001
recall: 0.09648504273504273


In [31]:
number_window_evaluation(data, users)

1858 186 0.55
При проверке Train: 1858, Test: 151 



  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  check_blas_config()


  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.0888888888888889
recall: 0.09212001989779768
2044 186 0.6
При проверке Train: 2044, Test: 174 





  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.02564102564102564
recall: 0.02650215281794229
2230 186 0.65
При проверке Train: 2230, Test: 170 





  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.09736842105263159
recall: 0.1484751651856915
2416 186 0.7
При проверке Train: 2416, Test: 111 





  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.11904761904761904
recall: 0.12767676767676767
2602 186 0.75
При проверке Train: 2602, Test: 169 





  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.051515151515151514
recall: 0.14742063492063492
2788 186 0.8
При проверке Train: 2788, Test: 59 





  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.047058823529411764
recall: 0.10512820512820513
2974 185 0.85
При проверке Train: 2974, Test: 14 





  0%|          | 0/1 [00:00<?, ?it/s]

precision: 0.0
recall: 0.0
3159 186 0.9
При проверке Train: 3159, Test: 119 



  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.095
recall: 0.10015151515151514
3345 186 0.95
При проверке Train: 3345, Test: 124 





  0%|          | 0/1 [00:00<?, ?it/s]

  user_features_cleaned = user_features.groupby(['id', 'feature'], as_index=False).apply(select_one_value).reset_index(drop=True)


precision: 0.036000000000000004
recall: 0.05252910052910053
3531 186 1.0
При проверке Train: 3531, Test: 126 





  0%|          | 0/1 [00:00<?, ?it/s]

precision: 0.08421052631578947
recall: 0.12601679637690716


In [25]:
koef = 0.5
data_index = round(len(data) * koef) 
data_index

while(koef < 1):
    data_index = round(len(data) * koef)
    train = data.iloc[:data_index]
    test = data.iloc[data_index:(round(len(data) * (koef + 0.05)))]
    koef+=0.05
    koef = round(koef, 2)
    print(len(train), len(test), koef)
    
    train_users = set(train['user_id'])
    test = test[test['user_id'].isin(train_users)]
    missing_users = set(test['user_id']) - train_users
    
    if missing_users:
        missing_data = test[test['user_id'].isin(missing_users)]
        train = pd.concat([train, missing_data])
        test = test[~test['user_id'].isin(missing_users)]
    
    print(f"При проверке Train: {len(train)}, Test: {len(test)}", '\n')

1858 186 0.55
При проверке Train: 1858, Test: 151 

2044 186 0.6
При проверке Train: 2044, Test: 174 

2230 186 0.65
При проверке Train: 2230, Test: 170 

2416 186 0.7
При проверке Train: 2416, Test: 111 

2602 186 0.75
При проверке Train: 2602, Test: 169 

2788 186 0.8
При проверке Train: 2788, Test: 59 

2974 185 0.85
При проверке Train: 2974, Test: 14 

3159 186 0.9
При проверке Train: 3159, Test: 119 

3345 186 0.95
При проверке Train: 3345, Test: 124 

3531 186 1.0
При проверке Train: 3531, Test: 126 



In [None]:
data_train
#while 
print(len(data))
print(round((len(data) / 100)*99))

In [26]:
from pandasgui import show

show(data)

PandasGUI INFO — pandasgui.gui — Opening PandasGUI
  show(data)
  show(data)

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



<pandasgui.gui.PandasGui at 0x1c86123a320>