In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

# Parallel accumulators: position k of both lists forms one (user, item) pair.
user_id = []
item_id = []

In [4]:
# Start from empty accumulators inside this cell so that re-running it does not
# append a second copy of every pair to lists left over from an earlier run.
user_id, item_id = [], []

with open('train', 'r') as file:
    # Each line holds the integer item ids of one user; the 0-based line
    # number is used as the user id.
    for user_idx, row in enumerate(file):
        # split() with no separator tolerates repeated spaces and blank lines
        # (a blank line adds no pairs but still consumes its user id).
        # The per-line order is reversed, so the last id on the line comes first.
        likes = [int(token) for token in row.split()][::-1]
        user_id.extend([user_idx] * len(likes))
        item_id.extend(likes)

# Long-format table: one row per (user, item) pair.
df = pd.DataFrame({
    'user': user_id,
    'item': item_id
})

df.to_csv('train_2.csv', index=False)

In [6]:
# Load the (user, item) pairs from train_2.csv and preview the first five rows.
df = pd.read_csv("train_2.csv")
print(df.head(n=5))

   user    item
0     0  388242
1     0  278503
2     0  102795
3     0  470957
4     0  159637


In [None]:
# Add a like-freshness column: `order` is the 0-based position of each row
# within its user's group, following the current row order of `df`.
# NOTE(review): presumably order 0 is the user's most recent like — confirm
# against the dataset description.
df["order"] = df.groupby(by="user").cumcount(ascending=True)
print(df.tail(n=25))

In [21]:
# Distinct user ids, in order of first appearance in `df`.
users = pd.unique(df["user"])
print(users)

[    0     1     2 ... 30495 30496 30497]


In [42]:
class UsersKFoldPOut:
    """K-fold cross-validation over users with a per-user cap on test rows.

    Users (not rows) are partitioned into ``n_folds`` groups by a shuffled
    ``KFold``. For each fold, the train mask selects every row of the train
    users, and the test mask selects the rows of the held-out users whose
    ``order`` value is strictly below ``p``.
    """

    def __init__(self, n_folds: int, p: int, random_seed: int = 23):
        """Store the split configuration.

        Args:
            n_folds: number of folds, forwarded to ``KFold(n_splits=...)``.
            p: exclusive upper bound on ``order`` for rows kept in the test mask.
            random_seed: forwarded to ``KFold(random_state=...)``.
        """
        self.n_folds = n_folds
        self.p = p
        self.random_seed = random_seed

    def split(self, df: pd.DataFrame):
        """Yield one ``(train_mask, test_mask)`` pair per fold.

        Args:
            df: frame with a ``user`` column and an ``order`` column.

        Yields:
            Two boolean Series aligned with ``df``: rows of the train users,
            and rows of the held-out users with ``order < p``.
        """
        # Identify the distinct users; folds are built over this array.
        users = df["user"].unique()
        kfold = KFold(
            n_splits=self.n_folds, shuffle=True, random_state=self.random_seed
        )

        # Does not depend on the fold, so compute it once.
        within_p = df["order"] < self.p

        for train_idx, test_idx in kfold.split(users):
            # KFold.split yields positions into `users`, not the user ids
            # themselves; map them back before matching against the column.
            train_users = users[train_idx]
            test_users = users[test_idx]

            train_mask = df["user"].isin(train_users)
            test_mask = df["user"].isin(test_users) & within_p
            yield train_mask, test_mask


In [43]:
# Cross-validation setup: number of user folds, and the exclusive bound on
# `order` that the splitter applies to test rows.
n_folds, p = 3, 3

cv = UsersKFoldPOut(n_folds, p)

In [44]:
for i, (train_mask, test_mask) in enumerate(cv.split(df)):
    # Materialise the fold's frames from the boolean row masks.
    train, test = df.loc[train_mask], df.loc[test_mask]

    # No user may appear on both the train and the test side.
    shared_users = set(train["user"]) & set(test["user"])
    assert not shared_users, "Найдены пересечения"

    # count() tallies non-null cells per column for each test user;
    # the largest tally must not exceed p.
    per_user_counts = test.groupby("user").count().to_numpy()
    assert (
        per_user_counts.max() <= p
    ), f"Количество последних треков в тесте больше чем {p}"

    print(f"Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}")


Fold#0 | Train: 2897443, Test: 30498
Fold#1 | Train: 2885875, Test: 30498
Fold#2 | Train: 2893772, Test: 30498
