In [24]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Предобработка данных

In [26]:
likes_tables = []

with open('train', 'r') as train_file:
    for i, tracks_line in enumerate(tqdm(train_file.readlines())):
        # Each line holds the whitespace-separated track ids of one user;
        # the line index is used as the user's pseudo id.
        # The list is reversed so that the last like of the line comes first.
        # The dtype is pinned explicitly: for a blank line np.array([]) would
        # default to float64, and np.vstack below would then promote the whole
        # matrix (and both DataFrame columns) to float.
        tracks_ids = np.array([int(n) for n in tracks_line.split()][::-1], dtype=np.int64)

        # Pseudo id repeated once per like of this user
        user_ids = np.full(tracks_ids.shape, i, dtype=np.int64)

        # Two-column table (user pseudo id, track id), one row per like
        user_likes = np.column_stack((user_ids, tracks_ids))
        likes_tables.append(user_likes)

# Vertically stack the per-user tables into a single two-column matrix
# and build the DataFrame from it
df = pd.DataFrame(np.vstack(likes_tables), columns=['user_id', 'track_id'])


100%|██████████████████████████████████████████████████████| 1160084/1160084 [00:40<00:00, 28681.62it/s]


In [28]:
# Number the rows of every user 0, 1, 2, ... in the order they appear in df.
# Each user's track list was reversed at load time, so 0 corresponds to the
# last track id of that user's line in the input file.
df['order'] = df.groupby(by='user_id').cumcount(ascending=True)

## UserKFoldLeavePOut

In [39]:
class UsersKFoldPOut():
    """K-fold splitter over users that keeps at most ``p`` interactions per test user.

    Unique users are shuffled and partitioned into ``n_folds`` disjoint groups.
    For every fold the train part consists of all interactions of the users
    outside the fold's group, and the test part consists of the interactions of
    the group's users whose ``order`` value is below ``p``.

    Parameters
    ----------
    n_folds : int
        Number of groups the users are divided into.
    p : int
        Only interactions with ``order < p`` are kept in test for a test user.
    random_seed : int, default 23
        Seed of the user shuffle; the same seed always produces the same folds.
    """

    def __init__(self, n_folds: int, p: int, random_seed: int = 23):
        self.n_folds = n_folds
        self.p = p
        self.random_seed = random_seed

    def split(self, df: pd.DataFrame):
        """Generate boolean train/test masks for every fold.

        Parameters
        ----------
        df : pd.DataFrame
            Interactions table with a ``user_id`` column and an ``order``
            column (position of the interaction within its user's history).

        Yields
        ------
        train_mask, test_mask : pd.Series of bool
            Masks aligned with ``df``. ``train_mask`` selects every interaction
            of the users that are not in the current test group;
            ``test_mask`` selects the interactions of the test group's users
            with ``order < p``.
        """
        # A private legacy generator gives exactly the same permutation as
        # np.random.seed(seed) followed by np.random.shuffle(...), but does not
        # reseed the global NumPy RNG that other notebook cells may rely on.
        rng = np.random.RandomState(self.random_seed)

        # unique() returns a fresh array, so shuffling it in place is safe
        users = df['user_id'].unique()
        users_count = len(users)
        rng.shuffle(users)

        # Spread users as evenly as possible: the first
        # (users_count % n_folds) folds receive one extra user
        fold_sizes = np.full(self.n_folds, users_count // self.n_folds, dtype=int)
        fold_sizes[: users_count % self.n_folds] += 1

        start = 0
        for fold_size in fold_sizes:
            stop = start + fold_size
            test_fold_users = users[start:stop]

            # Rows belonging to the users of the current test group
            in_test_fold = df['user_id'].isin(test_fold_users)

            # Train takes every interaction of all remaining users
            train_mask = ~in_test_fold
            # Test keeps only the first p interactions of each test user
            test_mask = in_test_fold & (df['order'] < self.p)

            start = stop

            yield train_mask, test_mask

## Дополнительные функции для проверки правильности разбиения

In [46]:
def check_intersections(train: pd.DataFrame, test: pd.DataFrame):
    """Return True when no ``user_id`` value occurs in both ``train`` and ``test``."""
    users_in_train = set(train['user_id'].unique())
    # Disjoint user sets mean that the split does not leak users between parts
    return users_in_train.isdisjoint(test['user_id'].unique())

In [47]:
def check_pout(test: pd.DataFrame, p: int):
    """Return whether every user in ``test`` has at most ``p`` likes.

    Likes are counted as non-null ``track_id`` values per ``user_id``.
    """
    likes_per_user = test.groupby('user_id')['track_id'].count()
    return likes_per_user.le(p).all()

In [51]:
from functools import reduce

def check_unique_folds(test_users: list):
    """Return True when no user appears in more than one fold.

    Parameters
    ----------
    test_users : sequence of array-like
        One collection of test user ids per fold.

    Returns
    -------
    bool
        True if the folds are pairwise disjoint (also for zero or one fold),
        False if at least one user id is shared by two or more folds.
    """
    # De-duplicate inside each fold first, so that only repeats ACROSS folds
    # are counted as violations
    folds = [np.unique(fold_users) for fold_users in test_users]
    if not folds:
        return True

    # Intersecting all folds at once would only detect users present in every
    # fold; comparing the total count with the number of distinct users
    # catches a user shared by any pair of folds
    all_users = np.concatenate(folds)
    return len(np.unique(all_users)) == len(all_users)

## Проверка правильности разбиения 

In [53]:
# Cross-validation settings: number of user folds and the per-user cap on test likes
n_folds, p = 3, 3

cv = UsersKFoldPOut(n_folds=n_folds, p=p)

# Test users of every fold, collected for the cross-fold uniqueness check at the end
test_users = []

for i, (train_mask, test_mask) in enumerate(cv.split(df)):
    # Materialise both parts of the fold from the boolean masks
    train, test = df.loc[train_mask], df.loc[test_mask]

    print(f'Fold#{i} | Train size: {train.shape[0]}, Test size: {test.shape[0]}')
    print(f'Users in train and test are different: {check_intersections(train, test)}')
    print(f'There are no more than {p} likes for each user: {check_pout(test, p)}')

    test_users.append(test['user_id'].unique())

print(f'All folds are unique: {check_unique_folds(test_users)}')



Fold#0 | Train size: 62769950, Test size: 1160085
Users in train and test are different: True
There are no more than 3 likes for each user: True
Fold#1 | Train size: 62749697, Test size: 1160085
Users in train and test are different: True
There are no more than 3 likes for each user: True
Fold#2 | Train size: 62857621, Test size: 1160082
Users in train and test are different: True
There are no more than 3 likes for each user: True
All folds are unique: True
