In [1]:
import pandas as pd
import numpy as np
import time
import sys
import gc
from sklearn.preprocessing import StandardScaler
import pyarrow.parquet as pq

In [2]:
# Функция-помощник для получения размеров всех объектов в мегабайтах
def get_memory_usage_mb():
    # Получаем все объекты в памяти
    objects = gc.get_objects()
    # Собираем информацию о переменных в текущем пространстве имен
    memory_usage = [
        (name, sys.getsizeof(obj) / (1024 * 1024))  # Преобразуем байты в мегабайты
        for name, obj in globals().items()
        if name != "memory_usage_mb"  # Исключаем текущую функцию
    ]
    # Сортируем по убыванию размера
    memory_usage.sort(key=lambda x: x[1], reverse=True)
    return memory_usage

# Загрузка датасетов и подготовка фич в трейне

In [3]:
train = pd.read_parquet('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/train_interactions.parquet')
users = pd.read_parquet('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/users_meta.parquet.parquet')
items = pd.read_parquet('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/items_meta.parquet.parquet')
test = pd.read_csv('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/test_pairs.csv.csv')
subm = pd.read_csv('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/sample_submission.csv')

In [4]:
# path = r'D:\Книги\Программирование\ML_contests_хакатоны\VK_RecSys_Challenge_2024'
# train = pd.read_parquet(path + r'\train_interactions.parquet')
# users = pd.read_parquet(path + r'\users_meta.parquet.parquet')
# items = pd.read_parquet(path + r'\items_meta.parquet.parquet')
# test = pd.read_csv(path + r'\test_pairs.csv.csv')
# subm = pd.read_csv(path + r'\sample_submission.csv.csv')

In [5]:
# path = r'C:\Users\vsemis\files\VK_recsys_huck_2024'
# train = pd.read_parquet(path + r'\train_interactions.parquet')
# users = pd.read_parquet(path + r'\users_meta.parquet')
# items = pd.read_parquet(path + r'\items_meta.parquet')
# test = pd.read_csv(path + r'\test_pairs.csv')
# subm = pd.read_csv(path + r'\sample_submission.csv')

In [6]:
print(train.shape)
print(train.head(2))

(145667282, 7)
   user_id  item_id  timespent  like  dislike  share  bookmarks
0     3810   138979          6     0        0      0          0
1   101874   331160          6     0        0      0          0


In [7]:
# Первая часть датасета, фичи:
# 'duration', 'attention_video_median', 'attention_mean', 'duration_median', 'gender_liked_mode', 'gender_difference'

In [8]:
train = train.drop(columns={'share', 'bookmarks', 'dislike'})

In [9]:
print(train.shape)
print(train.head(2))

(145667282, 4)
   user_id  item_id  timespent  like
0     3810   138979          6     0
1   101874   331160          6     0


In [10]:
# Генерация фичи duration (просто переносим duration из items в train по item_id)
train = train.merge(items[['item_id', 'duration']], on='item_id', how='left')

In [11]:
# Генерация фичи attention (timespent/duration), (доля просмотра конкретного видео конкретным пользователем. Характеризует частный интерес)
train['attention'] = train['timespent'] / train['duration']
train['attention'] = train['attention'].astype('float32')

In [12]:
# Генерация фичи attention_video_median (медиана типичной продолжительности просматра конкретного видео всеми пользователями)
counts = train.groupby('item_id')['attention'].median().astype('float32')
train = train.merge(counts, on='item_id', suffixes=('', '_video_median'))

In [13]:
# Генерация фичи attention_mean (среднее время просмотра видео конкретным пользователем. Характеризует интерес пользователя, среднее)
counts = train.groupby('user_id')['attention'].mean().astype('float32')
train = train.merge(counts, on='user_id', suffixes=('', '_mean'))

In [14]:
# Генерация фичи duration_median (медиана продолжительности видео, просматриваемых пользователем. Характеризует интерес пользователя к метрике)
counts = train.groupby('user_id')['duration'].median().astype('float32')
train = train.merge(counts, on='user_id', suffixes=('', '_median'))

In [15]:
train = train.drop(columns={'attention'})

In [16]:
# Генерация фичи gender
train = train.merge(users[['user_id', 'gender']], on='user_id', how='left')
train.head(2)

Unnamed: 0,user_id,item_id,timespent,like,duration,attention_video_median,attention_mean,duration_median,gender
0,3810,138979,6,0,54,0.722222,0.538317,34.0,1
1,101874,331160,6,0,6,1.0,0.728776,27.0,2


In [17]:
train = train.drop(columns={'timespent'})

In [18]:
# Генерация фичи gender_liked_mode (Мода пола лайкнувшего видео)
counts = train[train['like'] == 1]  # Селектор
gender_mode = counts.groupby('item_id')['gender'].agg(lambda x: x.mode()[0]).reset_index()  # 1. Группируем
gender_mode.columns = ['item_id', 'gender_liked_mode']
train = train.merge(gender_mode, on='item_id', how='left')  # 2. Сливаем результат с исходным датафреймом train
del counts, gender_mode
train['gender_liked_mode'] = train['gender_liked_mode'].fillna(0)

In [19]:
train.isna().sum()

user_id                   0
item_id                   0
like                      0
duration                  0
attention_video_median    0
attention_mean            0
duration_median           0
gender                    0
gender_liked_mode         0
dtype: int64

In [20]:
train.gender_liked_mode.value_counts()

gender_liked_mode
2.0    91956455
1.0    49533625
0.0     4177202
Name: count, dtype: int64

In [21]:
train['gender_liked_mode'] = train['gender_liked_mode'].astype('float32')

In [22]:
# Генерация фичи gender_difference (Совпадает ли пол пользователя с модой пола тех, кто лайкает видео)
train['gender_difference'] = (train['gender'] == train['gender_liked_mode']).astype('int8')

In [23]:
train = train.drop(columns={'gender', 'like'})

In [24]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,145667282.0,91459.706068,52720.587338,0.0,45984.0,90883.0,137123.0,183403.0
item_id,145667282.0,171930.6424,97417.790409,0.0,88571.0,173393.0,255384.0,337726.0
duration,145667282.0,33.38507,25.225117,5.0,16.0,25.0,47.0,180.0
attention_video_median,145667282.0,0.750927,0.279549,0.005587,0.558824,0.833333,1.0,3.4
attention_mean,145667282.0,0.717738,0.237502,0.06499,0.558533,0.696722,0.84061,7.224946
duration_median,145667282.0,27.728163,11.037155,6.0,20.0,26.0,35.0,110.0
gender_liked_mode,145667282.0,1.602605,0.544817,0.0,1.0,2.0,2.0,2.0
gender_difference,145667282.0,0.748442,0.433909,0.0,0.0,1.0,1.0,1.0


In [25]:
print(train['attention_video_median'].quantile([0.01, 0.99]))
print(train['attention_mean'].quantile([0.01, 0.99]))
print(train['duration_median'].quantile([0.01, 0.99]))

0.01    0.076923
0.99    1.200000
Name: attention_video_median, dtype: float64
0.01    0.281933
0.99    1.450355
Name: attention_mean, dtype: float64
0.01     9.0
0.99    55.0
Name: duration_median, dtype: float64


In [26]:
lower, upper = train['attention_video_median'].quantile([0.01, 0.99])
train['attention_video_median'] = train['attention_video_median'].clip(lower, upper)
lower, upper = train['attention_mean'].quantile([0.01, 0.99])
train['attention_mean'] = train['attention_mean'].clip(lower, upper)
lower, upper = train['duration_median'].quantile([0.01, 0.99])
train['duration_median'] = train['duration_median'].clip(lower, upper)

In [27]:
scaler = StandardScaler()
train['attention_video_median'] = scaler.fit_transform(train[['attention_video_median']])
train['attention_mean'] = scaler.fit_transform(train[['attention_mean']])
train['duration_median'] = scaler.fit_transform(train[['duration_median']])

In [28]:
train['attention_video_median'] = train['attention_video_median'].astype('float16')
train['attention_mean'] = train['attention_mean'].astype('float16')
train['duration_median'] = train['duration_median'].astype('float16')
train['gender_liked_mode'] = train['gender_liked_mode'].astype('int8')

In [29]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145667282 entries, 0 to 145667281
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   user_id                 uint32 
 1   item_id                 uint32 
 2   duration                uint8  
 3   attention_video_median  float16
 4   attention_mean          float16
 5   duration_median         float16
 6   gender_liked_mode       int8   
 7   gender_difference       int8   
dtypes: float16(3), int8(2), uint32(2), uint8(1)
memory usage: 2.3 GB


In [30]:
# Перенос фич в тест

In [31]:
counts = train.groupby('item_id', as_index=False).agg({
    'duration': 'first',
    'attention_video_median': 'first'
})

test = test.merge(
    counts,         # Используем агрегированный датасет
    on='item_id',   # Объединяем
    how='left'      # Используем left join
)

In [32]:
counts = train.groupby('user_id', as_index=False).agg({
    'attention_mean': 'first',
    'duration_median': 'first'
})

test = test.merge(
    counts,         # Используем агрегированный датасет
    on='user_id',   # Объединяем
    how='left'      # Используем left join
)

In [33]:
counts = train.groupby('item_id', as_index=False).agg({
    'gender_liked_mode': 'first'
})

test = test.merge(
    counts,         # Используем агрегированный датасет
    on='item_id',   # Объединяем
    how='left'      # Используем left join
)

In [34]:
# Генерация фичи gender
test = test.merge(users[['user_id', 'gender']], on='user_id', how='left')
test.head(2)

Unnamed: 0,user_id,item_id,duration,attention_video_median,attention_mean,duration_median,gender_liked_mode,gender
0,1,7363,9,0.103699,0.336182,-0.430664,2,2
1,1,73770,5,0.912598,0.336182,-0.430664,2,2


In [35]:
# Генерация фичи gender_difference (Совпадает ли пол пользователя с модой пола тех, кто лайкает видео)
test['gender_difference'] = (test['gender'] == test['gender_liked_mode']).astype('int8')

In [36]:
test = test.drop(columns={'gender'})

In [37]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145667282 entries, 0 to 145667281
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   user_id                 uint32 
 1   item_id                 uint32 
 2   duration                uint8  
 3   attention_video_median  float16
 4   attention_mean          float16
 5   duration_median         float16
 6   gender_liked_mode       int8   
 7   gender_difference       int8   
dtypes: float16(3), int8(2), uint32(2), uint8(1)
memory usage: 2.3 GB


In [38]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1655120 entries, 0 to 1655119
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   user_id                 1655120 non-null  int64  
 1   item_id                 1655120 non-null  int64  
 2   duration                1655120 non-null  uint8  
 3   attention_video_median  1655120 non-null  float16
 4   attention_mean          1655120 non-null  float16
 5   duration_median         1655120 non-null  float16
 6   gender_liked_mode       1655120 non-null  int8   
 7   gender_difference       1655120 non-null  int8   
dtypes: float16(3), int64(2), int8(2), uint8(1)
memory usage: 39.5 MB


In [39]:
train.to_parquet('/kaggle/working/train_interactions_full_data_0_5_featues.parquet')

In [40]:
# Печать всех переменных и их размеров в мегабайтах
for var, size in get_memory_usage_mb():
    print(f"{var}: {size:.6f} MB")

train: 2361.625612 MB
items: 41.548659 MB
test: 39.461273 MB
subm: 37.882828 MB
counts: 1.610545 MB
users: 1.049583 MB
_i2: 0.001889 MB
_i18: 0.001520 MB
__: 0.001327 MB
_24: 0.001327 MB
_i4: 0.001212 MB
_i14: 0.000989 MB
_19: 0.000955 MB
_i13: 0.000943 MB
_i12: 0.000937 MB
_i31: 0.000907 MB
_i32: 0.000904 MB
_i11: 0.000895 MB
StandardScaler: 0.000862 MB
_i33: 0.000821 MB
_i22: 0.000690 MB
_i35: 0.000682 MB
_i10: 0.000558 MB
_i3: 0.000539 MB
_i7: 0.000517 MB
_i40: 0.000472 MB
_ih: 0.000450 MB
In: 0.000450 MB
_i26: 0.000446 MB
_i16: 0.000407 MB
_i34: 0.000399 MB
_i5: 0.000366 MB
_i28: 0.000325 MB
_i27: 0.000299 MB
_oh: 0.000221 MB
Out: 0.000221 MB
_i25: 0.000209 MB
_i1: 0.000189 MB
_: 0.000185 MB
_34: 0.000185 MB
_16: 0.000183 MB
_i30: 0.000143 MB
open: 0.000137 MB
get_memory_usage_mb: 0.000137 MB
_i: 0.000127 MB
_i39: 0.000127 MB
_i21: 0.000116 MB
__doc__: 0.000108 MB
_i8: 0.000105 MB
_i23: 0.000091 MB
_i15: 0.000086 MB
_i17: 0.000086 MB
_i6: 0.000084 MB
_i9: 0.000084 MB
_i20: 0.000083

In [41]:
train = pd.read_parquet('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/train_interactions.parquet')

In [42]:
train = train.drop(columns={'timespent'})

In [43]:
# Генерация фичи user_like_mean (Среднее количество лайков, которые ставит пользователь)
counts = train.groupby('user_id')['like'].mean().astype('float32')
train = train.merge(counts, on='user_id', suffixes=('', '_mean'))

In [44]:
# Генерация фичи user_dislike_mean (Среднее количество дизлайков, которые ставит пользователь)
counts = train.groupby('user_id')['dislike'].mean().astype('float32')
train = train.merge(counts, on='user_id', suffixes=('', '_mean'))

In [45]:
# Генерация фичи user_share_mean (Среднее количество "поделиться" видео, активность пользователя)
counts = train.groupby('user_id')['share'].mean().astype('float32')
train = train.merge(counts, on='user_id', suffixes=('', '_mean'))

In [46]:
# Генерация фичи user_bookmarks_mean (Среднее количество помещений в закладки, активность пользователя)
counts = train.groupby('user_id')['bookmarks'].mean().astype('float32')
train = train.merge(counts, on='user_id', suffixes=('', '_mean'))

In [47]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145667282 entries, 0 to 145667281
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         uint32 
 1   item_id         uint32 
 2   like            uint8  
 3   dislike         uint8  
 4   share           uint8  
 5   bookmarks       uint8  
 6   like_mean       float32
 7   dislike_mean    float32
 8   share_mean      float32
 9   bookmarks_mean  float32
dtypes: float32(4), uint32(2), uint8(4)
memory usage: 3.8 GB


In [48]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,145667282.0,91459.706068,52720.587338,0.0,45984.0,90883.0,137123.0,183403.0
item_id,145667282.0,171930.6424,97417.790409,0.0,88571.0,173393.0,255384.0,337726.0
like,145667282.0,0.046601,0.210783,0.0,0.0,0.0,0.0,1.0
dislike,145667282.0,0.000393,0.019823,0.0,0.0,0.0,0.0,1.0
share,145667282.0,0.010798,0.10335,0.0,0.0,0.0,0.0,1.0
bookmarks,145667282.0,0.000986,0.031382,0.0,0.0,0.0,0.0,1.0
like_mean,145667282.0,0.046601,0.096407,0.0,0.000826,0.005698,0.042683,1.0
dislike_mean,145667282.0,0.000393,0.003471,0.0,0.0,0.0,0.0,0.927083
share_mean,145667282.0,0.010798,0.02057,0.0,0.001198,0.004251,0.011895,0.849866
bookmarks_mean,145667282.0,0.000986,0.008639,0.0,0.0,0.0,0.0,0.775051


In [49]:
print(train['like_mean'].quantile([0.01, 0.99]))
print(train['dislike_mean'].quantile([0.01, 0.99]))
print(train['share_mean'].quantile([0.01, 0.99]))
print(train['bookmarks_mean'].quantile([0.01, 0.99]))

0.01    0.000000
0.99    0.482428
Name: like_mean, dtype: float64
0.01    0.000000
0.99    0.006175
Name: dislike_mean, dtype: float64
0.01    0.00000
0.99    0.09434
Name: share_mean, dtype: float64
0.01    0.000000
0.99    0.016907
Name: bookmarks_mean, dtype: float64


In [50]:
lower, upper = train['like_mean'].quantile([0.01, 0.99])
train['like_mean'] = train['like_mean'].clip(lower, upper)
lower, upper = train['dislike_mean'].quantile([0.01, 0.99])
train['dislike_mean'] = train['dislike_mean'].clip(lower, upper)
lower, upper = train['share_mean'].quantile([0.01, 0.99])
train['share_mean'] = train['share_mean'].clip(lower, upper)
lower, upper = train['bookmarks_mean'].quantile([0.01, 0.99])
train['bookmarks_mean'] = train['bookmarks_mean'].clip(lower, upper)

In [51]:
scaler = StandardScaler()
train['like_mean'] = scaler.fit_transform(train[['like_mean']])
train['dislike_mean'] = scaler.fit_transform(train[['dislike_mean']])
train['share_mean'] = scaler.fit_transform(train[['share_mean']])
train['bookmarks_mean'] = scaler.fit_transform(train[['bookmarks_mean']])

In [52]:
train['like_mean'] = train['like_mean'].astype('float16')
train['dislike_mean'] = train['dislike_mean'].astype('float16')
train['share_mean'] = train['share_mean'].astype('float16')
train['bookmarks_mean'] = train['bookmarks_mean'].astype('float16')

In [53]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145667282 entries, 0 to 145667281
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         uint32 
 1   item_id         uint32 
 2   like            uint8  
 3   dislike         uint8  
 4   share           uint8  
 5   bookmarks       uint8  
 6   like_mean       float16
 7   dislike_mean    float16
 8   share_mean      float16
 9   bookmarks_mean  float16
dtypes: float16(4), uint32(2), uint8(4)
memory usage: 2.7 GB


In [54]:
# Генерация фичи like_video_mean (среднее количество like для конкретного видео по всему датасету, метрика популярности видео)
counts = train.groupby('item_id')['like'].mean().astype('float32')
train = train.merge(counts, on='item_id', suffixes=('', '_video_mean'))

In [55]:
# Генерация фичи dislike_video_mean (среднее количество dislike для конкретного видео по всему датасету, метрика популярности видео)
counts = train.groupby('item_id')['dislike'].mean().astype('float32')
train = train.merge(counts, on='item_id', suffixes=('', '_video_mean'))

In [56]:
# Генерация фичи share_video_mean (среднее количество share для конкретного видео по всему датасету, метрика популярности видео)
counts = train.groupby('item_id')['share'].mean().astype('float32')
train = train.merge(counts, on='item_id', suffixes=('', '_video_mean'))

In [57]:
# Генерация фичи bookmarks_video_mean (среднее количество bookmarks для конкретного видео по всему датасету, метрика популярности видео)
counts = train.groupby('item_id')['bookmarks'].mean().astype('float32')
train = train.merge(counts, on='item_id', suffixes=('', '_video_mean'))

In [58]:
del counts

In [59]:
train = train.drop(columns={'like', 'dislike', 'share', 'bookmarks'})

In [60]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145667282 entries, 0 to 145667281
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               uint32 
 1   item_id               uint32 
 2   like_mean             float16
 3   dislike_mean          float16
 4   share_mean            float16
 5   bookmarks_mean        float16
 6   like_video_mean       float32
 7   dislike_video_mean    float32
 8   share_video_mean      float32
 9   bookmarks_video_mean  float32
dtypes: float16(4), float32(4), uint32(2)
memory usage: 4.3 GB


In [61]:
train.describe().T

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,145667282.0,91459.706068,52720.587338,0.0,45984.0,90883.0,137123.0,183403.0
item_id,145667282.0,171930.6424,97417.790409,0.0,88571.0,173393.0,255384.0,337726.0
like_mean,145667282.0,,0.0,-0.509277,-0.5,-0.445312,-0.0298,4.910156
dislike_mean,145667282.0,,0.0,-0.30249,-0.30249,-0.30249,-0.30249,6.734375
share_mean,145667282.0,,0.0,-0.635254,-0.561035,-0.372314,0.100342,5.199219
bookmarks_mean,145667282.0,,0.0,-0.263672,-0.263672,-0.263672,-0.263672,7.550781
like_video_mean,145667282.0,0.046601,0.053281,0.0,0.005342,0.022901,0.077443,0.9
dislike_video_mean,145667282.0,0.000393,0.001077,0.0,0.0,0.000153,0.000492,0.166667
share_video_mean,145667282.0,0.010798,0.013812,0.0,0.001671,0.005417,0.01487,0.461538
bookmarks_video_mean,145667282.0,0.000986,0.002556,0.0,0.0,0.000321,0.001095,0.272727


In [62]:
print(train['like_video_mean'].quantile([0.01, 0.99]))
print(train['dislike_video_mean'].quantile([0.01, 0.99]))
print(train['share_video_mean'].quantile([0.01, 0.99]))
print(train['bookmarks_video_mean'].quantile([0.01, 0.99]))

0.01    0.000000
0.99    0.207707
Name: like_video_mean, dtype: float64
0.01    0.000000
0.99    0.003311
Name: dislike_video_mean, dtype: float64
0.01    0.000000
0.99    0.060465
Name: share_video_mean, dtype: float64
0.01    0.000000
0.99    0.009756
Name: bookmarks_video_mean, dtype: float64


In [63]:
lower, upper = train['like_video_mean'].quantile([0.0, 0.99])
train['like_video_mean'] = train['like_video_mean'].clip(lower, upper)
lower, upper = train['dislike_video_mean'].quantile([0.0, 0.99])
train['dislike_video_mean'] = train['dislike_video_mean'].clip(lower, upper)
lower, upper = train['share_video_mean'].quantile([0.0, 0.99])
train['share_video_mean'] = train['share_video_mean'].clip(lower, upper)
lower, upper = train['bookmarks_video_mean'].quantile([0.0, 0.99])
train['bookmarks_video_mean'] = train['bookmarks_video_mean'].clip(lower, upper)

In [64]:
scaler = StandardScaler()
train['like_video_mean'] = scaler.fit_transform(train[['like_video_mean']])
train['dislike_video_mean'] = scaler.fit_transform(train[['dislike_video_mean']])
train['share_video_mean'] = scaler.fit_transform(train[['share_video_mean']])
train['bookmarks_video_mean'] = scaler.fit_transform(train[['bookmarks_video_mean']])

In [65]:
train['like_video_mean'] = train['like_video_mean'].astype('float16')
train['dislike_video_mean'] = train['dislike_video_mean'].astype('float16')
train['share_video_mean'] = train['share_video_mean'].astype('float16')
train['bookmarks_video_mean'] = train['bookmarks_video_mean'].astype('float16')

In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145667282 entries, 0 to 145667281
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               uint32 
 1   item_id               uint32 
 2   like_mean             float16
 3   dislike_mean          float16
 4   share_mean            float16
 5   bookmarks_mean        float16
 6   like_video_mean       float16
 7   dislike_video_mean    float16
 8   share_video_mean      float16
 9   bookmarks_video_mean  float16
dtypes: float16(8), uint32(2)
memory usage: 3.3 GB


In [67]:
# Перенос фич в тест

In [68]:
counts = train.groupby('user_id', as_index=False).agg({
    'like_mean': 'first',
    'dislike_mean': 'first',
    'share_mean': 'first',
    'bookmarks_mean': 'first'
})

test = test.merge(
    counts,         # Используем агрегированный датасет
    on='user_id',   # Объединяем
    how='left'      # Используем left join
)

In [69]:
counts = train.groupby('item_id', as_index=False).agg({
    'like_video_mean': 'first',
    'dislike_video_mean': 'first',
    'share_video_mean': 'first',
    'bookmarks_video_mean': 'first'
})

test = test.merge(
    counts,         # Используем агрегированный датасет
    on='item_id',   # Объединяем
    how='left'      # Используем left join
)

In [70]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1655120 entries, 0 to 1655119
Data columns (total 16 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   user_id                 1655120 non-null  int64  
 1   item_id                 1655120 non-null  int64  
 2   duration                1655120 non-null  uint8  
 3   attention_video_median  1655120 non-null  float16
 4   attention_mean          1655120 non-null  float16
 5   duration_median         1655120 non-null  float16
 6   gender_liked_mode       1655120 non-null  int8   
 7   gender_difference       1655120 non-null  int8   
 8   like_mean               1655120 non-null  float16
 9   dislike_mean            1655120 non-null  float16
 10  share_mean              1655120 non-null  float16
 11  bookmarks_mean          1655120 non-null  float16
 12  like_video_mean         1655120 non-null  float16
 13  dislike_video_mean      1655120 non-null  float16
 14  sh

In [71]:
train.to_parquet('/kaggle/working/train_interactions_full_data_6_13_featues.parquet')

In [72]:
train = pd.read_parquet('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/train_interactions.parquet')

In [73]:
# Генерация фичи source_id и фичи source_id_total_views (это будет id и мера популярности канала, общее количество просмотров)
train = train.merge(items[['item_id', 'source_id']], on='item_id', how='left')
counts = train.source_id.value_counts()
counts = counts.to_frame().reset_index()
counts.columns = ['source_id', 'source_id_total_views']
counts['source_id_total_views'] = counts['source_id_total_views'].astype('uint32')
train = train.merge(counts, on='source_id', how='left')

In [74]:
"""Слишком тяжело
# Генерация фичи source_id_familiar (Смотрел ли пользователь видео с этого же канала)
# Создаем копию нужных колонок для обработки
res = train[['user_id', 'source_id']]

# Сортируем для удобства обработки
res = res.sort_values(by=['user_id', 'source_id']).reset_index()

# Инициализация массивов
user_ids = res['user_id'].to_numpy()
source_ids = res['source_id'].to_numpy()
source_id_familiar = np.zeros(len(res), dtype=int)

# Отслеживание "знакомых" источников
seen_sources = {}
for i in range(len(user_ids)):
    user_id = user_ids[i]
    source_id = source_ids[i]
    if user_id not in seen_sources:
        seen_sources[user_id] = set()
    if source_id in seen_sources[user_id]:
        source_id_familiar[i] = 1
    else:
        seen_sources[user_id].add(source_id)

# Добавляем колонку с результатами в res
res['source_id_familiar'] = source_id_familiar

# Восстанавливаем исходный порядок строк train
train['source_id_familiar'] = res.sort_values(by='index')['source_id_familiar'].to_numpy().astype('int8')
del res
del source_id_familiar
"""

'Слишком тяжело\n# Генерация фичи source_id_familiar (Смотрел ли пользователь видео с этого же канала)\n# Создаем копию нужных колонок для обработки\nres = train[[\'user_id\', \'source_id\']]\n\n# Сортируем для удобства обработки\nres = res.sort_values(by=[\'user_id\', \'source_id\']).reset_index()\n\n# Инициализация массивов\nuser_ids = res[\'user_id\'].to_numpy()\nsource_ids = res[\'source_id\'].to_numpy()\nsource_id_familiar = np.zeros(len(res), dtype=int)\n\n# Отслеживание "знакомых" источников\nseen_sources = {}\nfor i in range(len(user_ids)):\n    user_id = user_ids[i]\n    source_id = source_ids[i]\n    if user_id not in seen_sources:\n        seen_sources[user_id] = set()\n    if source_id in seen_sources[user_id]:\n        source_id_familiar[i] = 1\n    else:\n        seen_sources[user_id].add(source_id)\n\n# Добавляем колонку с результатами в res\nres[\'source_id_familiar\'] = source_id_familiar\n\n# Восстанавливаем исходный порядок строк train\ntrain[\'source_id_familiar\'] 

In [75]:
train = train.merge(users[['user_id', 'age']], on='user_id', how='left')  # Генерация фичи age

In [76]:
# Генерация фичи age_liked_median (медиана возраста пользователя, лайкнувшего конкретное видео по всему датасету, метрика популярности видео)
counts = train[train['like'] == 1]  # Селектор
counts = counts.groupby('item_id')['age'].median().astype('int16')
train = train.merge(counts, on='item_id', suffixes=('', '_liked_median'), how='left')

In [77]:
# Генерация фичи age_disliked_median (медиана возраста пользователя, лайкнувшего конкретное видео по всему датасету)
counts = train[train['dislike'] == 1]  # Селектор
counts = counts.groupby('item_id')['age'].median().astype('float32')
train = train.merge(counts, on='item_id', suffixes=('', '_disliked_median'), how='left')

In [78]:
# Генерация фичи age_difference (Разница между возрастом пользователя и медианой возраста тех, кто лайкает видео)
train['age_difference'] = train['age_liked_median'] - train['age']

In [79]:
train['age_liked_median'] = train['age_liked_median'].fillna(train['age_liked_median'].median())
train['age_liked_median'] = train['age_liked_median'].astype('int8')
train['age_difference'] = train['age_difference'].fillna(train['age_liked_median'].median())
train['age_difference'] = train['age_difference'].astype('int8')

In [80]:
train = train.drop(columns={'age', 'timespent', 'share', 'bookmarks'})

In [81]:
train['source_id_total_views'] = np.sqrt(train['source_id_total_views'])

In [82]:
train['age_difference'] = train['age_difference'].astype('int8')
train['age_disliked_median'] = train['age_disliked_median'].fillna(train['age_disliked_median'].median())
train['age_disliked_median'] = train['age_disliked_median'].astype('int8')
train['source_id_total_views'] = train['source_id_total_views'].astype('float32')

In [83]:
train = train.drop(columns={'source_id'})

In [84]:
# Генерация фичи item_id_total_views (это будет мера популярности видео, Количество просмотров)
counts = train['item_id'].value_counts().reset_index()
counts.columns = ['item_id', 'item_id_total_views']
counts['item_id_total_views'] = counts['item_id_total_views'].astype('uint16')
train = train.merge(counts, on='item_id', how='left')
train.head(2)

Unnamed: 0,user_id,item_id,like,dislike,source_id_total_views,age_liked_median,age_disliked_median,age_difference,item_id_total_views
0,3810,138979,0,0,213.728806,26,33,-10,629
1,101874,331160,0,0,7.071068,45,33,-7,50


In [85]:
# Генерация фичи like_dislike_value, отношение лайков к дизлакам для каждого видео. Метрика популярности видео
# Извлекаем данные в numpy-массивы
item_ids = train['item_id'].to_numpy()
likes = train['like'].to_numpy()
dislikes = train['dislike'].to_numpy()

# Уникальные item_id и их индексы
unique_item_ids, inverse_indices = np.unique(item_ids, return_inverse=True)

# Суммируем лайки и дизлайки для каждого item_id
like_sums = np.bincount(inverse_indices, weights=likes)
dislike_sums = np.bincount(inverse_indices, weights=dislikes)

# Вычисляем отношение лайков к дизлайкам, избегая деления на ноль
like_to_dislike_ratio = like_sums / (dislike_sums + 1e-9)

# Создаём словарь с результатами
item_like_dislike_ratios = dict(zip(unique_item_ids, like_to_dislike_ratio))

# Присваиваем рассчитанное значение обратно в train
train['like_dislike_value'] = np.vectorize(item_like_dislike_ratios.get)(train['item_id'])
train['like_dislike_value'] = train['like_dislike_value'].astype('float32')

In [86]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145667282 entries, 0 to 145667281
Data columns (total 10 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   user_id                uint32 
 1   item_id                uint32 
 2   like                   uint8  
 3   dislike                uint8  
 4   source_id_total_views  float32
 5   age_liked_median       int8   
 6   age_disliked_median    int8   
 7   age_difference         int8   
 8   item_id_total_views    uint16 
 9   like_dislike_value     float32
dtypes: float32(2), int8(3), uint16(1), uint32(2), uint8(2)
memory usage: 3.1 GB


In [87]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,145667282.0,91459.71,52720.59,0.0,45984.0,90883.0,137123.0,183403.0
item_id,145667282.0,171930.6,97417.79,0.0,88571.0,173393.0,255384.0,337726.0
like,145667282.0,0.04660118,0.2107831,0.0,0.0,0.0,0.0,1.0
dislike,145667282.0,0.0003931013,0.01982289,0.0,0.0,0.0,0.0,1.0
source_id_total_views,145667282.0,293.6469,191.7715,2.645751,144.176971,246.123947,415.3769,876.246
age_liked_median,145667282.0,31.62894,6.769684,18.0,26.0,31.0,36.0,60.0
age_disliked_median,145667282.0,33.3288,6.522,18.0,32.0,33.0,34.0,60.0
age_difference,145667282.0,-0.4148293,10.59889,-42.0,-7.0,0.0,5.0,42.0
item_id_total_views,145667282.0,4501.559,4503.145,2.0,786.0,3019.0,6925.0,19622.0
like_dislike_value,145667282.0,49717960000.0,193773600000.0,0.0,17.0,343.75,12000000000.0,3140000000000.0


In [88]:
print(train['source_id_total_views'].quantile([0.01, 0.99]))
print(train['item_id_total_views'].quantile([0.01, 0.99]))
print(train['like_dislike_value'].quantile([0.01, 0.99]))

0.01     23.811762
0.99    852.478760
Name: source_id_total_views, dtype: float64
0.01       24.0
0.99    17926.0
Name: item_id_total_views, dtype: float64
0.01    0.000000e+00
0.99    1.005000e+12
Name: like_dislike_value, dtype: float64


In [89]:
lower, upper = train['source_id_total_views'].quantile([0.01, 0.99])
train['source_id_total_views'] = train['source_id_total_views'].clip(lower, upper)
lower, upper = train['item_id_total_views'].quantile([0.01, 0.99])
train['item_id_total_views'] = train['item_id_total_views'].clip(lower, upper)
lower, upper = train['like_dislike_value'].quantile([0.01, 0.99])
train['like_dislike_value'] = train['like_dislike_value'].clip(lower, upper)

In [90]:
scaler = StandardScaler()
train['source_id_total_views'] = scaler.fit_transform(train[['source_id_total_views']])
train['item_id_total_views'] = scaler.fit_transform(train[['item_id_total_views']])
train['like_dislike_value'] = scaler.fit_transform(train[['like_dislike_value']])

In [91]:
train['source_id_total_views'] = train['source_id_total_views'].astype('float16')
train['item_id_total_views'] = train['item_id_total_views'].astype('float16')
train['like_dislike_value'] = train['like_dislike_value'].astype('float16')

In [92]:
# Последняя фича - таргет like_dislike
train['like_dislike'] = train['like'] - train['dislike']
train = train.drop(columns=['like', 'dislike'])

In [93]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145667282 entries, 0 to 145667281
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   user_id                uint32 
 1   item_id                uint32 
 2   source_id_total_views  float16
 3   age_liked_median       int8   
 4   age_disliked_median    int8   
 5   age_difference         int8   
 6   item_id_total_views    float16
 7   like_dislike_value     float16
 8   like_dislike           uint8  
dtypes: float16(3), int8(3), uint32(2), uint8(1)
memory usage: 2.4 GB


In [94]:
# Перенос фич в тест. Третий и последний раз

In [95]:
test = test.merge(items[['item_id', 'source_id']], on='item_id', how='left')

In [96]:
train = train.merge(items[['item_id', 'source_id']], on='item_id', how='left')

In [97]:
counts = train.groupby('source_id', as_index=False).agg({
    'source_id_total_views': 'first'
})

test = test.merge(
    counts,         # Используем агрегированный датасет
    on='source_id',   # Объединяем
    how='left'      # Используем left join
)

In [98]:
"""Слишком тяжело
# Фича source_id_familiar в тесте
counts = train.groupby(['source_id', 'user_id'], as_index=False).agg({
    'source_id_familiar': 'last'  # Или 'max', если нужно объединить значения
})

# Объединяем с test
test = test.merge(
    counts,          # Используем агрегированный датасет
    on=['source_id', 'user_id'],  # Объединяем по колонкам source_id и user_id
    how='left'       # Используем left join для сохранения всех строк из test
)

# Заполняем пропуски в test['source_id_familiar'] нулями
test['source_id_familiar'] = test['source_id_familiar'].fillna(0).astype(int)
"""

"Слишком тяжело\n# Фича source_id_familiar в тесте\ncounts = train.groupby(['source_id', 'user_id'], as_index=False).agg({\n    'source_id_familiar': 'last'  # Или 'max', если нужно объединить значения\n})\n\n# Объединяем с test\ntest = test.merge(\n    counts,          # Используем агрегированный датасет\n    on=['source_id', 'user_id'],  # Объединяем по колонкам source_id и user_id\n    how='left'       # Используем left join для сохранения всех строк из test\n)\n\n# Заполняем пропуски в test['source_id_familiar'] нулями\ntest['source_id_familiar'] = test['source_id_familiar'].fillna(0).astype(int)\n"

In [99]:
train = train.drop(columns={'source_id'})
test = test.drop(columns={'source_id'})

In [100]:
counts = train.groupby('item_id', as_index=False).agg({
    'age_liked_median': 'first',
    'age_disliked_median': 'first'
})

test = test.merge(
    counts,         # Используем агрегированный датасет
    on='item_id',   # Объединяем
    how='left'      # Используем left join
)

In [101]:
test = test.merge(users[['user_id', 'age']], on='user_id', how='left')  # Временно вставим age
# Генерация фичи age_difference (Разница между возрастом пользователя и медианой возраста тех, кто лайкает видео)
test['age_difference'] = test['age_liked_median'] - test['age']

In [102]:
test.columns

Index(['user_id', 'item_id', 'duration', 'attention_video_median',
       'attention_mean', 'duration_median', 'gender_liked_mode',
       'gender_difference', 'like_mean', 'dislike_mean', 'share_mean',
       'bookmarks_mean', 'like_video_mean', 'dislike_video_mean',
       'share_video_mean', 'bookmarks_video_mean', 'source_id_total_views',
       'age_liked_median', 'age_disliked_median', 'age', 'age_difference'],
      dtype='object')

In [103]:
test = test.drop(columns={'age'})

In [104]:
counts = train.groupby('item_id', as_index=False).agg({
    'item_id_total_views': 'first',
    'like_dislike_value': 'first'
})

test = test.merge(
    counts,         # Используем агрегированный датасет
    on='item_id',   # Объединяем
    how='left'      # Используем left join
)

In [105]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1655120 entries, 0 to 1655119
Data columns (total 22 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   user_id                 1655120 non-null  int64  
 1   item_id                 1655120 non-null  int64  
 2   duration                1655120 non-null  uint8  
 3   attention_video_median  1655120 non-null  float16
 4   attention_mean          1655120 non-null  float16
 5   duration_median         1655120 non-null  float16
 6   gender_liked_mode       1655120 non-null  int8   
 7   gender_difference       1655120 non-null  int8   
 8   like_mean               1655120 non-null  float16
 9   dislike_mean            1655120 non-null  float16
 10  share_mean              1655120 non-null  float16
 11  bookmarks_mean          1655120 non-null  float16
 12  like_video_mean         1655120 non-null  float16
 13  dislike_video_mean      1655120 non-null  float16
 14  sh

In [106]:
test.to_parquet('/kaggle/working/test_0_20_featues.parquet')

In [107]:
train.to_parquet('/kaggle/working/train_interactions_full_data_14_20_featues.parquet')

In [108]:
del train

In [109]:
# Файлы трейна слишком тяжёлые, и скачать их не получается.
# Пришлось вырезать из них части, и сопоставлять, собирая чанки, но с полными строками.

In [110]:
file_path = '/kaggle/working/train_interactions_full_data_0_5_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_1_0_15 = table.to_pandas().iloc[:25_000_000]

In [111]:
file_path = '/kaggle/working/train_interactions_full_data_6_13_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_2_0_15 = table.to_pandas().iloc[:25_000_000]

In [112]:
file_path = '/kaggle/working/train_interactions_full_data_14_20_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_3_0_15 = table.to_pandas().iloc[:25_000_000]

In [113]:
train_1_0_15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000000 entries, 0 to 24999999
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   user_id                 uint32 
 1   item_id                 uint32 
 2   duration                uint8  
 3   attention_video_median  float16
 4   attention_mean          float16
 5   duration_median         float16
 6   gender_liked_mode       int8   
 7   gender_difference       int8   
dtypes: float16(3), int8(2), uint32(2), uint8(1)
memory usage: 405.3 MB


In [114]:
train_2_0_15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000000 entries, 0 to 24999999
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               uint32 
 1   item_id               uint32 
 2   like_mean             float16
 3   dislike_mean          float16
 4   share_mean            float16
 5   bookmarks_mean        float16
 6   like_video_mean       float16
 7   dislike_video_mean    float16
 8   share_video_mean      float16
 9   bookmarks_video_mean  float16
dtypes: float16(8), uint32(2)
memory usage: 572.2 MB


In [115]:
train_3_0_15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000000 entries, 0 to 24999999
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   user_id                uint32 
 1   item_id                uint32 
 2   source_id_total_views  float16
 3   age_liked_median       int8   
 4   age_disliked_median    int8   
 5   age_difference         int8   
 6   item_id_total_views    float16
 7   like_dislike_value     float16
 8   like_dislike           uint8  
dtypes: float16(3), int8(3), uint32(2), uint8(1)
memory usage: 429.2 MB


In [116]:
train = pd.concat(
    [train_1_0_15, train_2_0_15.iloc[:, 2:], train_3_0_15.iloc[:, 2:]],  # Выбираем все столбцы первого и все кроме первого из второго
    axis=1  # Присоединяем по столбцам
)
train.head()

Unnamed: 0,user_id,item_id,duration,attention_video_median,attention_mean,duration_median,gender_liked_mode,gender_difference,like_mean,dislike_mean,...,dislike_video_mean,share_video_mean,bookmarks_video_mean,source_id_total_views,age_liked_median,age_disliked_median,age_difference,item_id_total_views,like_dislike_value,like_dislike
0,3810,138979,54,-0.09845,-0.783203,0.57373,1,1,-0.495117,-0.30249,...,-0.632324,-0.822754,-0.572754,-0.41748,26,33,-10,-0.863281,-0.048676,0
1,101874,331160,6,0.912598,0.057037,-0.065369,2,1,-0.509277,-0.30249,...,-0.632324,2.273438,-0.572754,-1.410156,45,33,-7,-0.992676,-0.260498,0
2,150332,73709,16,0.002628,-1.084961,-0.521973,1,1,-0.112183,0.429688,...,0.677734,1.90332,-0.195068,0.734863,35,36,11,0.506348,-0.301514,0
3,4982,189745,25,0.621094,-0.486328,1.487305,1,1,-0.50293,-0.30249,...,-0.632324,-0.535645,-0.572754,-1.385742,19,33,-21,-0.822754,-0.294678,0
4,149601,289643,23,-0.19519,-0.771973,0.391113,1,1,-0.504883,-0.30249,...,4.601562,3.859375,1.3125,0.365234,23,24,-11,-0.852539,-0.301514,0


In [117]:
del train_1_0_15, train_2_0_15, train_3_0_15

In [118]:
train.to_parquet('/kaggle/working/train_interactions_0_25_data_all_featues.parquet')

In [119]:
# 25_000_000:50_000_000
file_path = '/kaggle/working/train_interactions_full_data_0_5_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_1 = table.to_pandas().iloc[25_000_000:50_000_000]
file_path = '/kaggle/working/train_interactions_full_data_6_13_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_2 = table.to_pandas().iloc[25_000_000:50_000_000]
file_path = '/kaggle/working/train_interactions_full_data_14_20_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_3 = table.to_pandas().iloc[25_000_000:50_000_000]

train = pd.concat(
    [train_1, train_2.iloc[:, 2:], train_3.iloc[:, 2:]],  # Выбираем все столбцы первого и все кроме первого из второго
    axis=1  # Присоединяем по столбцам
)
train.head()

del train_1, train_2, train_3

train.to_parquet('/kaggle/working/train_interactions_25_50_data_all_featues.parquet')

In [120]:
# 50_000_000:75_000_000
file_path = '/kaggle/working/train_interactions_full_data_0_5_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_1 = table.to_pandas().iloc[50_000_000:75_000_000]
file_path = '/kaggle/working/train_interactions_full_data_6_13_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_2 = table.to_pandas().iloc[50_000_000:75_000_000]
file_path = '/kaggle/working/train_interactions_full_data_14_20_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_3 = table.to_pandas().iloc[50_000_000:75_000_000]

train = pd.concat(
    [train_1, train_2.iloc[:, 2:], train_3.iloc[:, 2:]],  # Выбираем все столбцы первого и все кроме первого из второго
    axis=1  # Присоединяем по столбцам
)
train.head()

del train_1, train_2, train_3

train.to_parquet('/kaggle/working/train_interactions_50_75_data_all_featues.parquet')

In [121]:
# 75_000_000:100_000_000
file_path = '/kaggle/working/train_interactions_full_data_0_5_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_1 = table.to_pandas().iloc[75_000_000:100_000_000]
file_path = '/kaggle/working/train_interactions_full_data_6_13_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_2 = table.to_pandas().iloc[75_000_000:100_000_000]
file_path = '/kaggle/working/train_interactions_full_data_14_20_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_3 = table.to_pandas().iloc[75_000_000:100_000_000]

train = pd.concat(
    [train_1, train_2.iloc[:, 2:], train_3.iloc[:, 2:]],  # Выбираем все столбцы первого и все кроме первого из второго
    axis=1  # Присоединяем по столбцам
)
train.head()

del train_1, train_2, train_3

train.to_parquet('/kaggle/working/train_interactions_75_100_data_all_featues.parquet')

In [122]:
# 100_000_000:125_000_000
file_path = '/kaggle/working/train_interactions_full_data_0_5_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_1 = table.to_pandas().iloc[100_000_000:125_000_000]
file_path = '/kaggle/working/train_interactions_full_data_6_13_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_2 = table.to_pandas().iloc[100_000_000:125_000_000]
file_path = '/kaggle/working/train_interactions_full_data_14_20_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_3 = table.to_pandas().iloc[100_000_000:125_000_000]

train = pd.concat(
    [train_1, train_2.iloc[:, 2:], train_3.iloc[:, 2:]],  # Выбираем все столбцы первого и все кроме первого из второго
    axis=1  # Присоединяем по столбцам
)
train.head()

del train_1, train_2, train_3

train.to_parquet('/kaggle/working/train_interactions_100_125_data_all_featues.parquet')

In [123]:
# 125_000_000:
file_path = '/kaggle/working/train_interactions_full_data_0_5_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_1 = table.to_pandas().iloc[125_000_000:]
file_path = '/kaggle/working/train_interactions_full_data_6_13_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_2 = table.to_pandas().iloc[125_000_000:]
file_path = '/kaggle/working/train_interactions_full_data_14_20_featues.parquet'
table = pq.read_table(file_path, columns=None)
train_3 = table.to_pandas().iloc[125_000_000:]

train = pd.concat(
    [train_1, train_2.iloc[:, 2:], train_3.iloc[:, 2:]],  # Выбираем все столбцы первого и все кроме первого из второго
    axis=1  # Присоединяем по столбцам
)
train.head()

del train_1, train_2, train_3

train.to_parquet('/kaggle/working/train_interactions_125_143_data_all_featues.parquet')

In [124]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20667282 entries, 125000000 to 145667281
Data columns (total 23 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   user_id                 uint32 
 1   item_id                 uint32 
 2   duration                uint8  
 3   attention_video_median  float16
 4   attention_mean          float16
 5   duration_median         float16
 6   gender_liked_mode       int8   
 7   gender_difference       int8   
 8   like_mean               float16
 9   dislike_mean            float16
 10  share_mean              float16
 11  bookmarks_mean          float16
 12  like_video_mean         float16
 13  dislike_video_mean      float16
 14  share_video_mean        float16
 15  bookmarks_video_mean    float16
 16  source_id_total_views   float16
 17  age_liked_median        int8   
 18  age_disliked_median     int8   
 19  age_difference          int8   
 20  item_id_total_views     float16
 21  like_dislike_value  

In [125]:
file_path = '/kaggle/working/test_0_20_featues.parquet'
table = pq.read_table(file_path, columns=None)
test = table.to_pandas()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1655120 entries, 0 to 1655119
Data columns (total 22 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   user_id                 1655120 non-null  int64  
 1   item_id                 1655120 non-null  int64  
 2   duration                1655120 non-null  uint8  
 3   attention_video_median  1655120 non-null  float16
 4   attention_mean          1655120 non-null  float16
 5   duration_median         1655120 non-null  float16
 6   gender_liked_mode       1655120 non-null  int8   
 7   gender_difference       1655120 non-null  int8   
 8   like_mean               1655120 non-null  float16
 9   dislike_mean            1655120 non-null  float16
 10  share_mean              1655120 non-null  float16
 11  bookmarks_mean          1655120 non-null  float16
 12  like_video_mean         1655120 non-null  float16
 13  dislike_video_mean      1655120 non-null  float16
 14  sh

In [126]:
# Всё, модель попробуем. На самых последних данных чанка 125:

Попробовать: 
Собрать конвеер для incremental learning
Увеличить глубину дерева (depth).
Уменьшить learning_rate и увеличить iterations.
Увеличить регуляризацию l2_leaf_reg.
Задействовать GridSearchCV или Optuna для автоматического тюнинга параметров.
Переключиться на XGBoost, хорошо работает на разреженных данных.

# GridSearchCV для подбора гиперпараметров

In [127]:
from sklearn.model_selection import GridSearchCV, train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# Шаг 1: Загрузка данных
train = pd.read_parquet('/kaggle/working/train_interactions_125_143_data_all_featues.parquet')
train = train.drop(columns={'user_id', 'item_id'})
train = train.sample(frac=0.5, random_state=42)
test = pd.read_parquet('/kaggle/working/test_0_20_featues.parquet')
test = test.drop(columns={'user_id', 'item_id'})

# Разделение данных на признаки и целевую переменную
y = train["like_dislike"]
X = train.drop(columns={"like_dislike"})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Указываем категориальные признаки
cat_features = ['gender_liked_mode', 'gender_difference']

# Создаем Pool для обучения и тестирования
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=test, cat_features=cat_features)

# Шаг 2: Определение модели
model = CatBoostClassifier(
    thread_count=-1,
    loss_function='MultiClass',
    eval_metric='MultiClass',  # Для многоклассовой задачи
    task_type='GPU',
    verbose=800
)

# Шаг 3: Параметры для поиска
param_grid = {
    'iterations': [800],                 # Количество итераций (подобрал ранее)
    'depth': [7, 8, 9],                  # Глубина деревьев
    'learning_rate': [0.05, 0.8, 0.1],   # Темп обучения
    'l2_leaf_reg': [3, 7, 15],           # Регуляризация L2
    'subsample': [0.7],                  # Сэмплирование (подобрал ранее)
    'bootstrap_type': ['Poisson']        # Подвыборка
}

# Шаг 4: RandomizedSearchCV для поиска наилучших параметров
grid_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=30,
    cv=3,
    verbose=2,
    n_jobs=1,
    scoring='accuracy',  # Простейшая метрика
    refit=True
)

# Передача категориальных признаков через fit_params
grid_search.fit(X_train, y_train, **{'cat_features': cat_features})

# Шаг 5: Результаты поиска
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Получаем обученную модель с лучшими параметрами
best_model = grid_search.best_estimator_



Fitting 3 folds for each of 27 candidates, totalling 81 fits
0:	learn: 1.0119051	total: 216ms	remaining: 2m 52s
799:	learn: 0.1235651	total: 44.6s	remaining: 0us
[CV] END bootstrap_type=Poisson, depth=7, iterations=800, l2_leaf_reg=3, learning_rate=0.05, subsample=0.7; total time= 1.5min
0:	learn: 1.0119100	total: 78.1ms	remaining: 1m 2s
799:	learn: 0.1236010	total: 45.3s	remaining: 0us
[CV] END bootstrap_type=Poisson, depth=7, iterations=800, l2_leaf_reg=3, learning_rate=0.05, subsample=0.7; total time= 1.5min
0:	learn: 1.0119081	total: 77.4ms	remaining: 1m 1s
799:	learn: 0.1234927	total: 46.5s	remaining: 0us
[CV] END bootstrap_type=Poisson, depth=7, iterations=800, l2_leaf_reg=3, learning_rate=0.05, subsample=0.7; total time= 1.5min
0:	learn: 0.2701396	total: 78ms	remaining: 1m 2s
799:	learn: 0.1650774	total: 46.5s	remaining: 0us
[CV] END bootstrap_type=Poisson, depth=7, iterations=800, l2_leaf_reg=3, learning_rate=0.8, subsample=0.7; total time= 1.5min
0:	learn: 0.2701434	total: 76.

In [128]:
print(best_model.feature_names_)
print(best_model.get_cat_feature_indices())

['duration', 'attention_video_median', 'attention_mean', 'duration_median', 'gender_liked_mode', 'gender_difference', 'like_mean', 'dislike_mean', 'share_mean', 'bookmarks_mean', 'like_video_mean', 'dislike_video_mean', 'share_video_mean', 'bookmarks_video_mean', 'source_id_total_views', 'age_liked_median', 'age_disliked_median', 'age_difference', 'item_id_total_views', 'like_dislike_value']
[4, 5]


In [129]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'subsample': 0.7, 'learning_rate': 0.1, 'l2_leaf_reg': 15, 'iterations': 800, 'depth': 8, 'bootstrap_type': 'Poisson'}


In [130]:
# Применяем лучшую модель к тестовым данным
# Предсказания вероятностей для тестовых данных
test['predicted_prob'] = best_model.predict_proba(test_pool)[:, 1]

# Вывод результатов
test.iloc[:,-1:]  # Посмотрим на результаты

Unnamed: 0,predicted_prob
0,0.098958
1,0.179027
2,0.164233
3,0.081440
4,0.035734
...,...
1655115,0.000976
1655116,0.010532
1655117,0.001690
1655118,0.001262


In [131]:
# Перезапись и сохранение результатов
subm = pd.read_csv('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/sample_submission.csv')
subm['predict'] = test['predicted_prob']
subm.to_csv('/kaggle/working/sample_submission_gridsearch_best_model.csv', index=False)

# Посмотрим на часть данных в финальном файле
subm.head()

Unnamed: 0,user_id,item_id,predict
0,1,7363,0.098958
1,1,73770,0.179027
2,1,75700,0.164233
3,1,81204,0.08144
4,1,110249,0.035734


# Обучение модели

In [132]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [133]:
train = pd.read_parquet('/kaggle/working/train_interactions_125_143_data_all_featues.parquet')
train = train.drop(columns={'user_id', 'item_id'})
test = pd.read_parquet('/kaggle/working/test_0_20_featues.parquet')
test = test.drop(columns={'user_id', 'item_id'})

In [134]:
# train0_25 = pd.read_parquet('/kaggle/working/train_interactions_0_25_data_all_featues.parquet')
# train25_50 = pd.read_parquet('/kaggle/working/train_interactions_0_25_data_all_featues.parquet')
# train0_25 = train25_50.drop(columns={'user_id', 'item_id'})
# train25_50 = train25_50.drop(columns={'user_id', 'item_id'})
# train = pd.concat([train0_25, train25_50], ignore_index=True)
# del train0_25, train25_50

In [135]:
# Шаг 1: Разделяем данные
y = train["like_dislike"]
X = train.drop(columns={"like_dislike"})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
del train, X, y

In [136]:
# Категориальные признаки
cat_features = ['gender_liked_mode', 'gender_difference']

In [137]:
# Шаг 2: Создаём Pool для CatBoost
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [138]:
# Шаг 3: Создаём и обучаем модель
model = CatBoostClassifier(
    thread_count=-1,
    iterations=800,
    depth=8,
    learning_rate=0.1,
    l2_leaf_reg=15,
    loss_function='MultiClass',   # Многоклассовая задача
    eval_metric='AUC',            # Метрика для многоклассовой задачи
    task_type='GPU',
    devices='0-1',
    verbose=100,
    early_stopping_rounds=100,
    bootstrap_type='Poisson',     # Указываем тип бутстрэпа
    subsample=0.7,                # Применяем subsample с типом бутстрэпа Poisson
    border_count=254              # Устанавливаем нужное количество границ
)

# Обучение модели на данных
model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True
)

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7676394	best: 0.7676394 (0)	total: 718ms	remaining: 9m 33s
100:	test: 0.9549485	best: 0.9549694 (97)	total: 1m 23s	remaining: 9m 40s
200:	test: 0.9670950	best: 0.9671194 (196)	total: 2m 42s	remaining: 8m 4s
300:	test: 0.9684826	best: 0.9684826 (300)	total: 3m 59s	remaining: 6m 37s
400:	test: 0.9693932	best: 0.9693973 (386)	total: 5m 16s	remaining: 5m 14s
500:	test: 0.9696264	best: 0.9696276 (493)	total: 6m 31s	remaining: 3m 53s
600:	test: 0.9698142	best: 0.9698143 (597)	total: 7m 46s	remaining: 2m 34s
700:	test: 0.9700280	best: 0.9700286 (699)	total: 9m 1s	remaining: 1m 16s
799:	test: 0.9701106	best: 0.9701111 (794)	total: 10m 15s	remaining: 0us
bestTest = 0.9701110695
bestIteration = 794
Shrink model to first 795 iterations.


<catboost.core.CatBoostClassifier at 0x7ca2c2411540>

In [139]:
# Шаг 4: Оцениваем модель
# Получаем вероятности для каждого класса
y_pred = model.predict_proba(test_pool)

# Для многоклассовой классификации используем 'ovr' (one-vs-rest) или 'ovo' (one-vs-one)
roc_auc = roc_auc_score(y_test, y_pred, multi_class='ovr', average='macro')

print(f"ROC-AUC: {roc_auc:.4f}")

ROC-AUC: 0.9454


In [140]:
# Шаг 5: Создание Pool для тестовых данных
test_pool = Pool(data=test, cat_features=cat_features)

In [141]:
# Шаг 6: Предсказание вероятностей
test['predicted_prob'] = model.predict_proba(test_pool)[:, 1]

In [142]:
test.iloc[:,-1:]

Unnamed: 0,predicted_prob
0,0.083051
1,0.143101
2,0.167285
3,0.081847
4,0.037101
...,...
1655115,0.000716
1655116,0.009049
1655117,0.001547
1655118,0.001461


In [143]:
# Шаг 7: Перезапись и сохранение результатов
subm = pd.read_csv('/kaggle/input/eda-simpmod-fi-vk-2024-a01-s06-datasets/sample_submission.csv')
subm['predict'] = test['predicted_prob']
subm.to_csv('/kaggle/working/sample_submission_all_feas_Gsearch.csv', index=False)
subm.head()

Unnamed: 0,user_id,item_id,predict
0,1,7363,0.083051
1,1,73770,0.143101
2,1,75700,0.167285
3,1,81204,0.081847
4,1,110249,0.037101


In [144]:
# Модель позволила получить метрику 0,6409711144 на приватном датасете.
# Попробуем "вскрыть" эмбеддинги. В случае неудачи будем улучшать текущую модель