In [1]:
# !pip install \
#     --extra-index-url=https://pypi.nvidia.com \
#     "cudf-cu12==24.10.*" "dask-cudf-cu12==24.10.*" "cuml-cu12==24.10.*" \
#     "cugraph-cu12==24.10.*" "nx-cugraph-cu12==24.10.*" "cuspatial-cu12==24.10.*" \
#     "cuproj-cu12==24.10.*" "cuxfilter-cu12==24.10.*" "cucim-cu12==24.10.*" \
#     "pylibraft-cu12==24.10.*" "raft-dask-cu12==24.10.*" "cuvs-cu12==24.10.*"
# !pip install catboost
# !pip install numpy

In [1]:
# !wget https://storage.yandexcloud.net/ds-ods/files/files/0d7b7c0f/test.parquet -O test.parquet
# !wget https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/AvitoTechMLcup2024/data/train.parquet -O train.parquet
# !wget https://storage.yandexcloud.net/ds-ods/files/files/fe98ae56/campaigns_meta.parquet -O campaigns_meta.parquet

In [1]:
from itertools import combinations

import cudf
import catboost
import pandas as pd
import torch

from nn_predictions import get_nn_predictions_mean

In [2]:
# Display the versions of the main libraries imported above, for reproducibility.
cudf.__version__, catboost.__version__, pd.__version__, torch.__version__

('24.10.01', '1.2.7', '2.2.2', '2.2.0')

In [3]:
def preprocess():
    """Downcast the raw train and campaign tables and cache them as parquet.

    Reads ``train.parquet`` and ``campaigns_meta.parquet`` from the working
    directory and writes ``train_processed.parquet`` and ``meta.parquet``.
    Every date column is replaced by an integer number of days since the
    earliest ``event_date`` found in the train table.  Returns nothing.
    """
    data = cudf.read_parquet('train.parquet')

    # Narrow integer dtypes keep the table small on the GPU.
    # NOTE(review): the widths assume every id fits its target dtype — confirm.
    train_dtypes = {
        'user_id': 'int32',
        'adv_campaign_id': 'int16',
        'adv_creative_id': 'int16',
        'platform_id': 'int8',
        'banner_code': 'int8',
        'target': 'int8',
    }
    for column, dtype in train_dtypes.items():
        data[column] = data[column].astype(dtype)

    start_date = data['event_date'].min()
    data['event_date'] = (data['event_date'] - start_date).dt.days.astype('int8')

    data.to_parquet('train_processed.parquet')

    meta = cudf.read_parquet('campaigns_meta.parquet')

    # Campaign dates are not bounded by the span of the train log: a campaign
    # may start long before the first event or end long after it.  int8 would
    # silently wrap for offsets outside [-128, 127] days, so store them as int16.
    for column in ('start_date', 'end_date'):
        meta[column] = (meta[column] - start_date).dt.days.astype('int16')

    meta['location_id'] = meta['location_id'].astype('int8')
    meta['logcat_id'] = meta['logcat_id'].astype('int8')

    meta.to_parquet('meta.parquet')

In [None]:
# Writes train_processed.parquet and meta.parquet, which the next cell reads.
preprocess()

In [4]:
# Load the cached train table, the raw test table and the campaign metadata.
data = cudf.read_parquet('train_processed.parquet')
test = cudf.read_parquet('test.parquet')
meta = cudf.read_parquet('meta.parquet')

# Widen before subtracting: the difference of two narrow integer columns keeps
# the narrow dtype, so a campaign longer than 127 days would wrap around if the
# day offsets are stored as int8.
meta['campaign_duration'] = meta['end_date'].astype('int16') - meta['start_date'].astype('int16')
meta['budget'] = (meta['goal_budget'] / meta['goal_cost']).astype('float32')

# Attach the same per-campaign attributes to both train and test rows.
campaign_features = meta[['adv_campaign_id', 'location_id', 'logcat_id', 'campaign_duration', 'budget']]

data = data.merge(campaign_features, on='adv_campaign_id', how='left')
test = test.merge(campaign_features, on='adv_campaign_id', how='left')

In [5]:
def get_users_mask(data):
    """Return a boolean mask over ``data`` rows selecting users that have
    exactly two distinct ``target`` values among their rows."""
    targets_per_user = data.groupby('user_id')['target'].nunique()
    has_two_targets = targets_per_user == 2
    eligible_users = targets_per_user[has_two_targets].index
    return data['user_id'].isin(eligible_users)

def split_data(data, offset):
    """Split ``data`` into a history part and a single target day.

    The target day is the ``offset``-th distinct ``event_date`` counted from
    the latest one (``offset=0`` is the latest).  History holds all rows with
    an earlier date.  In both parts ``user_id`` is rewritten to
    ``user_id * 10 + offset``.  The target-day frame keeps only users selected
    by ``get_users_mask`` and is sorted by user and campaign.

    Returns ``(df_history, df)``.
    """
    distinct_days = data['event_date'].unique().sort_values().to_pandas().values
    target_day = distinct_days[-(offset + 1)]

    past = data[data['event_date'] < target_day]
    current = data[data['event_date'] == target_day]

    # Tag user ids with the offset in both frames.
    past['user_id'] = past['user_id'] * 10 + offset
    current['user_id'] = current['user_id'] * 10 + offset

    current = current[get_users_mask(current)]
    current = current.sort_values(['user_id', 'adv_campaign_id'])
    return past, current

In [12]:
def _get_group(ser1, ser2):
    coef = {
        'user_id': 10 ** 8,
        'banner_code': 10,
        'is_main': 10,
        'location_id': 100,
        'logcat_id': 100,
        'platform_id': 10,
        'adv_campaign_id': 10 ** 4,
        'adv_creative_id': 10 ** 4,
    }.get(ser2.name, 10 ** 8)
    return ser1.astype('int64') * coef + ser2

def get_group(df, cols):
    """Return one key series identifying each row's combination of ``cols``.

    A single column is returned as is; several columns are fused right to
    left with ``_get_group`` (the last two first, then each earlier column
    on top of the result).
    """
    key = df[cols[-1]]
    for col in reversed(cols[:-1]):
        key = _get_group(df[col], key)
    return key


def create_mappings(df, cols):
    """Compute weighted positive/negative totals per value of each column.

    Only users selected by ``get_users_mask`` are used.  Every ``target == 1``
    row is weighted by the number of ``target == 0`` rows of the same user,
    and every ``target == 0`` row by that user's number of ``target == 1``
    rows.  For each column in ``cols`` the weights are summed per column value.

    Returns a dict ``{col: (positive_sums, negative_sums)}``.
    """
    df = df[get_users_mask(df)]

    positives = df[df['target'] == 1]
    negatives = df[df['target'] == 0]

    positives_per_user = positives['user_id'].value_counts()
    negatives_per_user = negatives['user_id'].value_counts()

    # Each row is weighted by how many opposite-label rows its user has.
    positives['counts'] = positives['user_id'].map(negatives_per_user)
    negatives['counts'] = negatives['user_id'].map(positives_per_user)

    return {
        col: (
            positives.groupby(col)['counts'].sum(),
            negatives.groupby(col)['counts'].sum(),
        )
        for col in cols
    }

def create_features(df_history, df):
    """Build the model feature table for the rows of ``df``.

    All target statistics come from ``df_history`` and are mapped onto the
    rows of ``df`` by (combined) key value.

    Parameters
    ----------
    df_history : frame with at least ``event_date``, ``target`` and the key
        columns used below; source of every history statistic.
    df : frame whose rows receive features; must contain the key columns plus
        ``campaign_duration`` and ``budget``.

    Returns
    -------
    pandas.DataFrame with one row per row of ``df`` (converted with
    ``to_pandas``), columns in the order they are created below.
    """
    features = df[['adv_campaign_id', 'adv_creative_id', 'banner_code',
                   'location_id', 'logcat_id', 'campaign_duration', 'budget']]
    target_columns = ['user_id', 'adv_campaign_id', 'adv_creative_id',
                      'banner_code', 'location_id', 'logcat_id']

    # The latest history day does not depend on the column combination, so the
    # mask and the filtered frame are built once instead of once per iteration.
    last_day_mask = df_history['event_date'] == df_history['event_date'].max()
    df_last_day = df_history[last_day_mask]

    for r in [1, 2, 3]:
        for cols in combinations(target_columns, r=r):
            prefix = '_'.join(cols)

            group_history = get_group(df_history, cols)
            group = get_group(df, cols)

            # Mean historical target per key; -1 marks keys unseen in history.
            col_mean = df_history.groupby(group_history)['target'].mean().astype('float32')
            features[f'{prefix}_mean'] = group.map(col_mean).fillna(-1)

            if 'user_id' not in cols:
                # Share of the key among the rows of `df` itself.
                col_counts = group.value_counts(normalize=True).astype('float32')
                features[f'{prefix}_counts'] = group.map(col_counts).fillna(0)

                # Share of the key on the latest history day (0 if absent).
                group_last_day = get_group(df_last_day, cols)
                col_counts = group_last_day.value_counts(normalize=True).astype('float32')
                features[f'{prefix}_counts_2'] = group.map(col_counts).fillna(0)

                # Ratio of the two shares; the denominator is 0 for keys absent
                # from the latest history day, so this can be non-finite.
                features[f'{prefix}_counts_3'] = (
                    features[f'{prefix}_counts'] / features[f'{prefix}_counts_2']
                )

    # Distinct col2 values per col1 value, divided by the col1 value's row count.
    for col1, col2 in combinations(target_columns, r=2):
        nunique = df_history.groupby(col1)[col2].nunique() / df_history[col1].value_counts().astype('float32')
        features[f'{col1}_{col2}_nunique'] = df[col1].map(nunique).fillna(-1)

    target_columns = ['user_id', 'adv_campaign_id', 'adv_creative_id', 'banner_code']
    for r1 in [1, 2]:
        for cols1 in combinations(target_columns, r=r1):
            name1 = '_'.join(cols1)
            # These keys depend only on cols1, so build them once per cols1
            # rather than once per (cols1, cols2) pair.
            group_history1 = get_group(df_history, cols1)
            group_last_day1 = get_group(df_last_day, cols1)

            for r2 in [1, 2]:
                for cols2 in combinations(target_columns, r=r2):
                    if cols1 == cols2:
                        continue
                    name2 = '_'.join(cols2)
                    group2 = get_group(df, cols2)

                    # Distinct cols1 keys seen with each cols2 key, full history.
                    group_history2 = get_group(df_history, cols2)
                    nunique = group_history1.groupby(group_history2).nunique()
                    features[f'{name1}__{name2}_nunique'] = group2.map(nunique).fillna(0)

                    # Same statistic restricted to the latest history day.
                    group_last_day2 = get_group(df_last_day, cols2)
                    nunique = group_last_day1.groupby(group_last_day2).nunique()
                    features[f'{name1}__{name2}_nunique2'] = group2.map(nunique).fillna(0)

    cols = ['adv_campaign_id', 'banner_code']
    mappings = create_mappings(df_history, cols)
    for col in cols:
        pos_stat, neg_stat = mappings[col]
        pos = df[col].map(pos_stat).fillna(0).astype('float32')
        neg = df[col].map(neg_stat).fillna(0).astype('float32')
        # Both terms are 0 for values missing from the mappings, giving 0 / 0.
        features[f'pos_neg_stat_{col}'] = pos / (pos + neg)
        features[f'pos_count_{col}'] = pos

    # Prediction feature from the external nn_predictions module.
    features['nn_predict'] = get_nn_predictions_mean(df_history, df)

    return features.to_pandas()

In [None]:
%%time
# Build one training fold per held-out day: offset 0 uses the latest day in
# `data` as the target day, offset 7 the eighth-latest; earlier rows are history.
all_trains = []
all_train_features = []
for offset in range(8):
    train_history, train = split_data(data, offset=offset)
    train_features = create_features(train_history, train)
    all_trains.append(train.to_pandas())
    all_train_features.append(train_features)

# Stack the folds; both lists were appended in the same order, so the rows of
# `train_features` and `train` line up.
train_features = pd.concat(all_train_features)
train = pd.concat(all_trains)

In [None]:
# Features for the test rows, using the whole train table as history.
test_features = create_features(data, test)

In [None]:
# Training pool: each offset-tagged user id is passed as its own group.
# NOTE(review): CatBoost expects the rows of a group to be adjacent — split_data
# sorts every fold by user_id and tags ids with the fold offset; confirm this
# still holds after the concat above.
train_pool = catboost.Pool(
    data=train_features, 
    label=train['target'],
    group_id=train['user_id'],
)
# Inference pool: features only, no label or group information.
test_pool = catboost.Pool(
    data=test_features,
)

In [None]:
# Training configuration passed to catboost.train below.
# NOTE(review): no evaluation pool is passed, so all 20000 iterations are
# trained — confirm that is intended.
params = {
    'iterations': 20000,
    'depth': 6,
    'learning_rate': 0.05,
    'loss_function': 'YetiRank',
    'task_type': 'GPU', 
    'devices': '0',
}
# Train on the stacked folds; verbose_eval=100 controls how often progress is logged.
model = catboost.train(
    params=params, 
    pool=train_pool,
    verbose_eval=100, 
)

In [None]:
# Score the test rows and write the submission file.
# Assumes the rows of test_pool are in the same order as `test` (test_features
# was built from `test` without reordering) — verify if create_features changes.
test['predict'] = model.predict(test_pool)
test[['user_id', 'adv_campaign_id', 'predict']].to_csv('submission.csv', index=False)