In [None]:
import pandas as pd
import seaborn as sb
import numpy as np

In [None]:
valid_train = pd.read_csv('validate_train.csv', sep='\t')
valid_test = pd.read_csv('validate_test.csv', sep='\t')
train_ans = pd.read_csv('validate_answers_train.csv', sep='\t')
test_ans = pd.read_csv('validate_answers_test.csv', sep='\t')

In [None]:
users = pd.read_csv('users (2).tsv', sep='\t')
history = pd.read_csv('history (2).tsv', sep='\t')

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [None]:
users.head()

Unnamed: 0,user_id,sex,age,city_id
0,0,2,19,0
1,1,1,0,1
2,2,2,24,2
3,3,1,20,3
4,4,2,29,4


In [None]:
history.head()

Unnamed: 0,hour,cpm,publisher,user_id
0,10,30.0,1,15661
1,8,41.26,1,8444
2,7,360.0,1,15821
3,18,370.0,1,21530
4,8,195.0,2,22148


In [None]:
valid_train.head()

Unnamed: 0.1,Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,average_age,percent_6_15,percent_16_25,...,mean_users_min_evening_cpm,mean_users_total_views_evening,average_cpm,min_cpm,max_cpm,most_active_hour,second_active_hour,third_active_hour,fourth_active_hour,fifth_active_hour
0,9,250.0,1440,1442,"(2, 3)",972,"(46, 58, 176, 209, 255, 284, 305, 312, 367, 41...",29.095679,4.835391,33.641975,...,101.10857,10.170782,198.263661,30.0,209053.98,21.0,20.0,22.0,19.0,18.0
1,15,89.0,1328,1339,"(2, 9, 12)",1020,"(35, 66, 86, 88, 96, 100, 107, 118, 154, 157, ...",25.528431,13.137255,36.372549,...,66.108147,9.191176,195.881392,30.0,55909.62,21.0,20.0,22.0,19.0,18.0
2,18,32.0,1328,1429,"(7, 21)",928,"(98, 100, 129, 130, 131, 151, 215, 344, 387, 4...",14.44181,100.0,0.0,...,42.513502,7.71875,241.496884,30.0,33147.27,13.0,15.0,14.0,11.0,16.0
3,37,152.0,1435,1465,"(2, 3, 17)",460,"(12, 120, 165, 274, 295, 468, 481, 671, 701, 8...",26.986957,0.0,0.0,...,83.986174,13.128261,198.189052,30.0,209053.98,21.0,20.0,22.0,19.0,18.0
4,43,130.0,1348,1447,"(3, 7, 13)",1906,"(12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",28.721406,5.403987,32.581322,...,97.915792,10.349948,215.803645,30.0,209053.98,21.0,20.0,19.0,15.0,14.0


In [None]:
from tqdm.notebook import tqdm

In [None]:
!pip install metrics



In [None]:
import metrics

In [None]:
def parse_user_ids(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x_clean = x.strip().replace('(', '').replace(')', '')
        return [int(uid.strip()) for uid in x_clean.split(',') if uid.strip().isdigit()]
    return []


valid_train['user_ids'] = valid_train['user_ids'].apply(parse_user_ids)
valid_test['user_ids'] = valid_test['user_ids'].apply(parse_user_ids)

history_grouped = history.groupby('user_id')

def build_user_features(user_id):
    user_history = history_grouped.get_group(user_id) if user_id in history_grouped.groups else pd.DataFrame(columns=history.columns)
    features = {
        'user_id': user_id,
        'mean_views': user_history.groupby('hour').size().mean() if not user_history.empty else 0,
        'sum_views': user_history.groupby('hour').size().sum() if not user_history.empty else 0,
        'mean_cpm': user_history['cpm'].mean() if not user_history.empty else 0,
        'max_cpm': user_history['cpm'].max() if not user_history.empty else 0,
        'unique_publishers': user_history['publisher'].nunique() if not user_history.empty else 0
    }

    # Сессии
    if not user_history.empty:
        user_history_sorted = user_history.sort_values(by='hour')
        user_history_sorted['hour_shift'] = user_history_sorted['hour'].shift()
        user_history_sorted['hour_diff'] = user_history_sorted['hour'] - user_history_sorted['hour_shift']
        user_history_sorted['new_session'] = (user_history_sorted['hour_diff'] > 6).astype(int)
        features['session_count'] = user_history_sorted['new_session'].sum()
    else:
        features['session_count'] = 0

    user_row = users[users['user_id'] == user_id]
    if not user_row.empty:
        features['age'] = user_row.iloc[0]['age']
        features['age'] = features['age'] if features['age'] > 0 else 25
        features['sex'] = user_row.iloc[0]['sex']
        features['city_id'] = user_row.iloc[0]['city_id']
    else:
        features['age'] = 25
        features['sex'] = -1
        features['city_id'] = -1

    return features

train_rows = []
for user_id in tqdm(users['user_id'], desc="train_df"):
    train_rows.append(build_user_features(user_id))
train_df = pd.DataFrame(train_rows)

# one-hot кодирование для sex и city_id
train_df = pd.get_dummies(train_df, columns=['sex', 'city_id'])

feature_cols = [col for col in train_df.columns if col != 'user_id']

train_df = train_df.merge(valid_train.explode('user_ids')[['user_ids']], left_on='user_id', right_on='user_ids', how='inner')
targets = train_ans[['at_least_one']].mean().values[0]
train_df['at_least_one'] = targets

X_train = train_df[feature_cols].fillna(0)
y_train = train_df['at_least_one']

train_data = lgb.Dataset(X_train, label=y_train)
params = {
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1,
    'seed': 42
}
print("Обучаем LightGBM")
model = lgb.train(params, train_data, num_boost_round=200)

train_df = train_df.drop_duplicates(subset='user_id').reset_index(drop=True)
user_features_dict = train_df.set_index('user_id').to_dict('index')

validate_predictions = []

for idx, row in tqdm(valid_test.iterrows(), total=len(valid_test), desc="Предсказываем test"):
    user_ids = row['user_ids']
    pred_sum = 0
    count = 0

    for user_id in user_ids:
        user_features = user_features_dict.get(user_id)
        if user_features:
            features = [user_features[col] for col in feature_cols]
            pred = model.predict(np.array([features]))[0]
            pred_sum += pred
            count += 1

    avg_pred = pred_sum / count if count > 0 else 0
    validate_predictions.append(avg_pred)

valid_test['prediction'] = validate_predictions
responses = pd.DataFrame({
    'at_least_one': valid_test['prediction'] / valid_test['audience_size'],
    'at_least_two': (valid_test['prediction'] / 2) / valid_test['audience_size'],
    'at_least_three': (valid_test['prediction'] / 3) / valid_test['audience_size'],
})
responses = responses.clip(0, 1)

score = metrics.get_smoothed_mean_log_accuracy_ratio(test_ans, responses)
print(f'score: {score:.2f}%')

train_df:   0%|          | 0/27769 [00:00<?, ?it/s]

Обучаем LightGBM


Предсказываем test:   0%|          | 0/716 [00:00<?, ?it/s]

score: 415.13%


In [None]:
print("at_least_two и at_least_three")

targets_one = train_ans['at_least_one'].mean()
targets_two = train_ans['at_least_two'].mean()
targets_three = train_ans['at_least_three'].mean()

train_df['at_least_one'] = targets_one
train_df['at_least_two'] = targets_two
train_df['at_least_three'] = targets_three

models = {}
for target in ['at_least_one', 'at_least_two', 'at_least_three']:
    X_train = train_df[feature_cols].fillna(0)
    y_train = train_df[target]

    train_data = lgb.Dataset(X_train, label=y_train)
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'verbosity': -1,
        'seed': 42
    }
    print(f"Обучаем LightGBM для {target}")
    model = lgb.train(params, train_data, num_boost_round=200)
    models[target] = model

validate_predictions = {'at_least_one': [], 'at_least_two': [], 'at_least_three': []}

for idx, row in tqdm(valid_test.iterrows(), total=len(valid_test), desc="Предсказываем test (multi-target)"):
    user_ids = row['user_ids']
    pred_sum = {'at_least_one': 0, 'at_least_two': 0, 'at_least_three': 0}
    count = 0

    for user_id in user_ids:
        user_features = user_features_dict.get(user_id)
        if user_features:
            features = [user_features[col] for col in feature_cols]
            for target in ['at_least_one', 'at_least_two', 'at_least_three']:
                pred = models[target].predict(np.array([features]))[0]
                pred_sum[target] += pred
            count += 1

    if count > 0:
        for target in ['at_least_one', 'at_least_two', 'at_least_three']:
            validate_predictions[target].append(pred_sum[target] / count)
    else:
        for target in ['at_least_one', 'at_least_two', 'at_least_three']:
            validate_predictions[target].append(0)

responses_multi = pd.DataFrame({
    'at_least_one': np.array(validate_predictions['at_least_one']) / valid_test['audience_size'],
    'at_least_two': np.array(validate_predictions['at_least_two']) / valid_test['audience_size'],
    'at_least_three': np.array(validate_predictions['at_least_three']) / valid_test['audience_size'],
})
responses_multi = responses_multi.clip(0, 1)

score_multi = metrics.get_smoothed_mean_log_accuracy_ratio(test_ans, responses_multi)
print(f'Multi-target LightGBM score = {score_multi:.2f}%')


at_least_two и at_least_three
Обучаем LightGBM для at_least_one
Обучаем LightGBM для at_least_two
Обучаем LightGBM для at_least_three


Предсказываем test (multi-target):   0%|          | 0/716 [00:00<?, ?it/s]

Multi-target LightGBM score = 415.19%
