In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from tqdm.notebook import tqdm
import metrics
import gc

In [3]:
valid_train = pd.read_csv('validate_train.csv', sep='\t')
valid_test = pd.read_csv('validate_test.csv', sep='\t')
train_ans = pd.read_csv('validate_answers_train.csv', sep='\t')
test_ans = pd.read_csv('validate_answers_test.csv', sep='\t')

In [4]:
users = pd.read_csv('users (2).tsv', sep='\t')
history = pd.read_csv('history (2).tsv', sep='\t')

In [5]:
def parse_user_ids(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x = x.replace('(', '').replace(')', '')
        return [int(i.strip()) for i in x.split(',') if i.strip().isdigit()]
    return []

valid_train['user_ids'] = valid_train['user_ids'].apply(parse_user_ids)
valid_test['user_ids'] = valid_test['user_ids'].apply(parse_user_ids)

history_grouped = history.groupby('user_id')

from collections import defaultdict

user_stats = defaultdict(lambda: {'ads': 0, 'cpm_total': 0, 'publishers_total': 0, 'response_sum': np.zeros(3)})

for idx, row in valid_train.iterrows():
    user_ids = row['user_ids']
    ans = train_ans.loc[idx][['at_least_one', 'at_least_two', 'at_least_three']].values
    for uid in user_ids:
        user_stats[uid]['ads'] += 1
        user_stats[uid]['cpm_total'] += row['cpm']
        user_stats[uid]['publishers_total'] += len(row['publishers'].split(','))
        user_stats[uid]['response_sum'] += ans

user_features_list = []

for uid in tqdm(users['user_id'], desc="user features"):
    stats = user_stats.get(uid, None)
    user_history = history_grouped.get_group(uid) if uid in history_grouped.groups else pd.DataFrame(columns=history.columns)
    features = {
        'user_id': uid,
        'mean_views': user_history.groupby('hour').size().mean() if not user_history.empty else 0,
        'sum_views': user_history.groupby('hour').size().sum() if not user_history.empty else 0,
        'mean_cpm': user_history['cpm'].mean() if not user_history.empty else 0,
        'max_cpm': user_history['cpm'].max() if not user_history.empty else 0,
        'unique_publishers': user_history['publisher'].nunique() if not user_history.empty else 0,
        'age': users.loc[users['user_id'] == uid, 'age'].values[0] if uid in users['user_id'].values else 25,
        'sex': users.loc[users['user_id'] == uid, 'sex'].values[0] if uid in users['user_id'].values else -1,
        'city_id': users.loc[users['user_id'] == uid, 'city_id'].values[0] if uid in users['user_id'].values else -1,
        'mean_response_1': stats['response_sum'][0] / stats['ads'] if stats else 0,
        'mean_response_2': stats['response_sum'][1] / stats['ads'] if stats else 0,
        'mean_response_3': stats['response_sum'][2] / stats['ads'] if stats else 0,
        'mean_cpm_train': stats['cpm_total'] / stats['ads'] if stats else 0,
        'mean_publishers_train': stats['publishers_total'] / stats['ads'] if stats else 0,
    }
    user_features_list.append(features)

user_df = pd.DataFrame(user_features_list)
user_df = pd.get_dummies(user_df, columns=['sex', 'city_id'])

feature_cols_user = [col for col in user_df.columns if col != 'user_id']
scaler_user = StandardScaler()
user_scaled = scaler_user.fit_transform(user_df[feature_cols_user])
kmeans = KMeans(n_clusters=4, random_state=42)
user_df['cluster'] = kmeans.fit_predict(user_scaled)

user_features_dict = user_df.set_index('user_id').to_dict('index')
feature_cols_user = [col for col in user_df.columns if col not in ['user_id', 'cluster']]

# Обучение по кластерам
model_by_cluster = {}

def build_train_sample_by_cluster(cluster_id):
    X, y = [], []
    user_ids_in_cluster = user_df[user_df['cluster'] == cluster_id]['user_id'].values
    cluster_validate_rows = valid_train[valid_train['user_ids'].apply(lambda ids: any(uid in user_ids_in_cluster for uid in ids))]
    cluster_answers = train_ans.loc[cluster_validate_rows.index]

    for i, row in cluster_validate_rows.iterrows():
        target_values = cluster_answers.loc[i][['at_least_one', 'at_least_two', 'at_least_three']].values
        for uid in row['user_ids']:
            if uid not in user_features_dict:
                continue
            if user_features_dict[uid]['cluster'] != cluster_id:
                continue
            feats_user = user_features_dict[uid]
            base = [feats_user[col] for col in feature_cols_user]
            ad_feats = [row['cpm'], row['hour_end'] - row['hour_start'], len(row['publishers'].split(','))]
            X.append(base + ad_feats)
            y.append(target_values)
    return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)

for cluster_id in range(4):
    print(f"Обучаем модель для кластера {cluster_id}")
    X_cluster, y_cluster = build_train_sample_by_cluster(cluster_id)
    if len(X_cluster) == 0:
        continue
    scaler = StandardScaler()
    X_cluster_scaled = scaler.fit_transform(X_cluster)
    model = MultiOutputRegressor(Ridge(alpha=10.0))
    model.fit(X_cluster_scaled, y_cluster)
    model_by_cluster[cluster_id] = (model, scaler)
    del X_cluster, y_cluster
    gc.collect()

# Предсказания
predictions = {'at_least_one': [], 'at_least_two': [], 'at_least_three': []}

for i, row in tqdm(valid_test.iterrows(), total=len(valid_test), desc="predict test"):
    preds = []
    for uid in row['user_ids']:
        feats_user = user_features_dict.get(uid)
        if feats_user is None:
            continue
        cluster = feats_user['cluster']
        if cluster not in model_by_cluster:
            continue
        model, scaler = model_by_cluster[cluster]
        base = [feats_user[col] for col in feature_cols_user]
        ad_feats = [row['cpm'], row['hour_end'] - row['hour_start'], len(row['publishers'].split(','))]
        feats_all = base + ad_feats
        feats_scaled = scaler.transform([feats_all])
        pred = model.predict(feats_scaled)[0]
        preds.append(pred)
    preds_avg = np.mean(preds, axis=0) if preds else [0.0, 0.0, 0.0]
    predictions['at_least_one'].append(preds_avg[0])
    predictions['at_least_two'].append(preds_avg[1])
    predictions['at_least_three'].append(preds_avg[2])

responses = pd.DataFrame({
    'at_least_one': np.array(predictions['at_least_one']) / valid_test['audience_size'],
    'at_least_two': np.array(predictions['at_least_two']) / valid_test['audience_size'],
    'at_least_three': np.array(predictions['at_least_three']) / valid_test['audience_size'],
}).clip(0, 1)

score = metrics.get_smoothed_mean_log_accuracy_ratio(test_ans, responses)
print(f"score = {score:.2f}%")


user features:   0%|          | 0/27769 [00:00<?, ?it/s]

Обучаем модель для кластера 0
Обучаем модель для кластера 1
Обучаем модель для кластера 2
Обучаем модель для кластера 3


predict test:   0%|          | 0/716 [00:00<?, ?it/s]

score = 407.47%
