In [260]:
import pandas as pd
import seaborn as sb
import numpy as np

In [261]:
# Load the raw impression log and the user attribute table (both tab-separated).
history = pd.read_table('history (2).tsv')
users = pd.read_table('users (2).tsv')

In [262]:
# Load the validation campaigns and the expected answers for them (both tab-separated).
validate = pd.read_table('validate (1).tsv')
validate_answers = pd.read_table('validate_answers (1).tsv')

In [263]:
# Preview the impression log (columns: hour, cpm, publisher, user_id).
history.head()

Unnamed: 0,hour,cpm,publisher,user_id
0,10,30.0,1,15661
1,8,41.26,1,8444
2,7,360.0,1,15821
3,18,370.0,1,21530
4,8,195.0,2,22148


In [264]:
# Preview the user table (columns: user_id, sex, age, city_id).
users.head()

Unnamed: 0,user_id,sex,age,city_id
0,0,2,19,0
1,1,1,0,1
2,2,2,24,2
3,3,1,20,3
4,4,2,29,4


In [265]:
# Preview the validation campaigns (columns: cpm, hour_start, hour_end,
# publishers, audience_size, user_ids).
validate.head()

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids
0,220.0,1058,1153,717,1906,"12,44,46,50,58,71,93,122,134,143,176,184,187,1..."
1,312.0,1295,1301,318,1380,"29,81,98,102,165,167,195,205,218,231,242,263,3..."
2,70.0,1229,1249,12391521,888,"12,23,25,29,45,85,92,124,156,190,272,334,456,5..."
3,240.0,1295,1377,114,440,"44,122,187,209,242,255,312,345,382,465,513,524..."
4,262.0,752,990,1378,1476,"15,24,30,43,50,53,96,105,159,168,181,190,196,2..."


In [266]:
# Preview the expected answers (columns: at_least_one, at_least_two, at_least_three).
validate_answers.head()

Unnamed: 0,at_least_one,at_least_two,at_least_three
0,0.043,0.0152,0.0073
1,0.013,0.0,0.0
2,0.0878,0.0135,0.0
3,0.2295,0.1295,0.0727
4,0.3963,0.2785,0.227


In [267]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [268]:
# Number of impressions shown to each user in each hour of the history log.
user_hourly_counts = history.groupby(['user_id', 'hour']).size().reset_index(name='views')

# Per-user mean (over the hours in which the user appears) and total impressions.
user_agg = user_hourly_counts.groupby('user_id')['views'].agg(['mean', 'sum']).reset_index()

# Drop any 'mean'/'sum' columns left by a previous run of this cell first;
# merging again without this would produce suffixed duplicates (mean_x / mean_y).
users = (
    users
    .drop(columns=['mean', 'sum'], errors='ignore')
    .merge(user_agg, on='user_id', how='left')
    .fillna(0)  # users absent from the history log get zero activity
)

# Parse the comma-separated id strings into lists of ints. Values that are
# already lists (cell re-executed) are passed through unchanged instead of
# failing on `.strip()`.
validate['user_ids'] = validate['user_ids'].apply(
    lambda x: x if isinstance(x, list) else list(map(int, x.strip().split(',')))
)

def compute_user_features(user_ids, user_table=None):
    """Average the per-user activity features over one audience.

    Args:
        user_ids: Collection of user ids that make up the audience.
        user_table: Optional DataFrame with 'user_id', 'mean' and 'sum'
            columns. When omitted, the notebook-level ``users`` frame is used.

    Returns:
        pd.Series with two entries: 'mean_views' (average of the 'mean'
        column) and 'sum_views' (average of the 'sum' column) over the
        matching rows. Both are 0.0 when no id matches, so downstream models
        never receive NaN.
    """
    table = users if user_table is None else user_table
    subset = table[table['user_id'].isin(user_ids)]
    if subset.empty:
        # The mean of an empty selection is NaN; use the same "no activity"
        # value that the users table uses for unseen users.
        return pd.Series({'mean_views': 0.0, 'sum_views': 0.0})
    return pd.Series({
        'mean_views': subset['mean'].mean(),
        'sum_views': subset['sum'].mean(),
    })

# One row of audience-level features per validation campaign.
validate_user_features = validate['user_ids'].apply(compute_user_features)

# Replace (rather than append to) feature columns left by an earlier run of
# this cell; a plain concat would create duplicate column labels on re-run.
validate = pd.concat(
    [
        validate.drop(columns=validate_user_features.columns, errors='ignore'),
        validate_user_features,
    ],
    axis=1,
)


In [269]:
import numpy as np

In [270]:
# Fit a linear model on the per-user activity features.
# NOTE(review): the target ('sum') is also one of the two input columns, so the
# fit can reproduce it directly from that input — confirm this is intended.
X_train = users.loc[:, ['mean', 'sum']]
y_train = users.loc[:, 'sum']

model = LinearRegression().fit(X_train, y_train)

# Audience-level features, renamed to the column names the model was fitted on.
validate_features = (
    validate
    .loc[:, ['mean_views', 'sum_views']]
    .rename(columns={'mean_views': 'mean', 'sum_views': 'sum'})
)

# Predict per campaign and floor negative predictions at zero.
validate['prediction'] = np.clip(model.predict(validate_features), 0, None)

# Turn the prediction into three response columns: the prediction divided by
# 1, 2 and 3 respectively, then by the audience size, limited to [0, 1].
prediction = validate['prediction']
audience = validate['audience_size']
thresholds = {'at_least_one': 1, 'at_least_two': 2, 'at_least_three': 3}
responses = pd.DataFrame(
    {name: (prediction / divisor) / audience for name, divisor in thresholds.items()}
).clip(0, 1)

In [271]:
import metrics

# Score the linear-regression responses against the validation answers.
# NOTE(review): `metrics` is not defined in this notebook — presumably a local
# helper module that must sit next to it; confirm.
score = metrics.get_smoothed_mean_log_accuracy_ratio(validate_answers, responses)
print(f"lin reg score: {score}%")


lin reg score: 229.37%


In [272]:
!pip install lightgbm




In [273]:
import lightgbm as lgb

In [274]:
# Per-user mean and maximum CPM over the impression log.
cpm_agg = (
    history.groupby('user_id')['cpm']
    .agg(['mean', 'max'])
    .rename(columns={'mean': 'mean_cpm', 'max': 'max_cpm'})
    .reset_index()
)

# Per-user impression counts for each publisher: one 'publisher_<id>' indicator
# column per publisher, summed over the user's rows.
publisher_dummies = pd.get_dummies(history['publisher'], prefix='publisher')
publisher_dummies['user_id'] = history['user_id']
publisher_agg = publisher_dummies.groupby('user_id').sum().reset_index()

# Drop columns produced by a previous run of this cell before merging, so that
# re-executing it does not create suffixed duplicates (e.g. mean_cpm_x / mean_cpm_y).
stale_cols = [c for c in (*cpm_agg.columns, *publisher_agg.columns) if c != 'user_id']
users = (
    users
    .drop(columns=stale_cols, errors='ignore')
    .merge(cpm_agg, on='user_id', how='left')
    .merge(publisher_agg, on='user_id', how='left')
    .fillna(0)  # users absent from the history log get zeros
)

def parse_user_ids(x):
    """Convert a raw ``user_ids`` cell into a list of integer ids.

    Args:
        x: Either an already-parsed list or a comma-separated string such as
            ``"12,44,46"``.

    Returns:
        ``x`` itself when it is a list; the parsed integers when it is a
        string; an empty list for any other type (e.g. None or NaN).

    Raises:
        ValueError: If a non-blank token of the string is not an integer.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        # Skip blank tokens so that an empty string or a trailing comma yields
        # no id instead of crashing on int('').
        return [int(token) for token in x.split(',') if token.strip()]
    return []

# Normalise the campaign columns: parsed id lists, a second column holding the
# campaign CPM, and the publisher ids split into a list of strings.
validate['user_ids'] = validate['user_ids'].map(parse_user_ids)
validate['validate_cpm'] = validate['cpm']
validate['publisher_list'] = validate['publishers'].map(lambda raw: raw.strip().split(','))

# Names of the per-publisher indicator columns (everything except the user key).
all_publishers = [name for name in publisher_dummies.columns if name != 'user_id']

def compute_user_features(user_ids, user_table=None, publisher_cols=None):
    """Average per-user activity, CPM and publisher features over one audience.

    NOTE: this definition shadows the two-feature ``compute_user_features``
    defined in an earlier cell of this notebook.

    Args:
        user_ids: Collection of user ids that make up the audience.
        user_table: Optional DataFrame with 'user_id', 'mean', 'sum',
            'mean_cpm', 'max_cpm' and publisher columns. When omitted, the
            notebook-level ``users`` frame is used.
        publisher_cols: Optional list of publisher column names. When omitted,
            the notebook-level ``all_publishers`` list is used.

    Returns:
        pd.Series indexed by 'mean', 'sum', 'mean_cpm', 'max_cpm' followed by
        the publisher columns. Each entry is the mean of that column over the
        matching rows; a publisher column missing from the table contributes 0,
        and every entry is 0.0 when no id matches.
    """
    table = users if user_table is None else user_table
    pub_cols = all_publishers if publisher_cols is None else list(publisher_cols)
    base_cols = ['mean', 'sum', 'mean_cpm', 'max_cpm']

    subset = table[table['user_id'].isin(user_ids)]
    if subset.empty:
        # The mean of an empty selection is NaN; report "no activity" instead.
        return pd.Series(0.0, index=base_cols + pub_cols)

    features = {col: subset[col].mean() for col in base_cols}

    # One vectorised mean over all publisher columns that exist, instead of a
    # separate Series.mean() call per publisher.
    present = [col for col in pub_cols if col in subset.columns]
    pub_means = subset[present].mean()
    for col in pub_cols:
        features[col] = pub_means.get(col, 0)
    return pd.Series(features)

# One row of audience-level features per validation campaign.
validate_features = validate['user_ids'].apply(compute_user_features)

# Replace (rather than append to) feature columns left by an earlier run of
# this cell; a plain concat would create duplicate column labels on re-run.
validate = pd.concat(
    [
        validate.drop(columns=validate_features.columns, errors='ignore'),
        validate_features,
    ],
    axis=1,
)

# Make sure every publisher column exists on the training frame.
for col in all_publishers:
    if col not in users.columns:
        users[col] = 0

feature_cols = ['mean', 'sum', 'mean_cpm', 'max_cpm'] + all_publishers
X_train = users[feature_cols].fillna(0)
# NOTE(review): the target ('sum') is also listed in feature_cols, so the model
# can read the answer straight from its inputs — confirm this is intended.
y_train = users['sum']

# Train a LightGBM regressor on the per-user feature matrix for 100 boosting rounds.
train_data = lgb.Dataset(data=X_train, label=y_train)
params = dict(
    objective='regression',
    metric='mae',
    verbosity=-1,
    seed=42,
)
model = lgb.train(params=params, train_set=train_data, num_boost_round=100)

# Campaign feature matrix, with columns ordered as in the training matrix.
X_validate = validate[feature_cols].fillna(0).loc[:, X_train.columns]

# Predict per campaign and floor negative predictions at zero.
validate['prediction'] = np.clip(model.predict(X_validate), 0, None)

# Turn the prediction into three response columns: the prediction divided by
# 1, 2 and 3 respectively, then by the audience size, limited to [0, 1].
prediction = validate['prediction']
audience = validate['audience_size']
thresholds = {'at_least_one': 1, 'at_least_two': 2, 'at_least_three': 3}
responses = pd.DataFrame(
    {name: (prediction / divisor) / audience for name, divisor in thresholds.items()}
).clip(0, 1)

# Score the LightGBM responses against the validation answers.
score = metrics.get_smoothed_mean_log_accuracy_ratio(validate_answers, responses)
print(f'lgb score: {score}%')

lgb score: 229.49%


In [275]:
# Keep only the first occurrence of each column label. The explicit copy makes
# the result an independent frame, so the column assignment below cannot
# trigger SettingWithCopyWarning when columns were actually dropped.
validate = validate.loc[:, ~validate.columns.duplicated()].copy()

# Rebuild the feature matrix; reindex creates any missing feature column as
# NaN, which is then filled with 0.
X_validate = validate.reindex(columns=feature_cols).fillna(0)
validate['prediction'] = model.predict(X_validate).clip(0)

# Same response construction as in the previous cell: the prediction divided by
# 1, 2 and 3, then by the audience size, limited to [0, 1].
responses = pd.DataFrame({
    'at_least_one': validate['prediction'] / validate['audience_size'],
    'at_least_two': (validate['prediction'] / 2) / validate['audience_size'],
    'at_least_three': (validate['prediction'] / 3) / validate['audience_size'],
})
responses = responses.clip(0, 1)

score = metrics.get_smoothed_mean_log_accuracy_ratio(validate_answers, responses)
print(f'lgb score: {score}%')


lgb score: 229.49%
