In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the data files loaded below use relative paths rather than
# /content/drive/... - confirm the working directory actually contains them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the four tab-separated input tables, in the same order as before.
users, history_unsorted, validate, validate_answers = (
    pd.read_csv(tsv_path, delimiter='\t')
    for tsv_path in ('users.tsv', 'history.tsv', 'validate.tsv', 'validate_answers.tsv')
)

In [None]:
# Display the users table (pandas truncates the rendered rows).
users

Unnamed: 0,user_id,sex,age,city_id
0,0,2,19,0
1,1,1,0,1
2,2,2,24,2
3,3,1,20,3
4,4,2,29,4
...,...,...,...,...
27764,27764,1,38,295
27765,27765,2,30,79
27766,27766,2,21,1953
27767,27767,2,17,0


In [6]:
# Order impressions chronologically by hour and renumber the rows from zero.
history = (
    history_unsorted
    .sort_values(by='hour')
    .reset_index(drop=True)
)
history

Unnamed: 0,hour,cpm,publisher,user_id
0,3,163.49,1,15004
1,3,34.55,1,15015
2,3,174.82,1,20794
3,3,255.00,3,4127
4,3,151.05,1,12942
...,...,...,...,...
1147852,1490,229.14,1,24304
1147853,1490,678.15,1,17741
1147854,1490,300.34,2,12964
1147855,1490,189.00,1,7871


**Хотим построить 2 модели:**
1. *Модель активности пользователя*. Сколько объявлений (возможностей для аукциона) он вероятнее всего увидит в конкретный час?
2. *Модель аукциона*. Будет ли наше объявление выигрывать?

### Создадим больше признаков


In [7]:
# Split the absolute hour counter into whole days elapsed and hour within the
# day, then fold the day counter into a 7-day cycle.
# NOTE(review): day_of_week == 0 is just day 0 of the log; which calendar
# weekday that corresponds to is not shown here.
history['day'], history['hour_of_day'] = divmod(history['hour'], 24)
history['day_of_week'] = history['day'] % 7

In [8]:
# Peek at five random rows to check the new columns
# (no random_state is set, so the rows differ between runs).
history.sample(5)

Unnamed: 0,hour,cpm,publisher,user_id,day,hour_of_day,day_of_week
452317,608,355.0,1,9792,25,8,4
979370,1285,40.46,3,6033,53,13,4
285671,391,105.0,1,7741,16,7,2
815949,1078,61.5,1,24991,44,22,2
651488,862,32.0,1,19346,35,22,0


In [None]:
# Per-user aggregates computed over the whole impression history.
aggregations = {
    'hour': ['count', 'nunique'],      # total impressions, number of distinct active hours
    'day': ['nunique'],                # number of distinct active days
    'publisher': ['nunique'],          # number of distinct publishers
    'cpm': ['mean', 'median', 'max'],  # CPM statistics
}

user_agg_features = history.groupby('user_id').agg(aggregations)

# Collapse the two-level (column, statistic) header into flat "column_statistic" names.
user_agg_features.columns = user_agg_features.columns.map('_'.join)

# Give the count-like columns descriptive names (returns a new frame, no inplace).
user_agg_features = user_agg_features.rename(
    columns={
        'hour_count': 'total_impressions',
        'hour_nunique': 'active_hours',
        'day_nunique': 'active_days',
        'publisher_nunique': 'unq_publishers',
    }
)

# Average number of impressions on the days the user was active at all.
user_agg_features['avg_daily_impressions'] = user_agg_features['total_impressions'].div(
    user_agg_features['active_days']
)

user_agg_features.sample(5)

Unnamed: 0_level_0,total_impressions,active_hours,active_days,unq_publishers,cpm_mean,cpm_median,cpm_max,avg_daily_impressions
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
26104,1,1,1,1,229.4,229.4,229.4,1.0
26025,6,5,5,1,125.73,126.87,258.64,1.2
4825,217,129,51,1,242.798525,188.08,1199.5,4.254902
15185,12,10,8,1,168.6525,131.64,467.8,1.5
26347,11,10,10,1,289.362727,228.56,805.0,1.1


In [None]:
# Impressions per user for each hour of the day (one column per hour value present).
day_hour_profile = history.pivot_table(
    values='cpm',
    index='user_id',
    columns='hour_of_day',
    aggfunc='count',
    fill_value=0,
).pipe(
    # Replace the hour labels with flat "impressions_h_<hour>" column names.
    lambda counts: counts.set_axis(
        [f'impressions_h_{h}' for h in counts.columns], axis=1
    )
)
day_hour_profile.sample(5)

Unnamed: 0_level_0,impressions_h_0,impressions_h_1,impressions_h_2,impressions_h_3,impressions_h_4,impressions_h_5,impressions_h_6,impressions_h_7,impressions_h_8,impressions_h_9,...,impressions_h_14,impressions_h_15,impressions_h_16,impressions_h_17,impressions_h_18,impressions_h_19,impressions_h_20,impressions_h_21,impressions_h_22,impressions_h_23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7871,33,24,13,8,0,0,0,3,2,11,...,26,16,16,20,15,17,12,15,43,37
10884,0,0,0,0,2,3,6,3,0,0,...,0,1,3,0,1,2,3,0,1,0
13061,1,3,0,0,1,0,0,0,3,4,...,2,0,3,1,6,7,9,6,1,2
19775,0,3,0,0,0,0,0,4,1,0,...,5,0,2,1,1,0,2,5,4,3
21080,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
# Impressions per user for each position in the 7-day cycle.
week_day_profile = history.pivot_table(
    values='cpm',
    index='user_id',
    columns='day_of_week',
    aggfunc='count',
    fill_value=0,
).pipe(
    # Replace the day labels with flat "impressions_d_<day>" column names.
    lambda counts: counts.set_axis(
        [f'impressions_d_{d}' for d in counts.columns], axis=1
    )
)
week_day_profile.sample(5)

Unnamed: 0_level_0,impressions_d_0,impressions_d_1,impressions_d_2,impressions_d_3,impressions_d_4,impressions_d_5,impressions_d_6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12354,1,0,0,0,0,0,0
2385,2,1,3,3,4,0,3
15993,0,0,0,0,1,1,0
9302,1,1,0,0,3,1,1
7084,2,3,1,5,2,1,0


In [None]:
# Attach every per-user feature table to the users table.
# Left joins keep users that have no impression history (their features become NaN).
users_with_features = (
    users.copy()
    .merge(user_agg_features, how='left', on='user_id')
    .merge(day_hour_profile, how='left', on='user_id')
    .merge(week_day_profile, how='left', on='user_id')
)
users_with_features.sample(5)

Unnamed: 0,user_id,sex,age,city_id,total_impressions,active_hours,active_days,unq_publishers,cpm_mean,cpm_median,...,impressions_h_21,impressions_h_22,impressions_h_23,impressions_d_0,impressions_d_1,impressions_d_2,impressions_d_3,impressions_d_4,impressions_d_5,impressions_d_6
14053,14053,1,101,0,2.0,2.0,2.0,1.0,1158.38,1158.38,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
22847,22847,1,42,0,122.0,95.0,46.0,3.0,226.641148,175.0,...,6.0,10.0,4.0,13.0,20.0,22.0,22.0,11.0,18.0,16.0
14390,14390,1,20,163,1.0,1.0,1.0,1.0,90.0,90.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25258,25258,2,30,418,37.0,33.0,28.0,2.0,135.942703,105.0,...,1.0,2.0,1.0,5.0,3.0,7.0,6.0,8.0,4.0,4.0
20098,20098,2,20,1582,3.0,1.0,1.0,1.0,30.05,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


#### Заполняем пропуски после агрегации

In [None]:
# Count missing values per column after the left joins.
users_with_features.isnull().sum()

Unnamed: 0,0
user_id,0
sex,0
age,0
city_id,0
total_impressions,2233
active_hours,2233
active_days,2233
unq_publishers,2233
cpm_mean,2233
cpm_median,2233


In [None]:
# Only the joined feature columns (those absent from the raw users table) are filled.
cols_to_fill = users_with_features.columns.difference(users.columns)

# Users without any history get 0 in every feature column.
# NOTE(review): this also sets the cpm_* statistics to 0 for such users -
# confirm that 0 is an acceptable stand-in for "no CPM observed".
users_with_features = users_with_features.fillna(value=dict.fromkeys(cols_to_fill, 0))
users_with_features.isnull().sum()

Unnamed: 0,0
user_id,0
sex,0
age,0
city_id,0
total_impressions,0
active_hours,0
active_days,0
unq_publishers,0
cpm_mean,0
cpm_median,0


#### Преобразуем типы

In [None]:
# Inspect column dtypes: the joined count columns show up as float64 here.
users_with_features.dtypes

Unnamed: 0,0
user_id,int64
sex,int64
age,int64
city_id,int64
total_impressions,float64
active_hours,float64
active_days,float64
unq_publishers,float64
cpm_mean,float64
cpm_median,float64


In [None]:
# Count-like columns that should be stored as integers again:
# the aggregate counters plus the 24 hourly and 7 weekday impression counts.
cols_to_convert = [
    'total_impressions',
    'active_hours',
    'active_days',
    'unq_publishers',
    *(f'impressions_h_{h}' for h in range(24)),
    *(f'impressions_d_{d}' for d in range(7)),
]

conversion_dict = dict.fromkeys(cols_to_convert, 'int64')

users_with_features = users_with_features.astype(conversion_dict)
users_with_features.dtypes

Unnamed: 0,0
user_id,int64
sex,int64
age,int64
city_id,int64
total_impressions,int64
active_hours,int64
active_days,int64
unq_publishers,int64
cpm_mean,float64
cpm_median,float64


#### Выгрузим данные для постоянного хранения

In [None]:
# Summarize columns, non-null counts and dtypes before saving.
users_with_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27769 entries, 0 to 27768
Data columns (total 43 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                27769 non-null  int64  
 1   sex                    27769 non-null  int64  
 2   age                    27769 non-null  int64  
 3   city_id                27769 non-null  int64  
 4   total_impressions      27769 non-null  int64  
 5   active_hours           27769 non-null  int64  
 6   active_days            27769 non-null  int64  
 7   unq_publishers         27769 non-null  int64  
 8   cpm_mean               27769 non-null  float64
 9   cpm_median             27769 non-null  float64
 10  cpm_max                27769 non-null  float64
 11  avg_daily_impressions  27769 non-null  float64
 12  impressions_h_0        27769 non-null  int64  
 13  impressions_h_1        27769 non-null  int64  
 14  impressions_h_2        27769 non-null  int64  
 15  im

In [None]:
# Write the user feature table to users_with_features.pkl.
users_with_features.to_pickle('users_with_features.pkl')

### Соберем датасет для модели активности

In [None]:
# One row per (user, hour) that has at least one impression; target = number of impressions.
positive_samples = (
    history
    .groupby(['user_id', 'hour'])
    .size()
    .rename('target')
    .reset_index()
)

In [None]:
# Check the size and dtypes of the positive samples.
positive_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 836241 entries, 0 to 836240
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  836241 non-null  int64
 1   hour     836241 non-null  int64
 2   target   836241 non-null  int64
dtypes: int64(3)
memory usage: 19.1 MB


In [None]:
# Show ten random positive rows (unseeded, so they differ between runs).
positive_samples.sample(10)

Unnamed: 0,user_id,hour,target
588746,19472,210,1
250330,8205,1221,1
265083,8707,398,2
472977,15761,17,1
553814,18402,281,1
103991,3383,1220,2
789333,26169,739,2
114439,3745,134,1
376824,12510,1134,1
268197,8789,108,1


In [None]:
# Set of (user_id, hour) pairs that have at least one impression, for O(1) membership tests.
active_user_hours = set(zip(positive_samples['user_id'], positive_samples['hour']))

In [None]:
# Ask for two zero-impression (user, hour) rows for every active one.
positive_count = positive_samples.shape[0]
negative_count_desired = 2 * positive_count

In [None]:
# Distinct user ids from the users table (not only those that appear in history).
all_users = pd.unique(users['user_id'])

In [None]:
# Inclusive range of hours covered by the impression history.
min_hour, max_hour = history['hour'].min(), history['hour'].max()

In [None]:
# Rejection-sample (user_id, hour) pairs with no recorded impressions and label
# them target=0. Every sampled pair is unique and never collides with a positive.
rng = np.random.default_rng(42)  # fixed seed so the sampled negatives are reproducible

# Track taken pairs in a copy: the positive set itself stays untouched, so
# re-running this cell does not treat negatives from a previous run as taken.
seen_user_hours = set(active_user_hours)

negative_samples_list = []
generated_count = 0

while generated_count < negative_count_desired:
    # Draw exactly as many candidates as are still missing, so a round can
    # never overshoot; rejected candidates are redrawn in the next round.
    batch_size = negative_count_desired - generated_count
    candidate_users = rng.choice(all_users, size=batch_size).tolist()
    candidate_hours = rng.integers(min_hour, max_hour + 1, size=batch_size).tolist()

    for candidate in zip(candidate_users, candidate_hours):
        if candidate not in seen_user_hours:
            seen_user_hours.add(candidate)
            negative_samples_list.append((candidate[0], candidate[1], 0))
            generated_count += 1

negative_samples = pd.DataFrame(negative_samples_list, columns=['user_id', 'hour', 'target'])

In [None]:
# Check the size and dtypes of the negative samples.
negative_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1672482 entries, 0 to 1672481
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   user_id  1672482 non-null  int64
 1   hour     1672482 non-null  int64
 2   target   1672482 non-null  int64
dtypes: int64(3)
memory usage: 38.3 MB


In [None]:
# Show ten random negative rows (unseeded, so they differ between runs).
negative_samples.sample(10)

Unnamed: 0,user_id,hour,target
355520,25050,922,0
1442071,1222,84,0
1116743,16902,439,0
365910,17333,748,0
1598172,13743,1045,0
1477735,6025,903,0
1162552,26360,1313,0
715132,10609,620,0
942582,6239,896,0
101418,14808,1052,0


In [None]:
# Stack positives and negatives, then shuffle the rows.
# random_state pins the shuffle so the row order is reproducible across runs.
df_activity = (
    pd.concat([positive_samples, negative_samples], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Derive the time-of-day / day-in-cycle features for every sample, then attach
# the per-user feature table.
# NOTE(review): the output rendered below this cell contains ratio_* columns,
# which are only added to users_with_features in a later cell - presumably this
# cell was last executed out of order; confirm with Restart & Run All.
df_activity = df_activity.assign(
    hour_of_day=lambda frame: frame['hour'] % 24,
    day_of_week=lambda frame: (frame['hour'] // 24) % 7,
).merge(users_with_features, how='left', on='user_id')

In [None]:
# Show ten random rows of the assembled activity dataset (unseeded sample).
df_activity.sample(10)

Unnamed: 0,user_id,hour,target,hour_of_day,day_of_week,sex,age,city_id,total_impressions,active_hours,...,ratio_impressions_h_21,ratio_impressions_h_22,ratio_impressions_h_23,ratio_impressions_d_0,ratio_impressions_d_1,ratio_impressions_d_2,ratio_impressions_d_3,ratio_impressions_d_4,ratio_impressions_d_5,ratio_impressions_d_6
1122985,8588,1362,1,18,0,2,17,326,92,72,...,0.043478,0.01087,0.021739,0.130435,0.141304,0.119565,0.163043,0.26087,0.076087,0.108696
1987545,19399,575,0,23,2,2,77,251,55,45,...,0.0,0.018182,0.018182,0.145455,0.236364,0.127273,0.018182,0.2,0.163636,0.109091
2068313,18081,674,0,2,0,1,46,1951,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2135473,9924,953,1,17,4,1,17,0,17,15,...,0.058824,0.0,0.0,0.0,0.176471,0.176471,0.117647,0.411765,0.117647,0.0
1245939,5560,1208,0,8,1,1,36,158,13,9,...,0.076923,0.0,0.0,0.0,0.076923,0.384615,0.230769,0.076923,0.076923,0.153846
2219320,24028,1312,0,16,5,1,35,29,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128145,15221,102,0,6,4,2,18,0,27,26,...,0.0,0.111111,0.074074,0.259259,0.111111,0.185185,0.148148,0.111111,0.148148,0.037037
2112398,13893,914,0,2,3,2,17,16,6,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.0,0.166667,0.0
1783149,8333,216,1,0,2,1,20,332,139,110,...,0.079137,0.086331,0.043165,0.129496,0.158273,0.165468,0.208633,0.079137,0.136691,0.122302
796562,8485,1160,0,8,6,2,16,0,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Выгрузим данные

In [None]:
# Summarize the columns and dtypes of the activity dataset before saving.
df_activity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2508723 entries, 0 to 2508722
Data columns (total 78 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   user_id                 int64  
 1   hour                    int64  
 2   target                  int64  
 3   hour_of_day             int64  
 4   day_of_week             int64  
 5   sex                     int64  
 6   age                     int64  
 7   city_id                 int64  
 8   total_impressions       int64  
 9   active_hours            int64  
 10  active_days             int64  
 11  unq_publishers          int64  
 12  cpm_mean                float64
 13  cpm_median              float64
 14  cpm_max                 float64
 15  avg_daily_impressions   float64
 16  impressions_h_0         int64  
 17  impressions_h_1         int64  
 18  impressions_h_2         int64  
 19  impressions_h_3         int64  
 20  impressions_h_4         int64  
 21  impressions_h_5         int64  

In [None]:
# Write the activity dataset to activity.pkl.
df_activity.to_pickle('activity.pkl')

### Экспериментируем с обучением

In [None]:
import lightgbm as lgb
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Time-based split: hours before split_hour go to training, the rest to validation.
# NOTE(review): the per-user features merged into df_activity were aggregated
# over the full history, including hours >= split_hour, so validation scores
# may be optimistic - confirm whether that leakage is acceptable.
split_hour = 1080

is_validation_hour = df_activity['hour'] >= split_hour
train_df = df_activity[~is_validation_hour].copy()
val_df = df_activity[is_validation_hour].copy()

In [None]:
# Report how many rows ended up in the training and validation splits.
print(f"Размер обучающей выборки: {len(train_df)}")
print(f"Размер валидационной выборки: {len(val_df)}")

Размер обучающей выборки: 1809408
Размер валидационной выборки: 699315


In [None]:
# Everything except the identifiers and the label is used as a model feature
# (original column order is kept).
features_to_use = train_df.columns.drop(['user_id', 'hour', 'target']).tolist()

X_train, y_train = train_df[features_to_use], train_df['target']
X_val, y_val = val_df[features_to_use], val_df['target']

In [None]:
# Columns handed to LightGBM as categorical features.
# NOTE(review): 'age' and 'city_id' are integer codes with many distinct values -
# confirm that treating them as categories (rather than numeric) is intended.
categorical_features = ['sex', 'age', 'city_id', 'hour_of_day', 'day_of_week']

In [None]:
# Baseline
dummy_model = DummyRegressor(strategy='mean')
dummy_model.fit(X_train, y_train)

dummy_preds = dummy_model.predict(X_val)

dummy_rmse = np.sqrt(mean_squared_error(y_val, dummy_preds))
dummy_mae = mean_absolute_error(y_val, dummy_preds)

print(f"Dummy Regressor RMSE: {dummy_rmse:.4f}")
print(f"Dummy Regressor MAE: {dummy_mae:.4f}")
print("-" * 30)

Dummy Regressor RMSE: 0.8106
Dummy Regressor MAE: 0.6159
------------------------------


In [None]:
# LightGBM regressor with a Tweedie objective, evaluated with RMSE.
params = dict(
    objective='tweedie',
    metric='rmse',
    n_estimators=1500,
    learning_rate=0.1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    verbose=-1,
    n_jobs=-1,
    seed=42,
    boosting_type='gbdt',
)

lgbm_model = lgb.LGBMRegressor(**params)

# Training stops once the validation RMSE has not improved for 100 rounds.
# NOTE(review): the same validation split drives early stopping and the metrics
# reported below, so those metrics are not from an untouched hold-out set.
lgbm_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    categorical_feature=categorical_features,
    callbacks=[lgb.early_stopping(100, verbose=True)],
)

# Floor predictions at zero before scoring.
raw_preds = lgbm_model.predict(X_val)
lgbm_preds = np.where(raw_preds < 0, 0, raw_preds)

lgbm_mae = mean_absolute_error(y_val, lgbm_preds)
lgbm_rmse = np.sqrt(mean_squared_error(y_val, lgbm_preds))

print(f"\nLightGBM Regressor RMSE: {lgbm_rmse:.4f}")
print(f"LightGBM Regressor MAE: {lgbm_mae:.4f}")


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[842]	valid_0's rmse: 0.647554

LightGBM Regressor RMSE: 0.6476
LightGBM Regressor MAE: 0.3599


In [None]:
# Rank the features by the importance scores reported by the fitted model.
print("4. Важность признаков по версии LightGBM:")

importance_columns = {
    'feature': lgbm_model.feature_name_,
    'importance': lgbm_model.feature_importances_,
}
feature_importance_df = pd.DataFrame(importance_columns).sort_values(
    by='importance', ascending=False
)

# Plain-text view of the 20 highest-ranked features.
print(feature_importance_df.head(20))

4. Важность признаков по версии LightGBM:
                   feature  importance
4                  city_id        9318
0              hour_of_day        5624
3                      age        1686
1              day_of_week         753
5        total_impressions         349
6             active_hours         332
73   ratio_impressions_d_5         213
70   ratio_impressions_d_2         192
69   ratio_impressions_d_1         192
59  ratio_impressions_h_15         187
68   ratio_impressions_d_0         186
74   ratio_impressions_d_6         181
71   ratio_impressions_d_3         180
53   ratio_impressions_h_9         178
64  ratio_impressions_h_20         170
67  ratio_impressions_h_23         169
12   avg_daily_impressions         168
57  ratio_impressions_h_13         167
72   ratio_impressions_d_4         164
63  ratio_impressions_h_19         163


In [None]:
# Mean target value on the validation split.
avg_target = y_val.mean()
print(f"Среднее значение target в валидационной выборке: {avg_target}")

Среднее значение target в валидационной выборке: 0.4721734840522511


#### Попробуем изменить признаки: в дополнение к общему количеству будем смотреть на долю объявлений, показанных в конкретный час или день недели

In [None]:
# For every hourly / weekday impression count, add the share of the user's
# total impressions that falls into that slot ("ratio_<count column>").
# NOTE(review): this adds columns to users_with_features after the table was
# already pickled and merged into df_activity earlier in the notebook - confirm
# the intended cell order.
hourly_impression_cols = [f'impressions_h_{h}' for h in range(24)]
weekly_impression_cols = [f'impressions_d_{d}' for d in range(7)]

total_impressions = users_with_features['total_impressions']
has_impressions = total_impressions != 0

for col in hourly_impression_cols + weekly_impression_cols:
    ratio_col_name = f'ratio_{col}'
    # Users with zero impressions get a ratio of 0.0 instead of a 0/0 result.
    users_with_features[ratio_col_name] = (
        users_with_features[col] / total_impressions
    ).where(has_impressions, 0.0)


users_with_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27769 entries, 0 to 27768
Data columns (total 74 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 27769 non-null  int64  
 1   sex                     27769 non-null  int64  
 2   age                     27769 non-null  int64  
 3   city_id                 27769 non-null  int64  
 4   total_impressions       27769 non-null  int64  
 5   active_hours            27769 non-null  int64  
 6   active_days             27769 non-null  int64  
 7   unq_publishers          27769 non-null  int64  
 8   cpm_mean                27769 non-null  float64
 9   cpm_median              27769 non-null  float64
 10  cpm_max                 27769 non-null  float64
 11  avg_daily_impressions   27769 non-null  float64
 12  impressions_h_0         27769 non-null  int64  
 13  impressions_h_1         27769 non-null  int64  
 14  impressions_h_2         27769 non-null

In [None]:
# Show five random users to eyeball the new ratio_* columns (unseeded sample).
users_with_features.sample(5)

Unnamed: 0,user_id,sex,age,city_id,total_impressions,active_hours,active_days,unq_publishers,cpm_mean,cpm_median,...,ratio_impressions_h_21,ratio_impressions_h_22,ratio_impressions_h_23,ratio_impressions_d_0,ratio_impressions_d_1,ratio_impressions_d_2,ratio_impressions_d_3,ratio_impressions_d_4,ratio_impressions_d_5,ratio_impressions_d_6
5650,5650,2,28,0,15,11,7,1,183.546,170.0,...,0.0,0.0,0.2,0.266667,0.0,0.0,0.133333,0.0,0.266667,0.333333
6470,6470,1,0,7,10,8,6,1,584.682,569.46,...,0.0,0.1,0.0,0.0,0.0,0.3,0.2,0.0,0.1,0.4
21040,21040,1,53,1460,5,5,5,1,73.278,45.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.4,0.0,0.4
9380,9380,1,15,1315,16,16,15,1,127.32875,116.175,...,0.0,0.0625,0.0,0.1875,0.125,0.125,0.0625,0.25,0.125,0.125
20033,20033,1,15,2085,1,1,1,1,30.0,30.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Сохраним модель и датасеты

In [None]:
# Save the underlying Booster of the fitted LightGBM model to a text file.
model_filename = 'lgbm_activity_model.txt'
lgbm_model.booster_.save_model(model_filename)

<lightgbm.basic.Booster at 0x790fd21b4190>

In [None]:
# Re-save the user feature table, now including the ratio_* columns; this
# overwrites the pickle of the same name written earlier in the notebook.
users_with_features.to_pickle('users_with_features.pkl')

### Модель аукциона

In [2]:
from collections import defaultdict

In [11]:
# Collect every observed CPM for each (day_of_week, hour_of_day, publisher) slot.
slot_columns = ['day_of_week', 'hour_of_day', 'publisher']
cpm_groups = history.groupby(slot_columns)['cpm'].agg(list)

In [12]:
# Turn the grouped Series into a plain dict keyed by (day_of_week, hour_of_day, publisher).
cpm_slot_lookup = cpm_groups.to_dict()

In [13]:
# Store each slot's CPM list as a float32 NumPy array.
cpm_slot_lookup = {
    slot: np.array(slot_cpms, dtype=np.float32)
    for slot, slot_cpms in cpm_slot_lookup.items()
}

print(f"Создали словарь для {len(cpm_slot_lookup)} уникальных слотов (d, h, pub).")

Создан словарь для 2956 уникальных слотов (d, h, pub).


In [16]:
import pickle

# Serialize cpm_slot_lookup to cpm_slot_lookup.pkl (binary pickle).
# NOTE(review): pickle files should only ever be loaded from trusted sources.
filename = 'cpm_slot_lookup.pkl'

with open(filename, 'wb') as f:
    pickle.dump(cpm_slot_lookup, f)

In [15]:
def get_win_probability(my_cpm, publisher, hour, cpm_lookup, default_win_prob=0.5):
    """Estimate the chance that a bid of ``my_cpm`` wins in a given slot.

    The slot is identified by ``(day_of_week, hour_of_day, publisher)``, where
    the two time components are derived from the absolute ``hour`` counter.
    The estimate is the empirical share of CPMs recorded for that slot that
    are strictly below ``my_cpm``, plus half the share that are exactly equal
    (ties count as a coin flip).

    Args:
        my_cpm: Bid to evaluate (scalar).
        publisher: Publisher id used as the third component of the slot key.
        hour: Absolute hour index; ``(hour // 24) % 7`` and ``hour % 24`` give
            the first two components of the slot key.
        cpm_lookup: Mapping from slot key to the CPMs observed in that slot.
            Values may be NumPy arrays or plain sequences such as lists.
        default_win_prob: Value returned when the slot is missing from
            ``cpm_lookup`` or has no recorded CPMs. Defaults to 0.5.

    Returns:
        float: Estimated win probability in ``[0, 1]``.
    """
    day_of_week = (hour // 24) % 7
    hour_of_day = hour % 24
    slot_key = (day_of_week, hour_of_day, publisher)

    competitor_cpms = cpm_lookup.get(slot_key)
    if competitor_cpms is None:
        return default_win_prob

    # Accept plain lists as well as arrays: comparing a scalar with a Python
    # list would otherwise raise TypeError.
    competitor_cpms = np.asarray(competitor_cpms)
    total_competitors = competitor_cpms.size
    if total_competitors == 0:
        return default_win_prob

    # Compare in the precision the competitor bids are stored in, so a bid equal
    # to a stored float32 value still registers as a tie when it is passed in
    # as a float64 scalar.
    if np.issubdtype(competitor_cpms.dtype, np.floating):
        my_cpm = competitor_cpms.dtype.type(my_cpm)

    p_less = np.sum(my_cpm > competitor_cpms) / total_competitors
    p_equal = np.sum(my_cpm == competitor_cpms) / total_competitors

    return float(p_less + 0.5 * p_equal)