In [1]:
import pandas as pd, lightgbm as lgb
from sklearn.model_selection import train_test_split
import numpy as np
from catboost import CatBoostRegressor

In [2]:
actions = pd.read_csv('actions.csv')
triggers = pd.read_csv('triggers.csv')

In [3]:
triggers = triggers.drop_duplicates()

In [5]:
triggers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37141059 entries, 0 to 43074626
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   guid     object
 1   date     object
 2   trigger  int64 
 3   type     int64 
dtypes: int64(2), object(2)
memory usage: 1.4+ GB


In [6]:
actions['result'].value_counts()

result
0    367066
1     11138
Name: count, dtype: int64

In [7]:
actions['date']  = pd.to_datetime(actions['date'])
triggers['date'] = pd.to_datetime(triggers['date'])

In [8]:
actions  = actions.sort_values(['date','guid']).reset_index(drop=True)
triggers = triggers.sort_values(['date','guid']).reset_index(drop=True)

In [None]:
triggers = triggers.rename(columns={'date':'date_t'})


actions  = actions.sort_values('date').reset_index(drop=True)
triggers = triggers.sort_values('date_t').reset_index(drop=True)


merged = pd.merge_asof(
    actions,
    triggers,
    left_on='date',       
    right_on='date_t',    
    by='guid',
    direction='backward',
)


In [11]:
merged['delta_sec'] = (merged['date'] - merged['date_t']).dt.total_seconds()
percentiles = merged['delta_sec'].quantile([0.90, 0.95, 0.99])
print(percentiles)

0.90    349.0
0.95    401.0
0.99    952.0
Name: delta_sec, dtype: float64


In [12]:
train = pd.merge_asof(
    actions,
    triggers,
    left_on='date',      
    right_on='date_t', 
    by='guid',
    direction='backward',
    tolerance=pd.Timedelta('7min')
)

In [13]:
train.isna().sum()

guid           0
date           0
result         0
date_t     14752
trigger    14752
type       14752
dtype: int64

In [14]:
train = train.dropna()
train = train.reset_index(drop= True)

In [15]:
print(train.columns.tolist())

['guid', 'date', 'result', 'date_t', 'trigger', 'type']


In [16]:
train['hour']  = train.date.dt.hour
train['wday']  = train.date.dt.weekday
train['month'] = train.date.dt.month

shifted = train.groupby('guid')['result'].shift(fill_value=0)

train['user_clicks_cum'] = shifted.groupby(train['guid']).cumsum()
train['user_shows_cum'] = train.groupby('guid').cumcount()
train['user_ctr'] = train['user_clicks_cum'] / train['user_shows_cum'].replace(0, np.nan)


In [17]:
train[train['user_ctr']>1]['user_ctr'].count()


0

In [18]:
train['user_ctr'].value_counts(ascending=False)

user_ctr
0.0    87760
1.0     3048
0.5      157
Name: count, dtype: int64

In [19]:
actions['clicks_cum'] = actions.groupby('guid')['result'].cumsum().shift(fill_value=0)
actions['shows_cum']  = actions.groupby('guid').cumcount()
hist = actions[['guid','date','clicks_cum','shows_cum']].rename(columns={'date': 'date_t'})
trig = triggers.sort_values(['guid','date_t']).reset_index(drop=True)

In [20]:
trig = trig.drop_duplicates(['guid', 'date_t'])
trig = trig.sort_values('date_t').reset_index(drop=True)

In [21]:
print("Есть ли NaN в date_t:", trig['date_t'].isna().any())
print("Есть ли дубликаты:", trig.duplicated(['guid', 'date_t']).any())

Есть ли NaN в date_t: False
Есть ли дубликаты: False


In [22]:
assert trig.groupby('guid')['date_t'].apply(lambda x: x.is_monotonic_increasing).all(), "Даты внутри guid не отсортированы!"

In [None]:
trig = pd.merge_asof(
    left=trig,
    right=hist,
    on='date_t',
    by='guid',
    direction='backward',
    tolerance=pd.Timedelta('365d')  
)

In [None]:
trig['hour']  = trig['date_t'].dt.hour
trig['wday']  = trig['date_t'].dt.weekday
trig['month'] = trig['date_t'].dt.month
trig[['clicks_cum','shows_cum']] = trig[['clicks_cum','shows_cum']].fillna(0)
trig['user_ctr'] = trig['clicks_cum'] / trig['shows_cum']
trig.loc[trig['shows_cum'] == 0, 'user_ctr'] = 0
trig['user_ctr'] = trig['user_ctr'].clip(upper=1.0)

In [25]:
trig['user_ctr'].value_counts()

user_ctr
0.0    33103389
1.0      218855
0.5        1819
Name: count, dtype: int64

In [None]:
features = ['trigger','type','hour','wday','month','user_ctr']

X_train, X_test, y_train, y_test = train_test_split(
    train[features], 
    train['result'],
    test_size=0.2,
    stratify=train['result'],
    random_state=42
)

dtrain = lgb.Dataset(X_train, y_train)
dtest = lgb.Dataset(X_test, y_test)

features = ['trigger','type','hour','wday','month','user_ctr']
dtrain = lgb.Dataset(train[features], label=train['result'])
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 15,
    'scale_pos_weight': 100,  
    'reg_alpha': 0.5,
    'reg_lambda': 0.5,
    'min_child_samples': 100,
    'verbosity': -1
}
model = lgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    valid_sets=[dtest],
    valid_names=['valid'],
    callbacks=[lgb.log_evaluation(100)]
)


[100]	valid's auc: 0.618115
[200]	valid's auc: 0.621368
[300]	valid's auc: 0.627417
[400]	valid's auc: 0.632339
[500]	valid's auc: 0.636009
[600]	valid's auc: 0.640095
[700]	valid's auc: 0.642877
[800]	valid's auc: 0.645909
[900]	valid's auc: 0.64848
[1000]	valid's auc: 0.651159
[1100]	valid's auc: 0.653501
[1200]	valid's auc: 0.655644
[1300]	valid's auc: 0.657877
[1400]	valid's auc: 0.659849
[1500]	valid's auc: 0.661868
[1600]	valid's auc: 0.663662
[1700]	valid's auc: 0.665131
[1800]	valid's auc: 0.666634
[1900]	valid's auc: 0.668255
[2000]	valid's auc: 0.669795


In [27]:
trig['p_click'] = model.predict(trig[features], num_iteration=model.best_iteration)

In [32]:
trig = trig.sort_values(['guid','date_t'])
trig['days_since_last'] = (trig.date_t - trig.groupby('guid')['date_t'].shift()).dt.days
mask_14d = (trig['days_since_last'] >= 14) | trig['days_since_last'].isna()
candidates = trig[mask_14d]

In [33]:
candidates = candidates.sort_values('p_click', ascending=False)

In [None]:
selected = []
balance = 0
for _, row in candidates.iterrows():
    exp_profit = 5*row.p_click - 1
    if balance + exp_profit < 0:      
        break
    balance += exp_profit
    selected.append(row)



In [36]:
plan = pd.DataFrame(selected)[['guid','date_t','p_click']]
plan.to_csv('banner_plan.csv', index=False)
print(f'Ожидаемый оборот: {5*plan.p_click.sum():.0f} $; ожидаемый баланс: {balance:.1f} $')

Ожидаемый оборот: 3634473 $; ожидаемый баланс: 2538724.4 $
