In [17]:
import os
import time

import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [18]:
# Directory that holds the competition files. The original Windows folder stays
# the default; set the AVITO_DATA_DIR environment variable to point the notebook
# at another folder without editing this cell.
# Later cells append Windows-style suffixes (e.g. r'\train_features.parquet'),
# so the value must not end with a path separator.
path = os.environ.get(
    'AVITO_DATA_DIR',
    r'D:\Книги\Программирование\ML_contests_хакатоны\Avito_tech_huck_2024',
)

In [19]:
# Step 1: read only the columns that are needed, to save memory.
# The list order is also the column order of the loaded frame.
columns_to_read = [
    'user_id',
    'adv_campaign_id',
    'platform_id',
    'banner_code',
    'is_main',
    'user_id_mean',
    'adv_campaign_id_mean',
    'adv_creative_id_mean',
    'goal_cost',
    'goal_budget',
    'location_id',
    'logcat_id',
    'camp_len',
    'effect_goal',
    'target',
]

train_file = path + r'\train_features.parquet'
train = pd.read_parquet(train_file, columns=columns_to_read)

In [25]:
# Print the dtype of every loaded column and the frame's memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114741035 entries, 0 to 114741034
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               int32  
 1   adv_campaign_id       int16  
 2   platform_id           int8   
 3   banner_code           int8   
 4   is_main               int8   
 5   user_id_mean          float32
 6   adv_campaign_id_mean  float32
 7   adv_creative_id_mean  float32
 8   goal_cost             float32
 9   goal_budget           int32  
 10  location_id           int16  
 11  logcat_id             int16  
 12  camp_len              int16  
 13  effect_goal           int32  
 14  target                int8   
dtypes: float32(4), int16(4), int32(3), int8(4)
memory usage: 4.3 GB


In [27]:
# Baseline feature: for every (user_id, adv_campaign_id) pair keep the maximum
# `target` value observed in the full training frame.
user_ads_clicks = (
    train
    .groupby(["user_id", "adv_campaign_id"])["target"]
    .max()
    .reset_index()
)

In [28]:
# Preview the first rows of the per-pair aggregate.
user_ads_clicks.head()

Unnamed: 0,user_id,adv_campaign_id,target
0,1,2,0
1,1,41,0
2,1,57,0
3,1,70,0
4,1,72,0


In [30]:
# Expose the aggregated target under the name `base` and store it as int8.
user_ads_clicks = (
    user_ads_clicks
    .rename(columns={'target': 'base'})
    .astype({'base': 'int8'})
)

In [31]:
# The key columns were only needed for the aggregate above; they are not model features.
train = train.drop(columns=['user_id', 'adv_campaign_id'])

In [32]:
# Keep a seeded random 1% of the rows for training.
# NOTE(review): this rebinds `train`, so running the cell twice samples 1% of the 1%.
train = train.sample(random_state=42, frac=0.01)

In [33]:
# Re-check columns, non-null counts and memory usage after dropping the keys and sampling.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1147410 entries, 58287114 to 85112173
Data columns (total 13 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   platform_id           1147410 non-null  int8   
 1   banner_code           1147410 non-null  int8   
 2   is_main               1147410 non-null  int8   
 3   user_id_mean          1147410 non-null  float32
 4   adv_campaign_id_mean  1147410 non-null  float32
 5   adv_creative_id_mean  1147410 non-null  float32
 6   goal_cost             1147410 non-null  float32
 7   goal_budget           1147410 non-null  int32  
 8   location_id           1147410 non-null  int16  
 9   logcat_id             1147410 non-null  int16  
 10  camp_len              1147410 non-null  int16  
 11  effect_goal           1147410 non-null  int32  
 12  target                1147410 non-null  int8   
dtypes: float32(4), int16(3), int32(2), int8(4)
memory usage: 46.0 MB


In [34]:
# Step 2: split the data — features vs. label, then an 80/20 seeded split
X = train.drop(columns=["target"])
y = train["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [35]:
# Drop the name `train`; X, y and the split frames created above are what later cells use.
del train

In [36]:
# Categorical features: columns passed to CatBoost as `cat_features`
cat_features = [
    'platform_id',
    'banner_code',
    'is_main',
    'location_id',
    'logcat_id',
]

In [37]:
# Step 3: wrap the training and hold-out splits in CatBoost Pools
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

In [38]:
# Disabled alternative kept as a bare string literal: dump both splits to CSV and
# build the Pools from disk via a column-description file. Nothing inside the
# string is executed; the cell only echoes the string as its output.
# NOTE(review): consider deleting this cell or turning it into real comments.
"""
# Сохраняем на диске
X_train['target'] = y_train
X_train.to_csv(path + r'\X_train.csv', index=False)
X_test['target'] = y_test
X_test.to_csv(path + r'\X_test.csv', index=False)

del train, y, X, X_train, y_train, X_test, y_test

# Шаг 3: Создаём Pool для CatBoost с диска
train_pool = Pool(data=path + r'\X_train.csv', column_description=path + r'\column_description.txt')
test_pool = Pool(data=path + r'\X_test.csv', column_description=path + r'\column_description.txt')
"""

"\n# Сохраняем на диске\nX_train['target'] = y_train\nX_train.to_csv(path + r'\\X_train.csv', index=False)\nX_test['target'] = y_test\nX_test.to_csv(path + r'\\X_test.csv', index=False)\n\ndel train, y, X, X_train, y_train, X_test, y_test\n\n# Шаг 3: Создаём Pool для CatBoost с диска\ntrain_pool = Pool(data=path + r'\\X_train.csv', column_description=path + r'\\column_description.txt')\ntest_pool = Pool(data=path + r'\\X_test.csv', column_description=path + r'\\column_description.txt')\n"

In [39]:
# Step 4: configure and train the model
catboost_params = dict(
    # objective and the metric tracked on the eval set
    loss_function='Logloss',
    eval_metric='AUC',
    # tree ensemble shape and regularisation
    iterations=2000,
    depth=4,
    learning_rate=0.03,
    l2_leaf_reg=5,
    subsample=0.8,
    # stop once the eval metric has not improved for 50 iterations
    early_stopping_rounds=50,
    # runtime settings
    task_type="CPU",
    thread_count=-1,
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

# `test_pool` doubles as the early-stopping set; use_best_model keeps the best iteration.
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

0:	test: 0.6570897	best: 0.6570897 (0)	total: 737ms	remaining: 24m 33s
100:	test: 0.9170425	best: 0.9170499 (99)	total: 53.2s	remaining: 16m 39s
200:	test: 0.9182960	best: 0.9182960 (200)	total: 1m 45s	remaining: 15m 43s
300:	test: 0.9186011	best: 0.9186011 (300)	total: 2m 36s	remaining: 14m 44s
400:	test: 0.9187389	best: 0.9187521 (395)	total: 3m 32s	remaining: 14m 6s
500:	test: 0.9188176	best: 0.9188176 (500)	total: 4m 29s	remaining: 13m 27s
600:	test: 0.9188818	best: 0.9188820 (598)	total: 5m 21s	remaining: 12m 28s
700:	test: 0.9189167	best: 0.9189204 (698)	total: 6m 15s	remaining: 11m 35s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9189305193
bestIteration = 713

Shrink model to first 714 iterations.


<catboost.core.CatBoostClassifier at 0x1e25262fc10>

In [40]:
# Step 5: evaluate the model on the hold-out split.
# NOTE(review): this is the same pool that was passed as eval_set during fit,
# so the score is not from data unseen by early stopping.
holdout_proba = model.predict_proba(test_pool)
y_pred = holdout_proba[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred)
print(f"ROC-AUC: {roc_auc:.4f}")

ROC-AUC: 0.9189


In [41]:
# Step 1: prepare the test data — load the key columns plus the model features
columns_test_to_read = [
    'user_id',
    'adv_campaign_id',
    'platform_id',
    'banner_code',
    'is_main',
    'location_id',
    'logcat_id',
    'user_id_mean',
    'adv_campaign_id_mean',
    'adv_creative_id_mean',
    'goal_cost',
    'goal_budget',
    'camp_len',
    'effect_goal',
]

test_file = path + r'\test_features.csv'
test = pd.read_csv(test_file, usecols=columns_test_to_read)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1983287 entries, 0 to 1983286
Data columns (total 14 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               int64  
 1   adv_campaign_id       int64  
 2   platform_id           int64  
 3   banner_code           int64  
 4   is_main               int64  
 5   user_id_mean          float64
 6   adv_campaign_id_mean  float64
 7   adv_creative_id_mean  float64
 8   goal_cost             float64
 9   goal_budget           int64  
 10  location_id           int64  
 11  logcat_id             int64  
 12  camp_len              int64  
 13  effect_goal           int64  
dtypes: float64(4), int64(10)
memory usage: 211.8 MB


In [42]:
# Step 2: build the Pool used for inference on the test data.
# `test` also carries the key columns (user_id, adv_campaign_id) and, once the
# prediction cell has run, `predicted_prob`. None of these were in the training
# frame. Select exactly the columns the model was fitted on, in the fitted
# order, so the Pool does not rely on CatBoost matching columns by name and the
# cell stays safe to re-run.
test_pool = Pool(data=test[model.feature_names_], cat_features=cat_features)

In [43]:
# Step 3: apply the model
# Keep column 1 of predict_proba, i.e. the predicted probability of class 1
positive_proba = model.predict_proba(test_pool)[:, 1]
test['predicted_prob'] = positive_proba

In [44]:
# Display the test frame together with the new `predicted_prob` column.
# NOTE(review): `test.head()` would be enough for a preview.
test

Unnamed: 0,user_id,adv_campaign_id,platform_id,banner_code,is_main,user_id_mean,adv_campaign_id_mean,adv_creative_id_mean,goal_cost,goal_budget,location_id,logcat_id,camp_len,effect_goal,predicted_prob
0,2714742,3026,2,8,1,0.0,0.008517,0.008517,4.754986,5080,1,65,6,1068,0.000060
1,2714742,2994,2,8,1,0.0,0.005669,0.005669,7.819801,55768,30,65,14,7131,0.000045
2,2714742,97,2,8,1,0.0,0.007804,0.007804,5.138341,10024,1,65,6,1950,0.000066
3,2714742,3539,2,8,1,0.0,0.007723,0.007723,5.058388,9925,1,65,6,1962,0.000064
4,2714742,2756,2,8,1,0.0,0.010646,0.010646,4.827791,19293,30,65,9,3996,0.000089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1983282,2398626,1099,2,8,1,0.0,0.006284,0.006284,1.912499,3223,42,26,6,1685,0.000048
1983283,2398626,488,2,8,1,0.0,0.008186,0.008186,6.929049,4491,42,35,6,648,0.000056
1983284,2398626,49,2,8,1,0.0,0.006106,0.006106,2.658746,3163,42,30,7,1190,0.000044
1983285,2398626,1053,2,5,0,0.0,0.005639,0.005639,2.296300,6426,42,26,11,2798,0.000081


In [50]:
# Step 4: fill the submission template and save the results
sam = pd.read_csv(path + r'\sample_submission.csv')

# Predictions are copied row by row, so the template must list the same
# (user_id, adv_campaign_id) pairs in the same order as `test`. Fail loudly
# rather than silently attaching probabilities to the wrong rows.
key_cols = ['user_id', 'adv_campaign_id']
rows_match = (
    len(sam) == len(test)
    and (sam[key_cols].to_numpy() == test[key_cols].to_numpy()).all()
)
if not rows_match:
    raise ValueError(
        "sample_submission.csv rows are not aligned with test_features.csv; "
        "join the predictions on user_id / adv_campaign_id instead"
    )

sam['predict'] = test['predicted_prob'].to_numpy()

In [51]:
# Attach the per-pair baseline flag; pairs absent from train get NaN in `base`.
sam = pd.merge(
    sam,
    user_ads_clicks,
    how="left",
    on=["user_id", "adv_campaign_id"],
)

In [52]:
# Pairs with no match in the baseline table get a baseline of 0.
sam.loc[sam["base"].isna(), "base"] = 0.0

In [53]:
# Turn the baseline flag into a multiplier: base 0 -> 1, base 1 -> 4.
sam["base"] = sam["base"] * 3 + 1
sam["base"].value_counts()

base
1.0    1960532
4.0      22755
Name: count, dtype: int64

In [54]:
# Preview the first two rows with the raw prediction and its multiplier.
sam.head(2)

Unnamed: 0,user_id,adv_campaign_id,predict,base
0,2714742,3026,6e-05,1.0
1,2714742,2994,4.5e-05,1.0


In [55]:
# Scale each predicted probability by its baseline multiplier.
# NOTE(review): the result is a score, not a probability — it can exceed 1.
sam['predict'] = sam['predict'].mul(sam['base'])

In [58]:
# The multiplier column is no longer needed in the submission frame.
sam = sam.drop(columns=['base'])

In [59]:
# Preview the first two rows of the final submission frame.
sam.head(2)

Unnamed: 0,user_id,adv_campaign_id,predict
0,2714742,3026,6e-05
1,2714742,2994,4.5e-05


In [60]:
# Write the submission without the index column.
# NOTE(review): this is the same file the template was read from, so the
# original sample_submission.csv is overwritten.
sam.to_csv(path + r'\sample_submission.csv', index=False)