In [32]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
import os

In [2]:
path = r'D:\Книги\Программирование\ML_contests_хакатоны\Avito_tech_huck_2024'

In [3]:
# Шаг 1: Читаем только нужные столбцы для экономии памяти
columns_to_read = ['user_id', 'adv_campaign_id',
    'platform_id', 'banner_code', 'is_main', 'user_id_mean', 'adv_campaign_id_mean', 
    'adv_creative_id_mean', 'goal_cost', 'goal_budget', 
    'location_id', 'logcat_id', 'camp_len', 'effect_goal', 'target'
]

train = pd.read_parquet(path + r'\train_features.parquet', columns=columns_to_read)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114741035 entries, 0 to 114741034
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               int32  
 1   adv_campaign_id       int16  
 2   platform_id           int8   
 3   banner_code           int8   
 4   is_main               int8   
 5   user_id_mean          float32
 6   adv_campaign_id_mean  float32
 7   adv_creative_id_mean  float32
 8   goal_cost             float32
 9   goal_budget           int32  
 10  location_id           int16  
 11  logcat_id             int16  
 12  camp_len              int16  
 13  effect_goal           int32  
 14  target                int8   
dtypes: float32(4), int16(4), int32(3), int8(4)
memory usage: 4.3 GB


In [5]:
# Добавим бейзлайновую фичу
# user_ads_clicks = train.groupby(["user_id", "adv_campaign_id"], as_index=False)["target"].max()
# user_ads_clicks.head()
# user_ads_clicks = user_ads_clicks.rename(columns={'target': 'base'})
# user_ads_clicks['base'] = user_ads_clicks['base'].astype('int8')
# user_ads_clicks.to_csv(path + r'\user_ads_clicks.csv', index=False)

In [6]:
train = train.drop(columns={'user_id', 'adv_campaign_id'})

In [7]:
train = train.sample(frac=0.15, random_state=42)

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17211155 entries, 58287114 to 23233480
Data columns (total 13 columns):
 #   Column                Dtype  
---  ------                -----  
 0   platform_id           int8   
 1   banner_code           int8   
 2   is_main               int8   
 3   user_id_mean          float32
 4   adv_campaign_id_mean  float32
 5   adv_creative_id_mean  float32
 6   goal_cost             float32
 7   goal_budget           int32  
 8   location_id           int16  
 9   logcat_id             int16  
 10  camp_len              int16  
 11  effect_goal           int32  
 12  target                int8   
dtypes: float32(4), int16(3), int32(2), int8(4)
memory usage: 689.4 MB


In [9]:
# Шаг 2: Разделяем данные
y = train["target"]
X = train.drop(columns={"target"})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
del train

In [11]:
# Категориальные признаки
cat_features = ['platform_id', 'banner_code', 'is_main', 'location_id', 'logcat_id']

In [12]:
# Шаг 3: Создаём Pool для CatBoost
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [13]:
"""
# Сохраняем на диске
X_train['target'] = y_train
X_train.to_csv(path + r'\X_train.csv', index=False)
X_test['target'] = y_test
X_test.to_csv(path + r'\X_test.csv', index=False)

del train, y, X, X_train, y_train, X_test, y_test

# Шаг 3: Создаём Pool для CatBoost с диска
train_pool = Pool(data=path + r'\X_train.csv', column_description=path + r'\column_description.txt')
test_pool = Pool(data=path + r'\X_test.csv', column_description=path + r'\column_description.txt')
"""

"\n# Сохраняем на диске\nX_train['target'] = y_train\nX_train.to_csv(path + r'\\X_train.csv', index=False)\nX_test['target'] = y_test\nX_test.to_csv(path + r'\\X_test.csv', index=False)\n\ndel train, y, X, X_train, y_train, X_test, y_test\n\n# Шаг 3: Создаём Pool для CatBoost с диска\ntrain_pool = Pool(data=path + r'\\X_train.csv', column_description=path + r'\\column_description.txt')\ntest_pool = Pool(data=path + r'\\X_test.csv', column_description=path + r'\\column_description.txt')\n"

In [14]:
# Шаг 4: Создаём и обучаем модель
model = CatBoostClassifier(
    thread_count=-1,
    iterations=2000,
    depth=4,
    learning_rate=0.03,
    l2_leaf_reg=5,
    subsample=0.8,
    loss_function='Logloss',
    eval_metric='AUC',
    task_type="CPU",
    verbose=100,
    early_stopping_rounds=50
)

model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True
)

0:	test: 0.5731356	best: 0.5731356 (0)	total: 8.1s	remaining: 4h 30m
100:	test: 0.9167160	best: 0.9167160 (100)	total: 12m 17s	remaining: 3h 51m 8s
200:	test: 0.9214919	best: 0.9214919 (200)	total: 25m 21s	remaining: 3h 47m 1s
300:	test: 0.9217770	best: 0.9217788 (298)	total: 37m 35s	remaining: 3h 32m 10s
400:	test: 0.9218887	best: 0.9218887 (400)	total: 49m 30s	remaining: 3h 17m 24s
500:	test: 0.9219669	best: 0.9219669 (500)	total: 1h 2m 11s	remaining: 3h 6m 4s
600:	test: 0.9220371	best: 0.9220371 (600)	total: 1h 15m 16s	remaining: 2h 55m 14s
700:	test: 0.9220769	best: 0.9220773 (697)	total: 1h 28m 51s	remaining: 2h 44m 38s
800:	test: 0.9220991	best: 0.9220993 (797)	total: 1h 40m 59s	remaining: 2h 31m 10s
900:	test: 0.9221103	best: 0.9221104 (899)	total: 1h 53m 34s	remaining: 2h 18m 31s
1000:	test: 0.9221233	best: 0.9221234 (997)	total: 2h 6m 50s	remaining: 2h 6m 35s
1100:	test: 0.9221327	best: 0.9221327 (1100)	total: 2h 19m 7s	remaining: 1h 53m 35s
1200:	test: 0.9221439	best: 0.92214

<catboost.core.CatBoostClassifier at 0x2201ca19f50>

In [33]:
# Сохранение модели в файл
model.save_model(os.path.join(path, "catboost_model.cbm"))

In [15]:
# Шаг 5: Оцениваем модель
y_pred = model.predict_proba(test_pool)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC-AUC: {roc_auc:.4f}")

ROC-AUC: 0.9222


In [16]:
# Шаг 1: Подготовка тестовых данных
columns_test_to_read = ['user_id', 'adv_campaign_id',
    'platform_id', 'banner_code', 'is_main', 'location_id', 'logcat_id', 
    'user_id_mean', 'adv_campaign_id_mean', 'adv_creative_id_mean', 
    'goal_cost', 'goal_budget', 'camp_len', 'effect_goal'
]

test = pd.read_csv(path + r'\test_features.csv', usecols=columns_test_to_read)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1983287 entries, 0 to 1983286
Data columns (total 14 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               int64  
 1   adv_campaign_id       int64  
 2   platform_id           int64  
 3   banner_code           int64  
 4   is_main               int64  
 5   user_id_mean          float64
 6   adv_campaign_id_mean  float64
 7   adv_creative_id_mean  float64
 8   goal_cost             float64
 9   goal_budget           int64  
 10  location_id           int64  
 11  logcat_id             int64  
 12  camp_len              int64  
 13  effect_goal           int64  
dtypes: float64(4), int64(10)
memory usage: 211.8 MB


In [17]:
# Шаг 2: Создание Pool для тестовых данных
test_pool = Pool(data=test, cat_features=cat_features)

In [18]:
# Шаг 3: Применение модели
# Предсказание вероятностей
test['predicted_prob'] = model.predict_proba(test_pool)[:, 1]

In [19]:
test

Unnamed: 0,user_id,adv_campaign_id,platform_id,banner_code,is_main,user_id_mean,adv_campaign_id_mean,adv_creative_id_mean,goal_cost,goal_budget,location_id,logcat_id,camp_len,effect_goal,predicted_prob
0,2714742,3026,2,8,1,0.0,0.008517,0.008517,4.754986,5080,1,65,6,1068,1.155799e-06
1,2714742,2994,2,8,1,0.0,0.005669,0.005669,7.819801,55768,30,65,14,7131,8.553904e-07
2,2714742,97,2,8,1,0.0,0.007804,0.007804,5.138341,10024,1,65,6,1950,1.138719e-06
3,2714742,3539,2,8,1,0.0,0.007723,0.007723,5.058388,9925,1,65,6,1962,1.130556e-06
4,2714742,2756,2,8,1,0.0,0.010646,0.010646,4.827791,19293,30,65,9,3996,1.414431e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1983282,2398626,1099,2,8,1,0.0,0.006284,0.006284,1.912499,3223,42,26,6,1685,9.371172e-07
1983283,2398626,488,2,8,1,0.0,0.008186,0.008186,6.929049,4491,42,35,6,648,1.062324e-06
1983284,2398626,49,2,8,1,0.0,0.006106,0.006106,2.658746,3163,42,30,7,1190,8.054419e-07
1983285,2398626,1053,2,5,0,0.0,0.005639,0.005639,2.296300,6426,42,26,11,2798,1.399971e-06


In [41]:
# Шаг 4: Перезапись и сохранение результатов
sam = pd.read_csv(path + r'\sample_submission.csv')
sam['predict'] = test['predicted_prob']
sam.to_csv(path + r'\sample_submission1.csv', index=False)
sam.head()

Unnamed: 0,user_id,adv_campaign_id,predict
0,2714742,3026,1.155799e-06
1,2714742,2994,8.553904e-07
2,2714742,97,1.138719e-06
3,2714742,3539,1.130556e-06
4,2714742,2756,1.414431e-06


In [37]:
# Добавим бейзлайновую фичу
user_ads_clicks = pd.read_csv(path + r'\user_ads_clicks.csv')

In [42]:
sam = sam.merge(user_ads_clicks, on=["user_id", "adv_campaign_id"], how="left")

In [43]:
sam["base"] = sam["base"].fillna(0.0)

In [44]:
sam.head(2)

Unnamed: 0,user_id,adv_campaign_id,predict,base
0,2714742,3026,1.155799e-06,0.0
1,2714742,2994,8.553904e-07,0.0


In [45]:
sam['predict'] = (sam['predict'] + sam['base']) / 2

In [46]:
sam = sam.drop(columns={'base'})

In [47]:
sam.head(2)

Unnamed: 0,user_id,adv_campaign_id,predict
0,2714742,3026,5.778996e-07
1,2714742,2994,4.276952e-07


In [48]:
sam.to_csv(path + r'\sample_submission.csv', index=False)

In [34]:
# Создаём пустую модель и загружаем её
model = CatBoostClassifier()
model.load_model(os.path.join(path, "catboost_model.cbm"))

<catboost.core.CatBoostClassifier at 0x2201af97a10>