In [1]:
import pandas as pd
print('pandas {}.'.format(pd.__version__))
import numpy as np
print('numpy {}.'.format(np.__version__))
from datetime import datetime
import pickle
import dill
print('dill {}.'.format(dill.__version__))
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import sklearn
print('sklearn {}.'.format(sklearn.__version__))
pd.options.display.max_columns = None
import gc 

import warnings
warnings.filterwarnings('ignore')

pandas 2.0.0.
numpy 1.24.1.
dill 0.3.6.
sklearn 1.3.2.


In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

RANDOM_SEED = 42

In [3]:
def nan_click_viewed__deleting(X, y = None):
    X = X[X['click'].isna() == False]
    X = X[X['view'] != 0]
    return X 

In [4]:
def dub_dropping(X, y = None):
    X =  X.drop_duplicates()                              
    return X

In [5]:
def paused_status_dropping(X, y = None):
    X.drop(X.loc[X['status'] == 'paused'].index, inplace=True)
    return X

In [6]:
def place_number_decrease(X, y = None):
    X = X[X['place_number'] > 0]
    X['place_number'] = X['place_number'] - 1
    return X

In [7]:
def nan_filling(X, y = None):
    nan_cols = {}
    for col in X.columns:
        nan_count = X[col].isna().sum()
        if nan_count != 0:
            per = np.round((nan_count / df.shape[0]) * 100, 2)
            nan_cols[col] = per
                       
    for col, per in nan_cols.items():
        if per < 5:
            X = X[X[col].isna() == False]
        else:
            X[col] = X[col].fillna('unknown')
            
    return X

In [8]:
df_clickhouse = pd.read_csv('../internship/clickhouse.csv')
df_clickhouse.shape

(5889167, 39)

In [9]:
req_df_columns = df_clickhouse.columns
prep_tools_dict = {'req_df_columns' : req_df_columns}

In [10]:
df_clickhouse.head()

Unnamed: 0,auction_date_time,impression_hash,ssp,auction_id,impression_id,bid_id,auction_type,bid_floor,bid_price,loss_reason,is_win,pay_price,is_pay,view,place_number,click,ssp_user_id,creative_id,campaign_id,stream_id,link_id,format,device,OS,browser,geo_country,geo_city,ip_v4,ip_v6,site_id,tag_id,iab_category,ab_test,enter_utm_source,enter_utm_campaign,enter_utm_medium,enter_utm_content,enter_utm_term,event_date_time
0,2024-01-10 15:15:34,3595129331408507136,7,cmf8kpg76rmanjkfhlh0,mobicbottomx2002_1,3Y1MGDSF4DZ45TYK3J8BXRZH,fst_price,0.01,107056393,10,1,107056,1,1,2,0,cmf8km876rmanjkfhgn0:cmf8km876rmanjkfhgmg,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,551487,85.249.31.254,,01gm531atx,mobicbottomx2002,[],automl,rtb_24smi,86,bidml,7f73d0f3b178,13926_25077,2024-01-10 15:15:45
1,2024-01-22 15:49:37,5501180719971990097,7,cmn68o876rmanjm2to9g,v45h0s63cj3bst4e_2,3Z0K7D4EPZH05TQNXA6PK9EF,fst_price,0.01,61879580,19,1,61879,1,1,3,0,cmn61fg76rmanjm0qeh0:cmn61fg76rmanjm0qegg,7d98a8273750,77,86,3666,native,4,Android,Chrome,RU,542420,85.249.172.121,,01hew0ev25,v45h0s63cj3bst4e,[],automl,rtb_24smi,86,bidml,0a395fe6dd05,16608_16536,2024-01-22 15:49:43
2,2024-01-11 21:47:01,8997213320937048206,7,cmg3f9876rmanjgkth00,tk6eb5nq1w1sxmt2_4,3Y4X9XT2D7CZD3GSM1E7PD2E,fst_price,0.01,22018806,0,1,22018,0,0,5,0,cmftll876rmanjnlqh7g:cmftll876rmanjnlqh70,8b19bb757ba5,77,78,3918,native,4,Android,Chrome,RU,0,94.142.248.55,,01hew0ev25,tk6eb5nq1w1sxmt2,[],catboost,rtb_24smi,86,bidml,04376a608224,22858_25378,2024-01-11 21:47:02
3,2024-01-14 16:57:24,-6183089819839663183,7,cmhugh076rmanjice1i0,au2d6d7cjcpefc00_2,3YC3XSA2HXEBN986JGKHKEJ2,fst_price,0.01,68345961,2,1,68345,1,1,3,0,cmhugh076rmanjice1bg:cmhugh076rmanjice1b0,d67ab8ecea54,77,273,4251,native,2,Win10,Yandex Browser,RU,1496747,92.127.194.43,,01gm531atx,au2d6d7cjcpefc00,[],automl,rtb_24smi,86,bidml,9b209a09a68a,30261_24285,2024-01-14 16:57:26
4,2024-01-16 13:12:22,2397132522009946486,7,cmj5d1g76rmanjgu50og,ou9j4n9sqtfe8400_1,3YGVV5E15PV4P9MCA4PR4APY,fst_price,0.01,16396436,0,1,16396,0,0,2,0,cmidveo76rmanjkvl9d0:cmidveo76rmanjkvl9cg,0381d514fa62,77,114,4082,native,4,Android,Miui Browser,RU,473249,178.35.44.55,,01gm531atx,ou9j4n9sqtfe8400,[],automl,rtb_24smi,86,bidml,8d65123ad67e,15975_15005,2024-01-16 13:12:22


In [10]:
df_creatives = pd.read_csv('../internship/creatives.csv')
df_creatives.shape

(132, 15)

In [11]:
df_creatives = df_creatives.rename(columns={'id': 'creative_id'})

In [91]:
df = pd.merge(df_clickhouse, df_creatives, on="creative_id")
df.shape

(5889167, 53)

In [92]:
df = paused_status_dropping(df)

In [93]:
df = nan_filling(dub_dropping(nan_click_viewed__deleting(df)))
df.shape

(2797034, 53)

In [94]:
df.head()

Unnamed: 0,auction_date_time,impression_hash,ssp,auction_id,impression_id,bid_id,auction_type,bid_floor,bid_price,loss_reason,is_win,pay_price,is_pay,view,place_number,click,ssp_user_id,creative_id,campaign_id_x,stream_id_x,link_id,format,device,OS,browser,geo_country,geo_city,ip_v4,ip_v6,site_id,tag_id,iab_category_x,ab_test,enter_utm_source,enter_utm_campaign,enter_utm_medium,enter_utm_content,enter_utm_term,event_date_time,status,is_deleted,campaign_id_y,user_id,stream_id_y,theme,second_theme,iab_category_y,image,image_extension,mime_type,image_tag,created_at,updated_at
0,2024-01-10 15:15:34,3595129331408507136,7,cmf8kpg76rmanjkfhlh0,mobicbottomx2002_1,3Y1MGDSF4DZ45TYK3J8BXRZH,fst_price,0.01,107056393,10,1,107056,1,1,2,0,cmf8km876rmanjkfhgn0:cmf8km876rmanjkfhgmg,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,551487,85.249.31.254,unknown,01gm531atx,mobicbottomx2002,[],automl,rtb_24smi,86,bidml,7f73d0f3b178,13926_25077,2024-01-10 15:15:45,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
4,2024-01-15 17:14:51,-3336184151420619993,7,cmijrmo76rmanjm1rqd0,ou9j4n9sqtfe8400_0,3YEQAEAH9SCKP3FHT4DH38SE,fst_price,0.01,172606393,6,1,172606,1,1,1,0,cmijrkg76rmanjm1rmqg:cmijrkg76rmanjm1rmq0,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,524901,188.94.32.123,unknown,01gm531atx,ou9j4n9sqtfe8400,[],automl,rtb_24smi,86,bidml,8d65123ad67e,15314_15024,2024-01-15 17:15:25,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
8,2024-01-14 13:50:40,-4845328304293519401,7,cmhrp0076rmanjhro4cg,my0fq2bg0twts35c_3,3YBS7VCA4NRNFNQ7HXA8DC89,fst_price,0.01,55438822,2,1,55438,1,1,4,0,cmhrp0076rmanjhro45g:cmhrp0076rmanjhro450,6b6df5b1f938,77,111,4362,native,2,Win10,Yandex Browser,RU,581049,95.53.35.121,unknown,01hew0ev25,my0fq2bg0twts35c,[],automl,rtb_24smi,86,bidml,04376a608224,30347_27769,2024-01-14 13:50:42,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
9,2024-01-18 12:20:31,1927312752573112664,7,cmkeqno76rmanjnmjdgg,mobicbottomx2002_1,3YNXNNJC7HM8GFKZXE92T412,fst_price,0.01,76715517,10,1,76715,1,1,2,0,cmkeqi876rmanjnmj5u0:cmkeqi876rmanjnmj5tg,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,2022890,80.83.239.41,unknown,01gm531atx,mobicbottomx2002,[],automl,rtb_24smi,86,bidml,7f73d0f3b178,13926_25077,2024-01-18 12:21:00,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
10,2024-01-10 01:18:04,-4294347528276567097,7,cmesc7076rmanjikaivg,ou9j4n9sqtfe8400_1,3Y04JY55N0MZ3C2BXAES4C2W,fst_price,0.01,79799215,2,1,79799,1,1,2,1,cmesc5g76rmanjikafdg:cmesc5g76rmanjikafd0,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,498817,31.134.189.196,unknown,01gm531atx,ou9j4n9sqtfe8400,[],automl,rtb_24smi,86,bidml,9b209a09a68a,13926_25077,2024-01-10 01:25:11,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02


In [95]:
df = place_number_decrease(df)
df.shape

(2797034, 53)

In [96]:
df.head()

Unnamed: 0,auction_date_time,impression_hash,ssp,auction_id,impression_id,bid_id,auction_type,bid_floor,bid_price,loss_reason,is_win,pay_price,is_pay,view,place_number,click,ssp_user_id,creative_id,campaign_id_x,stream_id_x,link_id,format,device,OS,browser,geo_country,geo_city,ip_v4,ip_v6,site_id,tag_id,iab_category_x,ab_test,enter_utm_source,enter_utm_campaign,enter_utm_medium,enter_utm_content,enter_utm_term,event_date_time,status,is_deleted,campaign_id_y,user_id,stream_id_y,theme,second_theme,iab_category_y,image,image_extension,mime_type,image_tag,created_at,updated_at
0,2024-01-10 15:15:34,3595129331408507136,7,cmf8kpg76rmanjkfhlh0,mobicbottomx2002_1,3Y1MGDSF4DZ45TYK3J8BXRZH,fst_price,0.01,107056393,10,1,107056,1,1,1,0,cmf8km876rmanjkfhgn0:cmf8km876rmanjkfhgmg,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,551487,85.249.31.254,unknown,01gm531atx,mobicbottomx2002,[],automl,rtb_24smi,86,bidml,7f73d0f3b178,13926_25077,2024-01-10 15:15:45,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
4,2024-01-15 17:14:51,-3336184151420619993,7,cmijrmo76rmanjm1rqd0,ou9j4n9sqtfe8400_0,3YEQAEAH9SCKP3FHT4DH38SE,fst_price,0.01,172606393,6,1,172606,1,1,0,0,cmijrkg76rmanjm1rmqg:cmijrkg76rmanjm1rmq0,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,524901,188.94.32.123,unknown,01gm531atx,ou9j4n9sqtfe8400,[],automl,rtb_24smi,86,bidml,8d65123ad67e,15314_15024,2024-01-15 17:15:25,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
8,2024-01-14 13:50:40,-4845328304293519401,7,cmhrp0076rmanjhro4cg,my0fq2bg0twts35c_3,3YBS7VCA4NRNFNQ7HXA8DC89,fst_price,0.01,55438822,2,1,55438,1,1,3,0,cmhrp0076rmanjhro45g:cmhrp0076rmanjhro450,6b6df5b1f938,77,111,4362,native,2,Win10,Yandex Browser,RU,581049,95.53.35.121,unknown,01hew0ev25,my0fq2bg0twts35c,[],automl,rtb_24smi,86,bidml,04376a608224,30347_27769,2024-01-14 13:50:42,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
9,2024-01-18 12:20:31,1927312752573112664,7,cmkeqno76rmanjnmjdgg,mobicbottomx2002_1,3YNXNNJC7HM8GFKZXE92T412,fst_price,0.01,76715517,10,1,76715,1,1,1,0,cmkeqi876rmanjnmj5u0:cmkeqi876rmanjnmj5tg,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,2022890,80.83.239.41,unknown,01gm531atx,mobicbottomx2002,[],automl,rtb_24smi,86,bidml,7f73d0f3b178,13926_25077,2024-01-18 12:21:00,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
10,2024-01-10 01:18:04,-4294347528276567097,7,cmesc7076rmanjikaivg,ou9j4n9sqtfe8400_1,3Y04JY55N0MZ3C2BXAES4C2W,fst_price,0.01,79799215,2,1,79799,1,1,1,1,cmesc5g76rmanjikafdg:cmesc5g76rmanjikafd0,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,498817,31.134.189.196,unknown,01gm531atx,ou9j4n9sqtfe8400,[],automl,rtb_24smi,86,bidml,9b209a09a68a,13926_25077,2024-01-10 01:25:11,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02


In [76]:
# df_train, df_test = train_test_split(df, stratify = df['click'], test_size=0.2, random_state=RANDOM_SEED)

# X_train = df_train.drop('click', axis = 1)
# y_train = df_train['click']

# X_test = df_test.drop('click', axis = 1)
# y_test = df_test['click']

In [98]:
X_train = df.drop('click', axis = 1)
y_train = df['click']

In [99]:
def column_deleting(X, y = None):
    columns_for_drop = ['creative_id', 'auction_date_time', 'impression_hash', 'ssp', 'auction_id', 'impression_id', 'bid_id', 'auction_type', 'bid_floor', 'bid_price', 'is_win', 'pay_price', 'is_pay', 'view', 'ssp_user_id', 'campaign_id_x', 'stream_id_x', 'link_id', 'format', 'ip_v4', 'ip_v6', 'site_id', 'iab_category_x', 'ab_test', 'enter_utm_campaign', 'status', 'is_deleted', 'campaign_id_y', 'user_id', 'stream_id_y', 'image', 'created_at', 'updated_at']
    X = X.drop(columns_for_drop, axis = 1)
    X.reset_index(drop=True, inplace=True)
    return X

In [100]:
def feature_reduction(X, y = None):
    X['browser'] = X['browser'].apply(lambda x : 'Chrome' if x == 'Google' else x)
    X['browser'] = X['browser'].apply(lambda x : 'Yandex Browser' if x == 'Yandex' else x)
    X.reset_index(drop=True, inplace=True)
    return X

In [101]:
def date_time_col_transformation(X, y = None):
    X['req_date_time'] = pd.to_datetime(X['event_date_time'], format='%Y-%m-%d %H:%M:%S')
    X['day'] =  X['req_date_time'].dt.day
    X['month'] =  X['req_date_time'].dt.month
    X['year'] =  X['req_date_time'].dt.year
    X['hour'] =  X['req_date_time'].dt.hour
    X['day_of_week'] = X['event_date_time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").weekday())
    X['is_holyday'] = X['day_of_week'].apply(lambda x : 1 if x in [5 , 6] else 0)
    X['is_friday'] = X['day_of_week'].apply(lambda x : 1 if x == 4 else 0)
    X['is_monday'] = X['day_of_week'].apply(lambda x : 1 if x == 0 else 0)
    X['day_part'] = X['hour'].apply(lambda x : 'Night' if x >= 22 and x < 5 else ('Morning' if x >= 5 and x < 12 else ('Day' if x >= 12 and x < 17 else 'Evening')))
    X['year_part'] = X['month'].apply(lambda x : 'Winter' if x in [12, 1, 2] else ('Spring' if x in [3, 4, 5] else ('Summer' if x in [6, 7, 8] else 'Autumn')))
    X = X.drop(['event_date_time', 'req_date_time', 'year'], axis = 1)
    X.reset_index(drop=True, inplace=True)
    return X

In [23]:
def type_transformation(X, y = None):
    X['loss_reason'] = X['loss_reason'].astype('int8')
    X['device'] = X['device'].astype('int8')
    X['day'] = X['day'].astype('int8')
    X['month'] = X['month'].astype('int8')
    X['hour'] = X['hour'].astype('int8')
    X['day_of_week'] = X['day_of_week'].astype('int8')
    X['is_holyday'] = X['is_holyday'].astype('int8')
    X['is_friday'] = X['is_friday'].astype('int8')
    X['is_monday'] = X['is_monday'].astype('int8')
    X.reset_index(drop=True, inplace=True)
    return X

In [102]:
ohe = OneHotEncoder(sparse = True, max_categories = None, handle_unknown = 'ignore')
#ohe = OneHotEncoder(sparse = False, max_categories = None, handle_unknown = 'ignore')

In [116]:
def cat_encoding(X, y = None):

    cat_cols = X.columns

    ft = ohe.transform(X[cat_cols])

    X = ft
    
    return X

In [117]:
preprocessor = Pipeline(
    steps=
    [
        ('column_deleting', FunctionTransformer(func=column_deleting)),
        ('feature_reduction', FunctionTransformer(func=feature_reduction)),
        ('date_time_col_transformation', FunctionTransformer(func=date_time_col_transformation)),
        ('cat_encoding', FunctionTransformer(func=cat_encoding)),
        
    ]
)

In [105]:
X_train_prep = preprocessor.transform(X_train)

In [107]:
X_train_prep.shape

(2797034, 1788)

In [120]:
prep_tools_dict['encoder'] = ohe

In [121]:
prep_tools_dict['preprocessor'] = preprocessor

In [87]:
X_test_prep = preprocessor.transform(X_test)

In [88]:
X_test_prep.shape

(559407, 1771)

In [111]:
%%time
hgbc_model = HistGradientBoostingClassifier(l2_regularization = 0.487, learning_rate = 0.065, max_depth = 49, max_leaf_nodes = 91,  scoring = 'roc_auc', random_state=RANDOM_SEED)
with parallel_backend('threading', n_jobs = 10):
    hgbc_model.fit(X_train_prep.toarray(), y_train)

CPU times: total: 1h 11min 54s
Wall time: 15min 33s


In [122]:
prep_tools_dict['model'] = hgbc_model
prep_tools_dict['model_metadata'] = 'Модель: HistGradientBoostingClassifier, ROC_AUC: 0.6722063798560853'

In [123]:
with open("../pickles/prep_tools_dict.pkl", "wb") as f:
        dill.Pickler(f, recurse=True).dump(prep_tools_dict)

In [89]:
%%time
hgbc_model = HistGradientBoostingClassifier(l2_regularization = 0.487, learning_rate = 0.065, max_depth = 49, max_leaf_nodes = 91,  scoring = 'roc_auc', random_state=RANDOM_SEED)
with parallel_backend('threading', n_jobs = 10):
    hgbc_model.fit(X_train_prep.toarray(), y_train)

y_pred_prob_test = hgbc_model.predict_proba(X_test_prep.toarray()) 
roc_auc_test = roc_auc_score(y_test, y_pred_prob_test[:, 1])
    
print(f'Модель: {type(hgbc_model).__name__}\nROC_AUC: {roc_auc_test}\n')

Модель: HistGradientBoostingClassifier
ROC_AUC: 0.6722063798560853

CPU times: total: 1h 1min 37s
Wall time: 12min


In [46]:
from joblib import parallel_backend

In [47]:
%%time
hgbc_model_ = HistGradientBoostingClassifier(l2_regularization = 0.487, learning_rate = 0.065, max_depth = 49, max_leaf_nodes = 91,  scoring = 'roc_auc', random_state=RANDOM_SEED)
with parallel_backend('threading', n_jobs = 10):
    hgbc_model_.fit(X_train_prep.toarray(), y_train)
    
y_pred_prob_test = hgbc_model_.predict_proba(X_test_prep.toarray()) 
roc_auc_test = roc_auc_score(y_test, y_pred_prob_test[:, 1])
        
print(f'Модель: {type(hgbc_model_).__name__}\nROC_AUC: {roc_auc_test}\n')

Модель: HistGradientBoostingClassifier
ROC_AUC: 0.6876728430297292

CPU times: total: 1h 2min 10s
Wall time: 12min 39s
