# Подготовка данных

# 0. Импорт сторонних библиотек

In [1]:
import pandas as pd
print('pandas {}.'.format(pd.__version__))
import numpy as np
print('numpy {}.'.format(np.__version__))
from datetime import datetime
import pickle
import dill
print('dill {}.'.format(dill.__version__))
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import sklearn
print('sklearn {}.'.format(sklearn.__version__))
pd.options.display.max_columns = None
import gc 

import warnings
warnings.filterwarnings('ignore')

pandas 2.0.0.
numpy 1.24.1.
dill 0.3.6.
sklearn 1.3.2.


## 0.1 Настройки среды, объявление глобальных переменных

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

RANDOM_SEED = 42

## 0.2 Описание функций

In [3]:
BYTES_TO_MB_DIV = 0.000001
def print_memory_usage_of_data_frame(df):
        mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
        print("Memory usage is " + str(mem) + " MB")

In [4]:
def nan_click_viewed__deleting(X, y = None):
    X = X[X['click'].isna() == False]
    X = X[X['view'] != 0]
    return X 

In [5]:
def dub_dropping(X, y = None):
    X =  X.drop_duplicates()                              
    return X

In [6]:
def dub_dropping(X, y = None):
    X =  X.drop_duplicates()                              
    return X

In [7]:
def nan_filling(X, y = None):
    nan_cols = {}
    for col in X.columns:
        nan_count = X[col].isna().sum()
        if nan_count != 0:
            per = np.round((nan_count / df.shape[0]) * 100, 2)
            nan_cols[col] = per
                       
    for col, per in nan_cols.items():
        if per < 5:
            X = X[X[col].isna() == False]
        else:
            X[col] = X[col].fillna('unknown')
            
    return X

# 1. Чтение исходных датасетов

In [8]:
df_clickhouse = pd.read_csv('../internship/clickhouse.csv')
df_clickhouse.shape

(5889167, 39)

In [9]:
req_df_columns = df_clickhouse.columns

In [10]:
df_creatives = pd.read_csv('../internship/creatives.csv')
df_creatives.shape

(132, 15)

In [11]:
df_creatives = df_creatives.rename(columns={'id': 'creative_id'})

# 2. Объединение датасетов

In [12]:
df = pd.merge(df_clickhouse, df_creatives, on="creative_id")
df.shape

(5889167, 53)

# 3. Предобработка объединённого датасета

In [13]:
df = nan_filling(dub_dropping(nan_click_viewed__deleting(df)))

# 4. Разбиение датасета на X и y

In [14]:
X_train = df.drop('click', axis = 1)
y_train = df['click']

# 5. Описание функций для предпроцессинга

In [31]:
def column_deleting(X, y = None):
    columns_for_drop = ['auction_date_time', 'impression_hash', 'ssp', 'is_pay',  'pay_price', 'auction_id', 'bid_id', 'auction_type', 'bid_floor', 'bid_price', 'is_win', 'view', 'place_number', 'ssp_user_id', 'campaign_id_x', 'enter_utm_campaign',  'stream_id_x', 'link_id', 'format', 'ip_v4', 'ip_v6', 'iab_category_x', 'ab_test', 'campaign_id_y', 'user_id', 'stream_id_y', 'is_deleted', 'image', 'created_at', 'updated_at']
    X = X.drop(columns_for_drop, axis = 1)
    X.reset_index(drop=True, inplace=True)
    return X

In [32]:
def feature_reduction(X, y = None):
    X['browser'] = X['browser'].apply(lambda x : 'Chrome' if x == 'Google' else x)
    X['browser'] = X['browser'].apply(lambda x : 'Yandex Browser' if x == 'Yandex' else x)
    X.reset_index(drop=True, inplace=True)
    return X

In [33]:
def date_time_col_transformation(X, y = None):
    X['req_date_time'] = pd.to_datetime(X['event_date_time'], format='%Y-%m-%d %H:%M:%S')
    X['day'] =  X['req_date_time'].dt.day
    X['month'] =  X['req_date_time'].dt.month
    X['year'] =  X['req_date_time'].dt.year
    X['hour'] =  X['req_date_time'].dt.hour
    X['day_of_week'] = X['event_date_time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").weekday())
    X['is_holyday'] = X['day_of_week'].apply(lambda x : 1 if x in [5 , 6] else 0)
    X['is_friday'] = X['day_of_week'].apply(lambda x : 1 if x == 4 else 0)
    X['is_monday'] = X['day_of_week'].apply(lambda x : 1 if x == 0 else 0)
    X['day_part'] = X['hour'].apply(lambda x : 'Night' if x >= 22 and x < 5 else ('Morning' if x >= 5 and x < 12 else ('Day' if x >= 12 and x < 17 else 'Evening')))
    X['year_part'] = X['month'].apply(lambda x : 'Winter' if x in [12, 1, 2] else ('Spring' if x in [3, 4, 5] else ('Summer' if x in [6, 7, 8] else 'Autumn')))
    X = X.drop(['event_date_time', 'req_date_time', 'year'], axis = 1)
    X.reset_index(drop=True, inplace=True)
    return X

In [34]:
def type_transformation(X, y = None):
    X['loss_reason'] = X['loss_reason'].astype('int8')
    X['device'] = X['device'].astype('int8')
    X['day'] = X['day'].astype('int8')
    X['month'] = X['month'].astype('int8')
    X['hour'] = X['hour'].astype('int8')
    X['day_of_week'] = X['day_of_week'].astype('int8')
    X['is_holyday'] = X['is_holyday'].astype('int8')
    X['is_friday'] = X['is_friday'].astype('int8')
    X['is_monday'] = X['is_monday'].astype('int8')
    X.reset_index(drop=True, inplace=True)
    return X

Обьявление Encoder:

In [35]:
ohe = OneHotEncoder(sparse = True, max_categories = None, handle_unknown = 'ignore')


In [42]:
def cat_encoding(X, y = None):

    cat_cols = X.columns
    #ohe.fit(X[cat_cols])
    X.reset_index(drop=True, inplace=True)
    ft = ohe.transform(X[cat_cols])

    X = ft
    
    return X

# 6. Создание пайплайна препроцессинга

In [43]:
preprocessor = Pipeline(
    steps=
    [
        ('column_deleting', FunctionTransformer(func=column_deleting)),
        ('feature_reduction', FunctionTransformer(func=feature_reduction)),
        ('date_time_col_transformation', FunctionTransformer(func=date_time_col_transformation)),
        ('type_transformation', FunctionTransformer(func=type_transformation)),
        ('cat_encoding', FunctionTransformer(func=cat_encoding)),
        
    ]
)

# 7. Препроцессинг обучающей выборки X

In [41]:
X_train_prep = preprocessor.fit_transform(X_train, y_train)

Сохранение энкодера и списка параметров объединенного датасета:

In [44]:
preprocessing_dict = {'encoder': ohe}
preprocessing_dict['req_df_columns'] = req_df_columns

In [None]:
req_df_columns

In [45]:
with open('../pickles/preprocessing_dict_cat_2.pickle', 'wb') as f:
    dill.Pickler(f, recurse=True).dump(preprocessing_dict)
   # pickle.dump(preprocessing_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Сохранение препроцессора:

In [46]:
preprocessor_pipline_dict = {}
preprocessor_pipline_dict['preprocessor'] = preprocessor

In [47]:
with open("../pickles/preprocessor_cat_pipline_dict_2.pkl", "wb") as f:
        dill.Pickler(f, recurse=True).dump(preprocessor_pipline_dict)

In [48]:
print_memory_usage_of_data_frame(X_train_prep)

AttributeError: 'csr_matrix' object has no attribute 'memory_usage'

# 8. Определение модели

In [28]:
%%time
hgbc_model = HistGradientBoostingClassifier(scoring = 'roc_auc', random_state=RANDOM_SEED)
hgbc_model.fit(X_train_prep.toarray(), y_train)
y_pred_prob_test = hgbc_model.predict_proba(X_test_prep.toarray()) 
roc_auc_test = roc_auc_score(y_test, y_pred_prob_test[:, 1])
    
print(f'Модель: {type(hgbc_model).__name__}\nROC_AUC: {roc_auc_test}\n')

Модель: HistGradientBoostingClassifier
ROC_AUC: 0.670343702685418

CPU times: total: 40min 35s
Wall time: 10min 48s


# 9. Тюннинг модели

In [None]:
%%time
hgbc_model = HistGradientBoostingClassifier(l2_regularization = 0.487, learning_rate = 0.065, max_depth = 49, max_leaf_nodes = 91,  scoring = 'roc_auc', random_state=RANDOM_SEED)
hgbc_model.fit(X_train_prep.toarray(), y_train)
# y_pred_prob_test = hgbc_model.predict_proba(X_test_prep.toarray()) 
# roc_auc_test = roc_auc_score(y_test, y_pred_prob_test[:, 1])
    
# print(f'Модель: {type(hgbc_model).__name__}\nROC_AUC: {roc_auc_test}\n')

In [48]:
y_pred_prob_test = hgbc_model.predict_proba(X_test_prep.toarray()) 
y_pred_prob_test

array([[0.9684323 , 0.0315677 ],
       [0.9684323 , 0.0315677 ],
       [0.97534416, 0.02465584],
       [0.97534416, 0.02465584],
       [0.98825991, 0.01174009],
       [0.98825991, 0.01174009],
       [0.97627256, 0.02372744],
       [0.97627256, 0.02372744],
       [0.98620594, 0.01379406],
       [0.98239241, 0.01760759],
       [0.94606829, 0.05393171],
       [0.97727549, 0.02272451],
       [0.97583572, 0.02416428],
       [0.9753912 , 0.0246088 ]])

# 10. Сохранение обученной модели

In [36]:
model_dict = {}

model_dict['metadata'] = 'Модель: HistGradientBoostingClassifier, ROC_AUC test: 0.6728202352712497'
model_dict['model'] = hgbc_model
with open("../pickles/model_cat_dict_2.pkl", "wb") as f:
        dill.Pickler(f, recurse=True).dump(model_dict)