In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from catboost import CatBoostClassifier

In [2]:
# Load the prepared dataset from parquet; the path is relative to the
# notebook's working directory.
dataset = pd.read_parquet('../data/dataset.parquet')

In [3]:
# Order rows chronologically so the positional train/test split further down
# puts the most recent rows in the test set.
# NOTE(review): sort_values defaults to kind='quicksort', which is not stable,
# so the relative order of rows sharing an event_date is not guaranteed to
# follow the input order — confirm this is acceptable for the split boundary.
dataset = dataset.sort_values(by='event_date')

In [4]:
# Bare expression: renders the (truncated) sorted frame as the cell output
# for a quick visual check.
dataset

Unnamed: 0,user_id,adv_campaign_id,platform_id,adv_creative_id,event_date,banner_code,is_main,target,dayofweek,is_weekend,...,campaign_budget_per_day,logcat_id,creative_click_rate,creative_impressions,banner_click_rate,banner_impressions,platform_ctr,microcat_popularity,parent_microcat_count,user_campaign_interaction_rate
64734484,2955818,138,2,102,2024-09-01,8,True,0,6,1,...,1879.886858,65,0.005781,57256,0.005474,74296252,0.006028,551,93,0.0
65190669,2325517,704,3,4243,2024-09-01,6,True,0,6,1,...,10744.114955,18,0.003695,152094,0.003393,24006607,0.004186,6260,2364,0.0
65190668,1404068,2737,3,3398,2024-09-01,6,True,0,6,1,...,377.240209,7,0.002556,58290,0.003393,24006607,0.004186,16,5,0.0
65190667,2696683,322,2,2978,2024-09-01,8,True,0,6,1,...,3998.736003,40,0.003661,158161,0.005474,74296252,0.006028,671,136,0.0
65190666,2057085,2261,2,433,2024-09-01,8,True,0,6,1,...,739.639242,40,0.004923,53624,0.005474,74296252,0.006028,671,136,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52295997,495334,1346,2,288,2024-09-22,8,True,0,6,1,...,4091.154297,25,0.008013,36817,0.005474,74296252,0.006028,44,13,0.0
52295996,1106316,1016,3,3790,2024-09-22,6,True,0,6,1,...,3198.056315,40,0.005022,162888,0.003393,24006607,0.004186,671,136,0.0
52295995,1763445,2794,2,920,2024-09-22,8,True,0,6,1,...,3539.982096,18,0.005075,106200,0.005474,74296252,0.006028,6260,2364,0.0
52296001,1153377,1446,3,862,2024-09-22,6,True,0,6,1,...,423.067976,65,0.004895,32889,0.003393,24006607,0.004186,551,93,0.0


In [9]:
def reduce_mem_usage(df: pd.DataFrame, use_float16: bool = True) -> pd.DataFrame:
    """
    Shrink a dataframe by downcasting its integer and float columns.

    Each numeric column is cast to the narrowest dtype of the same family
    (signed int, unsigned int, float) whose range contains the column's
    min and max. Columns are never widened and never moved to another
    family. Every other column (bool, object, category, datetime,
    timedelta, pandas extension dtypes, ...) is left untouched, as are
    columns that are empty or contain only NaN.

    The dataframe is modified in place and the same object is returned.

    Note: for floats only the value *range* is checked, not precision.
    float16 keeps roughly 3 significant decimal digits, so downcasting to
    it is lossy; pass ``use_float16=False`` to stop at float32.

    :param df: input dataframe (modified in place)
    :param use_float16: allow float16 as a downcast target for float columns
    :return df: the same dataframe with optimized numeric dtypes
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float32, np.float64)
    if use_float16:
        float_types = (np.float16,) + float_types

    for col in df.columns:
        col_dtype = df[col].dtype

        # Pandas extension dtypes (category, tz-aware datetime, nullable
        # Int64, string, ...) are not plain numpy dtypes; leave them alone.
        if not isinstance(col_dtype, np.dtype):
            continue

        # Pick the dtype family from the actual dtype instead of matching
        # on its name, so unsigned ints are not mistaken for floats.
        if np.issubdtype(col_dtype, np.signedinteger):
            candidates, limits = signed_types, np.iinfo
        elif np.issubdtype(col_dtype, np.unsignedinteger):
            candidates, limits = unsigned_types, np.iinfo
        elif np.issubdtype(col_dtype, np.floating):
            candidates, limits = float_types, np.finfo
        else:
            # bool, object, datetime64, timedelta64, complex, ...
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() return NaN for empty or all-NaN columns: nothing to size.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        # Candidates are ordered narrowest first; stop as soon as one is no
        # narrower than the current dtype so a column is never upcast.
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_dtype.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    return df


In [10]:
# Downcast numeric columns to narrower dtypes. reduce_mem_usage modifies the
# frame in place and returns the same object, so this rebinding is a no-op
# on identity but keeps the cell readable.
dataset = reduce_mem_usage(dataset)

In [11]:
# Print the per-column dtype summary to check the result of the downcast.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114741035 entries, 64734484 to 47642688
Data columns (total 28 columns):
 #   Column                          Dtype         
---  ------                          -----         
 0   user_id                         int32         
 1   adv_campaign_id                 int16         
 2   platform_id                     int8          
 3   adv_creative_id                 int16         
 4   event_date                      datetime64[ns]
 5   banner_code                     int8          
 6   is_main                         bool          
 7   target                          int8          
 8   dayofweek                       int8          
 9   is_weekend                      int8          
 10  end_date                        datetime64[ns]
 11  days_to_campaign_end            int8          
 12  is_campaign_early               bool          
 13  user_click_rate                 float16       
 14  user_impressions_count          int16        

In [12]:
# Hold out 20% of the rows (fraction truncated toward zero) as the test set;
# every remaining row goes to training.
test_size = int(0.2 * dataset.shape[0])
train_size = dataset.shape[0] - test_size

In [13]:
# Positional split: the leading train_size rows form the training set and
# all rows after them form the test set. The cut is made by row count, so
# rows sharing the boundary event_date can land on both sides.
train, test = dataset.iloc[:train_size], dataset.iloc[train_size:]

In [20]:
# Persist the two splits as parquet files; paths are relative to the
# notebook's working directory.
train.to_parquet('../data/train.parquet')
test.to_parquet('../data/test.parquet')