In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
# import warnings
# warnings.filterwarnings('ignore')
import optuna

from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, KFold
# from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, 
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import catboost as cb

# optuna.logging.set_verbosity(optuna.logging.WARNING)
import warnings
# warnings.filterwarnings('ignore')

# from xgboost import *
pd.set_option("display.max_columns", None)


In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sub = pd.read_csv('data/sample_submission.csv')
rev_target = pd.read_csv('data/revealed_targets.csv')

In [3]:
print("Train: ", train.shape)
print("Test: ", test.shape)
print("Sample_sub: ", sub.shape)
print("Rev_target: ", rev_target.shape)

Train:  (5237980, 17)
Test:  (33000, 16)
Sample_sub:  (33000, 3)
Rev_target:  (33162, 7)


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5237980 entries, 0 to 5237979
Data columns (total 17 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   stock_id                 int64  
 1   date_id                  int64  
 2   seconds_in_bucket        int64  
 3   imbalance_size           float64
 4   imbalance_buy_sell_flag  int64  
 5   reference_price          float64
 6   matched_size             float64
 7   far_price                float64
 8   near_price               float64
 9   bid_price                float64
 10  bid_size                 float64
 11  ask_price                float64
 12  ask_size                 float64
 13  wap                      float64
 14  target                   float64
 15  time_id                  int64  
 16  row_id                   object 
dtypes: float64(11), int64(5), object(1)
memory usage: 679.4+ MB


In [5]:
train.isna().sum()

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                 220
imbalance_buy_sell_flag          0
reference_price                220
matched_size                   220
far_price                  2894342
near_price                 2857180
bid_price                      220
bid_size                         0
ask_price                      220
ask_size                         0
wap                            220
target                          88
time_id                          0
row_id                           0
dtype: int64

In [6]:
train = train.drop("row_id", axis=1)
test = test.drop("row_id", axis=1)

In [7]:
def median_mode(data):
        col_obj = []
        col_num = []
        for i in tqdm(data.columns):
            if data[i].isna().sum() == 0:
                continue
            if data[i].dtype == "O":

                col_obj.append(i)
                data[i] = data[i].fillna(data[i].mode()[0])
            else:
                col_num.append(i)
                data[i] = data[i].fillna(data[i].median())
        print(f"на моду заменены значения в колонках: {col_obj}")
        print(f"на моду заменены значения в колонках: {col_num}")
        return data

In [8]:
train = median_mode(train)
test = median_mode(test)

100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00, 13.85it/s]


на моду заменены значения в колонках: []
на моду заменены значения в колонках: ['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'ask_price', 'wap', 'target']


100%|████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 2140.97it/s]

на моду заменены значения в колонках: []
на моду заменены значения в колонках: ['far_price', 'near_price']





In [9]:
train.sample(n=5)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id
962706,38,90,190,9353400.0,1,1.001957,10488940.0,0.999883,0.999889,1.001788,16465.94,1.002127,38512.5,1.00189,6.66976,4969
703404,168,66,150,269740600.0,-1,0.997408,274006000.0,0.999883,0.999889,0.997151,923013.0,0.997543,16911.65,0.997536,-3.179908,3645
2900035,133,268,170,8866312.0,1,0.99934,18737340.0,0.999883,0.999889,0.999028,44890.0,0.99934,4265.88,0.999313,-1.680255,14757
3266178,143,301,400,120160.0,1,1.00505,121422.6,0.999883,1.00505,1.001787,230090.36,1.00505,244776.84,1.003371,-6.66976,16595
1864029,144,173,290,2024821.0,1,0.997997,19582440.0,0.999883,0.999889,0.998117,16335.24,0.998599,8296.0,0.998437,-0.270009,9544


In [10]:
test.sample(n=5)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,time_id
20678,78,479,480,0.0,0,0.999199,2687405.7,0.999199,0.999199,0.998198,173382.54,0.999199,150639.21,0.998734,26393
745,145,478,30,2092223.29,-1,0.999147,1268166.4,0.999833,0.999873,0.999147,35559.08,0.999463,18796.05,0.999354,26293
11346,146,479,10,589978.76,-1,0.999966,1940430.64,0.999833,0.999873,0.999292,15717.68,1.000573,296.94,1.000549,26346
17935,135,479,340,1816220.27,1,1.00024,11392920.26,1.028829,1.011525,0.999863,49359.06,1.00024,93782.93,0.999993,26379
5044,44,478,250,5598516.59,-1,0.999827,16434378.82,0.999833,0.999873,0.999827,6055.17,1.000045,65520.21,0.999845,26315


In [11]:
rev_target.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,time_id,revealed_target,revealed_date_id,revealed_time_id
0,0.0,478,0,26290,-2.310276,477.0,26235.0
1,1.0,478,0,26290,-12.850165,477.0,26235.0
2,2.0,478,0,26290,-0.439882,477.0,26235.0
3,3.0,478,0,26290,7.259846,477.0,26235.0
4,4.0,478,0,26290,4.780292,477.0,26235.0


In [12]:
sub.head(3)

Unnamed: 0,time_id,row_id,target
0,26290,478_0_0,1
1,26290,478_0_1,1
2,26290,478_0_2,1


In [13]:
train = train.dropna(subset=['target'])

In [14]:
train['target'].nunique()

15934

# baseline

In [15]:
params = {
                "n_estimators": 2000,
                "learning_rate": 0.001,
                "loss_function": "MAE",
                "eval_metric": "MAE",
                "task_type": "CPU",
                "max_bin": 20,
                "verbose": False,
                "max_depth": 6,
                "l2_leaf_reg": 10,
                "early_stopping_rounds": 50,
                "thread_count": 6,
                "random_seed": 42,
#                 "plot": True
            }

In [16]:
X, Y = train.drop("target", axis=1), train['target']

In [17]:
sk = KFold(n_splits = 5)
pret_test = list()
for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    model = cb.CatBoostRegressor(**params)
    if i == 0:
        print("Началось обучение...")
    else:
        print("обучение, эпоха: {}".format(i))
    model.fit(X_train, Y_train)#, plot=True)
    
    y_pred = model.predict(X_test)
    mae_score = mean_absolute_error(Y_test, y_pred)
    print("Fold : {}, mean_absolute_error: {}".format(i, mae_score))
    
    pret_test.append(model.predict(test))

Началось обучение...
Fold : 0, mean_absolute_error: 5.799211285589398
обучение, эпоха: 1
Fold : 1, mean_absolute_error: 7.1568691491880845
обучение, эпоха: 2
Fold : 2, mean_absolute_error: 6.399129799892014
обучение, эпоха: 3
Fold : 3, mean_absolute_error: 6.385988849963334
обучение, эпоха: 4
Fold : 4, mean_absolute_error: 6.008823114180683


In [18]:
sub.head()

Unnamed: 0,time_id,row_id,target
0,26290,478_0_0,1
1,26290,478_0_1,1
2,26290,478_0_2,1
3,26290,478_0_3,1
4,26290,478_0_4,1


In [21]:
ens_preds_test = pd.DataFrame(pret_test).apply(np.mean, axis = 0)

sub['target'] = ens_preds_test
sub.to_csv('sub_base.csv', index = False)

pred = model.predict(test)
sub['target'] = pred
sub.to_csv('sub_base1.csv', index = False)