# Import Libraries 📂

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import catboost as cb
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix

# Importing Data 📚



In [None]:
%%time
# Load the training CSV, then downcast every float64 column to float32
# to limit memory use.
train = pd.read_csv('../input/jane-street-market-prediction/train.csv')
train = train.astype(
    dict.fromkeys(train.select_dtypes(include='float64').columns, np.float32)
)

# Preparing Data

In [None]:
# Trades with zero weight are not needed, so only rows with weight > 0 are kept.
train = train[train['weight'] > 0].reset_index(drop=True)
train.shape

In [None]:
# Keep only the data from day 86 onward (date > 85).
train = train[train['date'] > 85].reset_index(drop=True)
train.shape

In [None]:
# Fill missing values in place with the mean of the corresponding column.
train.fillna(value=train.mean(), inplace=True)

In [None]:
# Derive a 0/1 label from resp and store it in the 'action' column:
# 1 where resp is strictly positive, 0 otherwise.
train['action'] = train['resp'].gt(0).astype('int')

In [None]:
# Names of the response columns that are turned into binary targets below.
resp_cols = [f'resp_{suffix}' for suffix in range(1, 5)] + ['resp']

In [None]:
# Take the 130 columns at positions 7..136 of `train` as the feature frame.
# NOTE(review): the slice is positional — presumably it covers exactly the
# feature columns of train.csv; confirm against the CSV column order.
features_train_data  = train.iloc[:,7:137]

In [None]:
# Find feature pairs whose absolute correlation is at least the given bound.
def corrFilter(x: pd.DataFrame, bound: float):
    """Return the strongly correlated column pairs of *x*.

    Args:
        x: Frame whose columns are compared pairwise via ``x.corr()``.
        bound: Minimum absolute correlation for a pair to be reported.

    Returns:
        A Series of correlation values sorted in ascending order and indexed
        by a two-level MultiIndex of column names. Each unordered pair appears
        exactly once, with level 0 holding the column that comes later in
        ``x``. The result never contains NaN entries.
    """
    corr = x.corr()
    # Keep only the strict upper triangle. Selecting by position (rather than
    # by value) removes the diagonal and the mirrored copy of each pair while
    # still keeping perfectly correlated pairs and distinct pairs that happen
    # to share the same correlation value.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    # dropna() discards the masked cells and undefined correlations, so no
    # NaN row can leak a column name into downstream drop lists.
    pairs = corr.where(upper).unstack().dropna()
    return pairs[pairs.abs() >= bound].sort_values()

# Collect the feature pairs with |correlation| >= 0.9 into a one-column frame.
high_correlations = corrFilter(x=features_train_data, bound=0.9).to_frame()

In [None]:
# Candidate columns to drop: the first-level column name of every
# highly correlated pair.
all_drop_cols = {pair[0] for pair in high_correlations.index}

In [None]:
# Names of the feature columns, as a plain Python list.
features = list(features_train_data.columns)

In [None]:
# for i in all_drop_cols:
#     features.remove(i)

In [None]:
# Per-feature mean over the rows of `train`; the submission loop uses it to
# impute NaNs in the test rows.
f_mean = train[features].values.mean(axis=0)

# Creating Feature/Target Arrays and Training Models

In [None]:
# Feature matrix: the selected feature columns of `train` as a NumPy array.
X = train[features].to_numpy()
# Target matrix: one 0/1 column per entry of resp_cols, 1 where the
# response is strictly positive.
y = (train[resp_cols] > 0).astype('int').to_numpy()

In [None]:
models = []  # fitted models, one per response column, in resp_cols order

# Train one binary CatBoost classifier per response column. Column i of `y`
# is the binarised version of resp_cols[i].
for target_idx, target_name in enumerate(tqdm(resp_cols)):
    target = y[:, target_idx]
    # Stratified 80/20 hold-out split. The split is seeded so that a rerun
    # yields the same validation set, matching the fixed model seed below.
    x_tr, x_val, y_tr, y_val = train_test_split(
        X, target, test_size=0.2, stratify=target, random_state=42
    )
    # NOTE(review): od_wait is set without od_type/od_pval — confirm that the
    # overfitting detector is actually active with these settings.
    model = CatBoostClassifier(
        iterations=5000,
        depth=10,
        learning_rate=0.1,
        random_seed=42,
        eval_metric='Accuracy',
        custom_metric=['Logloss', 'AUC'],
        od_wait=500,
        task_type='GPU',
    )
    # use_best_model keeps the iteration that scored best on the eval set.
    model.fit(
        x_tr, y_tr,
        eval_set=(x_val, y_val),
        verbose_eval=100,
        use_best_model=True,
        # plot=True
    )

    # Persist each model under a file name derived from its target column.
    weights_path = "weights_target_" + target_name
    model.save_model(weights_path)

    models.append(model)

# Submission

In [None]:
f = np.median  # aggregator applied across the per-target models
th = 0.5000    # probability threshold at or above which action is 1
import janestreet
env = janestreet.make_env()
for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        if np.isnan(x_tt.sum()):
            # Impute NaNs with the training means: zero them out, then add
            # the mean at exactly the positions that were NaN.
            x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt) * f_mean

        # Aggregate positive-class probabilities rather than hard class
        # labels, so that both `f` and `th` act on a continuous score. With
        # the median and th = 0.5 this gives the same decision as a majority
        # vote over the models' class labels.
        proba = np.stack([model.predict_proba(x_tt)[:, 1] for model in models])
        pred = f(proba, axis=0).T
        pred_df.action = np.where(pred >= th, 1, 0).astype(int)
    else:
        # Rows without positive weight get action 0 without calling the models.
        pred_df.action = 0
    env.predict(pred_df)