# Import Libraries 📂

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
import optuna   
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Importing Data 📚



In [None]:
%%time
train = pd.read_csv('../input/jane-street-market-prediction/train.csv')
train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #limit memory use

# Preparing Data

In [None]:
# Trades with zero weight are not needed, so keep only rows with a positive weight.
train = train[train['weight'] > 0].reset_index(drop=True)
train.shape

In [None]:
# Use data starting from day 86 only (rows whose `date` is greater than 85).
train = train[train['date'] > 85].reset_index(drop=True)
train.shape

In [None]:
# Fill missing values with the mean of their column (in place, so the cell
# shows no output).
train.fillna(value=train.mean(), inplace=True)

In [None]:
# Derive a 0/1 label from `resp` (1 when resp is positive) and store it in the
# 'action' column.
train['action'] = train['resp'].gt(0).astype('int')

In [None]:
# Names of the five response columns; each one is turned into a binary target.
resp_cols = [f'resp_{k}' for k in range(1, 5)] + ['resp']

In [None]:
# Candidate feature matrix: the columns at positions 7..136, selected by position.
# NOTE(review): presumably these are the feature_* columns of train.csv — confirm
# against the CSV header; a positional slice silently shifts if the column order changes.
features_train_data  = train.iloc[:,7:137]

In [None]:
# Find feature pairs whose absolute correlation is at least `bound`.
def corrFilter(x: pd.DataFrame, bound: float):
    """Return the strongly correlated column pairs of ``x``.

    Parameters
    ----------
    x : pd.DataFrame
        Columns to correlate; ``DataFrame.corr()`` is called with its defaults.
    bound : float
        Absolute correlation at or above which a pair is reported.

    Returns
    -------
    pd.Series
        Correlation values sorted in ascending order, indexed by a two-level
        MultiIndex of column labels. Every unordered pair appears exactly once:
        level 0 holds the column that comes later in ``x``, level 1 the earlier
        one. The result never contains NaN entries.
    """
    xCorr = x.corr()
    # Restrict to the strict upper triangle so that each unordered pair is
    # considered once and the diagonal (self-correlation) is excluded by
    # position rather than by comparing floats with 1.0.
    upperTriangle = np.triu(np.ones(xCorr.shape, dtype=bool), k=1)
    xFiltered = xCorr.where((xCorr.abs() >= bound) & upperTriangle)
    # Masked cells are NaN after `where`; drop them so no spurious
    # (column, column) entry survives into the result index.
    xFlattened = xFiltered.unstack().dropna().sort_values()
    return xFlattened

# Run the correlation filter with a 0.9 bound and keep the result as a DataFrame.
high_correlations = corrFilter(features_train_data, bound=0.9).to_frame()

In [None]:
# Display the frame returned by the correlation filter.
high_correlations

In [None]:
# Columns to drop: the first label (index level 0) of every reported pair.
all_drop_cols = {pair[0] for pair in high_correlations.index}

In [None]:
# Start from the full list of candidate feature names.
features = list(features_train_data.columns)

In [None]:
# Remove the flagged columns from the feature list while keeping the original
# order. Rebuilding the list (instead of calling list.remove per column) makes
# the cell safe to re-run: a second run is a no-op rather than a ValueError.
features = [name for name in features if name not in all_drop_cols]

In [None]:
# Per-feature mean over the training rows; used to impute NaNs at inference time.
f_mean = train[features].to_numpy().mean(axis=0)

# Creating Train and Test DataFrame 

In [None]:
# X: selected feature values. y: one 0/1 column per response column
# (1 when that response is positive), in the order given by resp_cols.
X = train[features].to_numpy()
y = (train[resp_cols] > 0).astype('int').to_numpy()
print(X.shape, y.shape)

# Model: XGBClassifier | Using Optuna for Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost booster and score it on the validation split.

    Reads the module-level names ``dtrain``, ``dvalid`` and ``y_valid`` (they are
    not parameters), so those must be assigned before ``study.optimize`` is called.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample ``max_depth``, ``eta``, ``subsample``,
        ``colsample_bytree`` and ``gamma``.

    Returns
    -------
    float
        Accuracy of the rounded predictions against ``y_valid``; the value the
        study optimises.
    """
    # Hyperparameters to tune, plus the fixed training settings.
    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        # suggest_float samples uniformly from [low, high]; it replaces the
        # deprecated suggest_uniform with the same range semantics.
        'eta': trial.suggest_float('eta', 0.01, .1),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic',
        'eval_metric': 'auc'
    }

    # No num_boost_round is passed, so xgboost's default number of rounds applies.
    bst = xgb.train(params, dtrain)
    preds = bst.predict(dvalid)
    # Round the predicted scores to the nearest integer to obtain 0/1 labels.
    pred_labels = np.rint(preds)
    # Trials are ranked by their accuracy on the held-out validation split.
    accuracy = accuracy_score(y_valid, pred_labels)
    return accuracy

In [None]:
models = []  # fitted classifiers, one per target column of y

for i in tqdm(range(y.shape[1])):
    # Stratified 80/20 hold-out split for the current target column.
    x_train, x_valid, y_train, y_valid = train_test_split(
        X,
        y[:, i],
        test_size=0.2,
        random_state=42,
        stratify=y[:, i],
    )

    # Wrap the numpy splits in XGBoost's DMatrix format (noted by the author as
    # a memory optimisation). `objective` reads dtrain, dvalid and y_valid as
    # globals, so these names must stay as they are.
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_valid, label=y_valid)

    # Hyperparameter search for this target.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=5)

    # Best sampled values plus the fixed settings that are not part of the search.
    best_params = study.best_trial.params
    best_params.update(
        tree_method='gpu_hist',
        objective='binary:logistic',
        eval_metric='auc',
    )

    # Free some memory before the final fit.
    del x_train, x_valid, y_train, y_valid, dtrain, dvalid

    # Fit on all rows (training and validation parts together).
    clf = xgb.XGBClassifier(**best_params)
    clf.fit(X, y[:, i])

    # Persist the model and keep it for inference.
    nom_fich = f"weights_target_{resp_cols[i]}"
    clf.save_model(nom_fich)
    models.append(clf)

# Submission

In [None]:
# Competition API: create the environment and the iterator over test batches.
import janestreet
env = janestreet.make_env()
test = env.iter_test()

# Ensemble settings used by the prediction loop: aggregation function across
# models and the decision threshold.
f = np.median
th = 0.5

In [None]:
# Predict an action for every test batch and submit it through the competition API.
for (test_df, pred_df) in tqdm(test):
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        # Impute only when at least one value is NaN: zero the NaNs, then add
        # the training mean of the corresponding feature at those positions.
        if np.isnan(x_tt.sum()):
            x_tt = np.nan_to_num(x_tt) + np.isnan(x_tt) * f_mean

        # Aggregate positive-class probabilities rather than hard labels:
        # XGBClassifier.predict returns 0/1 classes, which would make the
        # threshold `th` ineffective and reduce `f` to a vote.
        pred = f(np.stack([model.predict_proba(x_tt)[:, 1] for model in models]), axis=0).T
        pred_df.action = np.where(pred >= th, 1, 0).astype(int)
    else:
        # Zero-weight rows always get action 0, without running the models.
        pred_df.action = 0
    env.predict(pred_df)