In [None]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna
import gc
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import logging
import sys
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from skopt import gp_minimize
from skopt.space import Real
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from skopt import gp_minimize
from skopt.space import Real
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import gc
from numba import jit

In [None]:
# For Kaggle: detach every handler already installed on the root logger first,
# because logging.basicConfig() does nothing when the root logger has handlers.
for handler in tuple(logging.getLogger().handlers):
    logging.getLogger().removeHandler(handler)

# Route INFO-and-above records to stdout with a timestamped format.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

In [None]:
class Preprocessing:
    """Reads the train/test parquet files and prepares them for modelling.

    ``preprocess_all_data`` runs the stages in order: read the data, detect
    categorical columns, cast them to string, select features with CatBoost
    and standardise the remaining numeric columns.
    """

    # A column is treated as categorical when
    # (number of unique values / number of non-null values) is below this ratio.
    CATEGORICAL_UNIQUE_RATIO = 0.5
    # Shares of the original feature count tried by the CatBoost feature selection.
    FEATURE_FRACTIONS = (0.4,)

    def __init__(self, random_seed):
        self.random_state = random_seed
        self.train = None
        self.test = None
        self.target = None
        self.cat_features = None

    @staticmethod
    def _read_parquet(path) -> pd.DataFrame:
        """Read one parquet file, or several files concatenated row-wise.

        Args:
            path: a single path, or a list/tuple of paths (``make_predict``
                passes a list of train files).

        Raises:
            ValueError: if ``path`` is None or an empty collection.
        """
        if path is None:
            raise ValueError('No parquet path was provided')
        paths = [path] if isinstance(path, (str, os.PathLike)) else list(path)
        if not paths:
            raise ValueError('No parquet files were provided')
        frames = [pd.read_parquet(single_path) for single_path in paths]
        if len(frames) == 1:
            return frames[0]
        return pd.concat(frames, ignore_index=True)

    def read_data(self, train_path: 'str | list[str]', test_path: 'str | list[str]') -> None:
        """Load train and test, split the ``target`` column off train.

        The ``smpl`` column is dropped from both frames and ``target`` from train.
        """
        logging.info('\nНачало предобработки')
        train = self._read_parquet(train_path)
        test = self._read_parquet(test_path)
        self.target = train['target']
        self.train = train.drop(columns=['target', 'smpl'])
        self.test = test.drop(columns=['smpl'])
        logging.info('Данные прочитаны и разделены')

    def type_research(self) -> None:
        """Fill ``self.cat_features`` with the low-cardinality train columns."""
        self.cat_features = []
        for column in self.train.columns:
            non_null = self.train[column].dropna()
            if non_null.empty:
                # The ratio is undefined for an all-missing column (it used to
                # raise ZeroDivisionError); leave such a column non-categorical.
                logging.warning('Column %r has no non-null values; not treated as categorical', column)
                continue
            if non_null.nunique() / len(non_null) < self.CATEGORICAL_UNIQUE_RATIO:
                self.cat_features.append(column)
        logging.info('Типы определены')

    def filling_nans(self) -> None:
        """Cast the categorical columns of train and test to pandas ``string`` dtype.

        NOTE(review): despite the method name and the log message, no values are
        imputed here; the code only changes the dtype. Confirm whether an
        explicit fill was intended.
        """
        self.train[self.cat_features] = self.train[self.cat_features].astype('string')
        self.test[self.cat_features] = self.test[self.cat_features].astype('string')
        logging.info('Пропущенные значения заполнены')

    def select_features_with_catboost(self) -> None:
        """Keep the feature subset chosen by CatBoost recursive feature selection.

        A stratified 80/20 split of train is used for fitting/evaluation. For
        every share in ``FEATURE_FRACTIONS`` a selection is run, and the subset
        with the best validation AUC replaces the columns of train and test.

        Raises:
            RuntimeError: if no selection run produced a usable score.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self.train, self.target,
            test_size=0.2, random_state=self.random_state, stratify=self.target,
        )
        params = {
            'iterations': 3000,
            'early_stopping_rounds': 150,
            'loss_function': 'CrossEntropy',
            'l2_leaf_reg': 4.5,
            'task_type': 'GPU',
            'cat_features': self.cat_features,
            'random_seed': self.random_state,
            'grow_policy': 'Lossguide',
            'eval_metric': 'AUC',
            'depth': 7
        }
        n_features = len(X_train.columns)
        best_score = -np.inf
        last_features = None
        for fraction in self.FEATURE_FRACTIONS:
            model = CatBoostClassifier(**params)
            summary = model.select_features(
                X_train, y_train,
                eval_set=(X_test, y_test),
                features_for_select=list(X_train.columns),
                # Never ask for zero features (possible with a single column).
                num_features_to_select=max(1, round(n_features * fraction)),
                steps=3,
                algorithm='RecursiveByPredictionValuesChange',
                train_final_model=True,
                logging_level='Silent',
            )
            score = model.get_best_score()['validation']['AUC']
            if score > best_score:
                best_score = score
                last_features = summary['selected_features_names']

        del X_train, X_test, y_train, y_test
        gc.collect()
        if last_features is None:
            raise RuntimeError('CatBoost feature selection did not return any feature set')

        self.train = self.train[last_features]
        self.test = self.test[last_features]
        # Preserve the original column order instead of going through a set
        # intersection, whose order changes from run to run.
        selected = set(last_features)
        self.cat_features = [name for name in self.cat_features if name in selected]
        logging.info('Признаки отобраны')

    def remove_correlated_features(self):
        """Placeholder stage; currently does nothing."""
        pass

    def normalize_data(self) -> None:
        """Standardise numeric columns; the scaler is fitted on train only."""
        numeric_columns = self.train.select_dtypes(include=[np.number]).columns
        if numeric_columns.empty:
            logging.info('Нормализация не потребовалась')
            return
        scaler = StandardScaler()
        self.train[numeric_columns] = scaler.fit_transform(self.train[numeric_columns])
        self.test[numeric_columns] = scaler.transform(self.test[numeric_columns])
        logging.info('Числовые столбцы нормализованны')

    def preprocess_all_data(self, train_path: 'str | list[str]', test_path: 'str | list[str]'):
        """Run every stage and return ``(train, test, target, cat_features)``."""
        self.read_data(train_path, test_path)
        self.type_research()
        self.filling_nans()
        self.remove_correlated_features()
        self.select_features_with_catboost()
        self.normalize_data()
        return self.train, self.test, self.target, self.cat_features


In [None]:
class Model:
    """Common base for the model wrappers defined in this notebook.

    Subclasses assign the fitted estimator to ``self.model`` in their
    ``train_with_validation`` method.
    """

    def __init__(self, cat_features, random_seed):
        self.cat_features = cat_features
        self.random_seed = random_seed
        # Set by a subclass once it has been trained; None until then.
        self.model = None

    def _fitted_model(self):
        """Return the fitted estimator or fail with an explicit message.

        Raises:
            AttributeError: if no estimator has been trained yet. The type is
                the same one the missing attribute used to produce.
        """
        if self.model is None:
            raise AttributeError(
                f'{type(self).__name__} has no trained model yet; call train_with_validation() first'
            )
        return self.model

    def predict(self, X_test) -> np.ndarray:
        """Return the estimator's ``predict`` output for ``X_test``."""
        return self._fitted_model().predict(X_test)

    def predict_proba(self, X_test) -> np.ndarray:
        """Return column 1 of the estimator's ``predict_proba`` output."""
        return self._fitted_model().predict_proba(X_test)[:, 1]


class CB_model(Model):
    """CatBoost wrapper: holds the hyper-parameters and the validated model."""

    def __init__(self, cat_features: list, random_seed: int):
        super().__init__(cat_features, random_seed)
        self.params = dict(
            iterations=3000,
            task_type='GPU',
            depth=7,
            early_stopping_rounds=150,
            grow_policy='Lossguide',
            l2_leaf_reg=4.5,
            eval_metric='AUC',
            bagging_temperature=.5,
            bootstrap_type='Bayesian',
            loss_function='CrossEntropy',
            random_strength=20,
            verbose=False,
            random_state=self.random_seed,
        )

    def train_with_validation(self, X_train, y_train, X_test, y_test) -> None:
        """Fit on the train part, evaluating on ``(X_test, y_test)``.

        After fitting, ``params['iterations']`` is overwritten with the best
        iteration reported by CatBoost plus 200.
        """
        logging.info('Начало обучения CatBoost')

        self.model = CatBoostClassifier(**self.params)
        validation_pair = (X_test, y_test)
        self.model.fit(
            X_train,
            y_train,
            cat_features=self.cat_features,
            eval_set=validation_pair,
        )
        best_iteration = self.model.get_best_iteration()
        self.params['iterations'] = best_iteration + 200
        logging.info('CatBoost обучен на валидации')

class XGB_model(Model):
    """XGBoost wrapper: holds the hyper-parameters and the validated model."""

    def __init__(self, cat_features: list, random_seed: int):
        super().__init__(cat_features, random_seed)
        self.params = {
                'n_estimators': 3000,
                'learning_rate':0.03,
                'max_depth': 7,
                'use_label_encoder':False,
                'enable_categorical':True,
                'eval_metric':'auc',
                'tree_method':'hist',
                'device':'cuda',
                'reg_lambda': 8,
                'reg_alpha': 20,
                'max_bin': 1000,
                'subsample': 0.8,
                'grow_policy': 'lossguide',
                'min_child_weight' : 5,
                'sampling_method': 'uniform',
                'early_stopping_rounds': 100,
                'random_state': self.random_seed,
                'colsample_bytree' : 0.8,
            }

    def train_with_validation(self, X_train, y_train, X_test, y_test) -> None:
        """Fit with early stopping on ``(X_test, y_test)`` and store tuned params.

        The caller's frames are not modified: categorical casting happens on
        copies. The validation frame reuses the category dtypes of the training
        frame so that both share one category encoding; values unseen in
        training become missing.

        After fitting, ``params['n_estimators']`` is set to the best iteration
        plus 200, and ``early_stopping_rounds`` / ``eval_metric`` are removed
        from ``params``.
        """
        # Work on copies: the same X_train/X_test objects are handed to the other
        # models by Model_training.train_and_tune_models.
        X_fit = X_train.copy()
        X_val = X_test.copy()
        cat_cols = list(self.cat_features)
        if cat_cols:
            train_cats = X_fit[cat_cols].astype('category')
            X_fit[cat_cols] = train_cats
            # Reuse the training categories instead of deriving them again from
            # the validation rows.
            X_val[cat_cols] = X_val[cat_cols].astype(train_cats.dtypes.to_dict())

        self.model = XGBClassifier(**self.params)
        self.model.fit(X_fit, y_train,
                       eval_set=[(X_val, y_test)],
                       verbose=False)
        self.params['n_estimators'] = self.model.best_iteration + 200
        # These keys are dropped from the stored params; the later refit in
        # Ensemble.train_blending_models calls fit() without an eval_set.
        # pop() keeps a repeated call from failing with KeyError.
        self.params.pop('early_stopping_rounds', None)
        self.params.pop('eval_metric', None)

        logging.info('XGB обучен на валидации')

class Model_training:
    """Creates two CatBoost and two XGBoost wrappers and validates each of them."""

    def __init__(self, random_seed):
        self.random_seed = random_seed
        # Wrappers are created in train_and_tune_models().
        self.cb1 = None
        self.xgb1 = None
        self.cb2 = None
        self.xgb2 = None

    def train_and_tune_models(self, train, target, cat_features) -> None:
        """Build the four wrappers and train them on one stratified 80/20 split."""
        logging.info('\nНачало обучения моделей на валидации')
        seed = self.random_seed
        self.cb1 = CB_model(cat_features, seed)
        self.xgb1 = XGB_model(cat_features, seed)
        self.cb2 = CB_model(cat_features, seed)
        self.xgb2 = XGB_model(cat_features, seed)

        # The first pair differs from the defaults only in tree depth.
        self.cb1.params['depth'] = 6
        self.xgb1.params['max_depth'] = 8 if len(train) > 700000 else 6

        features_fit, features_val, target_fit, target_val = train_test_split(
            train,
            target,
            test_size=0.2,
            random_state=seed,
            stratify=target,
        )
        for wrapper in (self.cb1, self.xgb1, self.cb2, self.xgb2):
            wrapper.train_with_validation(features_fit, target_fit, features_val, target_val)

    def get_trained_models(self):
        """Return ``{name: [fitted model, params]}`` for the four wrappers."""
        labelled = (
            ('catboost_1', self.cb1),
            ('xgboost_1', self.xgb1),
            ('catboost_2', self.cb2),
            ('xgboost_2', self.xgb2),
        )
        return {label: [wrapper.model, wrapper.params] for label, wrapper in labelled}

In [None]:
class Ensemble:
    """Weighted-average blend of the validated CatBoost / XGBoost models.

    Workflow (see ``train_ensemble``): score every model on a hold-out split,
    search blend weights on that split, refit every model on the full train
    set with its tuned parameters, then blend the test predictions.
    """

    # Hold-out share. Together with the seed it reproduces the split used in
    # Model_training.train_and_tune_models (same test_size / stratify).
    HOLDOUT_SIZE = 0.2
    # Weights move in 1 / GRID_STEPS increments (0.05) ...
    GRID_STEPS = 20
    # ... and the best-ranked model gets at least MIN_TOP_STEPS / GRID_STEPS (0.25).
    MIN_TOP_STEPS = 5

    def __init__(self, random_seed):
        self.random_seed = random_seed
        self.best_weights = None
        self.metrics = None
        self.models_rate = None
        self.cb_blender_rs = None
        self.blending = None
        # Category dtypes the refitted XGBoost models were trained on; reused at
        # prediction time so train and test share one category encoding.
        self._blend_category_dtypes = None

    @staticmethod
    def _weight_grid(n_models, steps=20, min_first=5):
        """Yield every weight tuple of length ``n_models`` on a regular grid.

        Weights are multiples of ``1 / steps``, are non-negative, sum to one,
        and the first weight is at least ``min_first / steps``. The grid is
        built from integers, so no floating-point drift accumulates.

        Raises:
            ValueError: if ``n_models`` is smaller than one.
        """
        if n_models < 1:
            raise ValueError('At least one model is required to build blend weights')
        if n_models == 1:
            yield (1.0,)
            return

        def _split(slots, budget):
            # Distribute `budget` units over `slots` positions; the last
            # position takes whatever is left.
            if slots == 1:
                yield (budget,)
                return
            for units in range(budget + 1):
                for tail in _split(slots - 1, budget - units):
                    yield (units,) + tail

        for first in range(min_first, steps + 1):
            for tail in _split(n_models - 1, steps - first):
                yield tuple(units / steps for units in (first,) + tail)

    @staticmethod
    def _cast_features(frame, cat_features, model, category_dtypes=None):
        """Return a copy of ``frame`` with categorical columns cast for ``model``.

        CatBoost models get ``str`` columns. Other models get ``category``
        columns, using ``category_dtypes`` (column -> CategoricalDtype) when
        given. The input frame is never modified, so the cast for one model
        cannot leak into the input of the next one.
        """
        prepared = frame.copy()
        cat_cols = list(cat_features)
        if not cat_cols:
            return prepared
        if isinstance(model, CatBoostClassifier):
            prepared[cat_cols] = prepared[cat_cols].astype('str')
        elif category_dtypes is not None:
            prepared[cat_cols] = prepared[cat_cols].astype(category_dtypes)
        else:
            prepared[cat_cols] = prepared[cat_cols].astype('category')
        return prepared

    def _holdout_predictions(self, models, model_names, train, target, cat_features):
        """Predict the hold-out part with every model in ``model_names``.

        Returns:
            ``(y_test, {model name: positive-class probabilities})``.
        """
        X_train, X_test, _, y_test = train_test_split(
            train, target,
            stratify=target, test_size=self.HOLDOUT_SIZE, random_state=self.random_seed,
        )
        # XGB_model.train_with_validation derives the categories from the
        # training part of this split, so the hold-out rows are cast with the
        # same categories instead of categories derived from the hold-out itself.
        category_dtypes = None
        cat_cols = list(cat_features)
        if cat_cols:
            category_dtypes = X_train[cat_cols].astype('category').dtypes.to_dict()
        del X_train

        predictions = {}
        for name in model_names:
            estimator = models[name][0]
            features = self._cast_features(X_test, cat_features, estimator, category_dtypes)
            predictions[name] = estimator.predict_proba(features)[:, 1]
        return y_test, predictions

    def calculate_roc_auc(self, models, train, target, cat_features):
        """Rank the models by hold-out ROC-AUC (best first).

        Stores the ordered names in ``self.models_rate`` and the matching
        scores in ``self.metrics``.
        """
        logging.info('Начало обучения ансамбля')

        y_test, predictions = self._holdout_predictions(models, list(models), train, target, cat_features)
        scores = {name: roc_auc_score(y_test, proba) for name, proba in predictions.items()}
        ranked = sorted(scores.items(), key=lambda item: -item[1])

        self.models_rate = [name for name, _ in ranked]
        self.metrics = [score for _, score in ranked]
        logging.info('ROC-AUC почитан, модели отсортированны')

    def compute_weights(self, models, train, target, cat_features):
        """Grid-search the blend weights that maximise hold-out ROC-AUC.

        Weights follow the order of ``self.models_rate``; the result is stored
        in ``self.best_weights``.

        Raises:
            RuntimeError: if ``calculate_roc_auc`` has not ranked the models yet.
        """
        if not self.models_rate:
            raise RuntimeError('calculate_roc_auc() must be called before compute_weights()')

        y_test, predictions = self._holdout_predictions(models, self.models_rate, train, target, cat_features)
        preds = np.array([predictions[name] for name in self.models_rate])

        candidates = list(self._weight_grid(len(preds), self.GRID_STEPS, self.MIN_TOP_STEPS))
        best_score = -np.inf
        best_weights = None
        for candidate in tqdm(candidates):
            weights = np.asarray(candidate)
            cur = roc_auc_score(y_test, weights @ preds)
            if cur > best_score:
                best_score = cur
                best_weights = weights

        self.best_weights = best_weights
        logging.info('Веса ансамбля: %s', best_weights)
        logging.info('Веса ансамбля посчитаны')

    def train_blending_models(self, models, train, target, cat_features):
        """Refit every ranked model on the full train set with its tuned params.

        The fitted estimators are stored in ``self.blending`` in the order of
        ``self.models_rate``.
        """
        trained_models = []
        self._blend_category_dtypes = None

        for model_name in self.models_rate:
            validated_model, tuned_params = models[model_name]
            model = validated_model.__class__()
            logging.info('%s: %s', model_name, tuned_params)
            model.set_params(**tuned_params)

            features = self._cast_features(train, cat_features, validated_model)
            if isinstance(validated_model, CatBoostClassifier):
                # The validated CatBoost models were fitted with cat_features
                # (CB_model.train_with_validation); the stored params do not
                # contain them, so they must be declared again for the refit.
                model.fit(features, target, cat_features=list(cat_features))
            else:
                if self._blend_category_dtypes is None and len(cat_features) > 0:
                    self._blend_category_dtypes = features[list(cat_features)].dtypes.to_dict()
                model.fit(features, target)
            trained_models.append(model)

        self.blending = trained_models
        logging.info('Модели для ансамбля обучены')

    def blending_predict(self, test, cat_features):
        """Return the weighted sum of the positive-class probabilities on ``test``."""
        predictions = []
        for model, weight in zip(self.blending, self.best_weights):
            features = self._cast_features(test, cat_features, model, self._blend_category_dtypes)
            predictions.append(model.predict_proba(features)[:, 1] * weight)
        blended = np.sum(np.array(predictions), axis=0)
        logging.info('Получено предсказание блендинга')
        return blended

    def stacking(self, train, target, test, models, cat_features):
        """Return the final prediction; currently this is just the blend.

        ``train``, ``target`` and ``models`` are accepted but not used.
        """
        blend_predictions = self.blending_predict(test, cat_features)
        logging.info('Получено финальное предсказание')
        return blend_predictions

    def train_ensemble(self, train, test, target, cat_features, models):
        """Run ranking, weight search, refit and prediction; return the prediction."""
        self.calculate_roc_auc(models, train, target, cat_features)
        self.compute_weights(models, train, target, cat_features)
        self.train_blending_models(models, train, target, cat_features)
        final_predictions = self.stacking(train, target, test, models, cat_features)
        return final_predictions


In [None]:
class JAMAL:
    """End-to-end pipeline: preprocessing, model validation and blended prediction."""

    def __init__(self, random_seed: int):
        self.random_seed = random_seed
        # Filled by all_pipeline(); declared here so the attributes always exist.
        self.train = None
        self.test = None
        self.target = None
        self.cat_features = None

    def all_pipeline(self, train_path: 'str | list[str]', test_path: str) -> np.ndarray:
        """Run the whole pipeline and return the blended test prediction.

        Args:
            train_path: path(s) of the train parquet file(s); ``make_predict``
                passes a list.
            test_path: path of the test parquet file.

        Returns:
            The value produced by ``Ensemble.train_ensemble`` (a NumPy array of
            blended positive-class probabilities).
        """
        # Full preprocessing.
        preprocessor = Preprocessing(self.random_seed)
        self.train, self.test, self.target, self.cat_features = preprocessor.preprocess_all_data(train_path, test_path)
        del preprocessor
        gc.collect()
        # Tuning: the models are trained and validated.
        models = Model_training(self.random_seed)
        models.train_and_tune_models(self.train, self.target, self.cat_features)
        # The ensemble is tuned on the validated models.
        models_for_ensemble = models.get_trained_models()
        ensemble = Ensemble(self.random_seed)
        y_pred = ensemble.train_ensemble(self.train, self.test, self.target, self.cat_features, models_for_ensemble)
        return y_pred

In [None]:
def make_predict(data_dir: str = 'data', output_dir: str = 'predictions', random_seed: int = 42) -> None:
    """Run the pipeline for every dataset folder and write one CSV per dataset.

    Each sub-directory of ``data_dir`` is a dataset. Its ``*.parquet`` files
    whose name contains ``train`` are the train files, and the first one (in
    sorted order) whose name contains ``test`` is the test file. The
    prediction is written to ``<output_dir>/<dataset name>.csv`` with the
    columns ``id`` and ``target``.

    Datasets without train files or without a test file are skipped with a
    warning instead of aborting the whole run.

    Args:
        data_dir: directory that contains the dataset folders.
        output_dir: directory for the prediction files; created if missing.
        random_seed: seed passed to the pipeline.
    """
    os.makedirs(output_dir, exist_ok=True)

    # sorted() makes the processing order independent of the file system.
    for dataset_name in sorted(os.listdir(data_dir)):
        dataset_path = os.path.join(data_dir, dataset_name)

        if not os.path.isdir(dataset_path):
            continue  # skip plain files lying at the top level

        parquet_files = sorted(f for f in os.listdir(dataset_path) if f.endswith('.parquet'))
        train_files = [os.path.join(dataset_path, f) for f in parquet_files if 'train' in f]
        test_file = next(
            (os.path.join(dataset_path, f) for f in parquet_files if 'test' in f),
            None
        )
        if not train_files or test_file is None:
            logging.warning('Dataset %r skipped: train or test parquet file not found', dataset_name)
            continue

        logging.info('Dataset %s, train files: %s', dataset_name, train_files)
        jamal = JAMAL(random_seed)
        y_pred = jamal.all_pipeline(train_path=train_files,
                                    test_path=test_file)

        del jamal
        gc.collect()
        predictions = pd.DataFrame()
        # Only the id column is needed here, so do not load the whole file again.
        predictions['id'] = pd.read_parquet(test_file, columns=['id'])['id']
        predictions['target'] = y_pred
        predictions.to_csv(os.path.join(output_dir, f"{dataset_name}.csv"), index=False)

if __name__ == "__main__":
    make_predict()