In [4]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from constants import DATASET_NAME, DATASET_PATH_PATTERN, TEST_SIZE, RANDOM_STATE
from utils import get_logger, load_params
import mlflow

PREPROCESSORS = {preprocessor.__name__: preprocessor for preprocessor in [OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler]}
STAGE_NAME = 'process_data'

def process_data():
    logger = get_logger(logger_name=STAGE_NAME)
    params = load_params(stage_name=STAGE_NAME)

    logger.info('Начали скачивать данные')
    dataset = load_dataset(DATASET_NAME)
    logger.info('Успешно скачали данные!')

    logger.info('Делаем предобработку данных')
    df = dataset['train'].to_pandas()
    target_column = 'income'
    columns = df.columns if params['features'] == 'all' else params['features']
    drop_cols = set(params['cols_to_drop'])
    columns = [col for col in columns if col not in drop_cols and col != target_column]

    X, y = df[columns], df[target_column]
    logger.info(f'    Используемые фичи: {columns}')

    all_cat_features = params['cats']
    cat_features = list(set(columns) & set(all_cat_features))
    num_features = list(set(columns) - set(all_cat_features))

    y: pd.Series = (y == '>50K').astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=TEST_SIZE, 
                                                        shuffle=True, 
                                                        stratify=y, 
                                                        random_state=RANDOM_STATE
                                                        )

    X_train, X_test, y_train, y_test = map(lambda x: x.reset_index(drop=True), [X_train, X_test, y_train, y_test])

    if isinstance(params['train_size'], float):
        assert params['train_size'] <= 1, f'{params['train_size']=} must be <= 1'
        train_len = X_train.shape[0]
        X_train = X_train.iloc[np.arange(0, int(train_len * params['train_size']))] # Можем, потому что shuffle=True
        y_train = y_train.iloc[np.arange(0, int(train_len * params['train_size']))]

    elif isinstance(params['train_size'], int):
        assert X_train.shape[0] >= params['train_size'], f"{params['train_size']=} must be <=  {X_train.shape[0]=}"
        X_train = X_train.iloc[np.arange(0, params['train_size'])] # Можем, потому что shuffle=True
        y_train = y_train.iloc[np.arange(0, params['train_size'])]

    cats_preprocessor = PREPROCESSORS[params['cats_encoder']['name']](**params['cats_encoder']['params'])

    if params['num_scalers']['name'] is not None:
        num_preprocessor = PREPROCESSORS[params['num_scalers']['name']](**params['num_scalers']['params'])
    else:
        num_preprocessor = Pipeline([("_", "passthrough")])

    X_train_cats = cats_preprocessor.fit_transform(X_train[cat_features])
    X_test_cats = cats_preprocessor.transform(X_test[cat_features])

    X_train_num = num_preprocessor.fit_transform(X_train[num_features])
    X_test_num = num_preprocessor.transform(X_test[num_features])

    X_train = np.hstack([X_train_cats, X_train_num])
    X_test = np.hstack([X_test_cats, X_test_num])

    logger.info(f'    Размер тренировочного датасета: {len(y_train)}')
    logger.info(f'    Размер тестового датасета: {len(y_test)}')

    logger.info('Начали сохранять датасеты')
    os.makedirs(os.path.dirname(DATASET_PATH_PATTERN), exist_ok=True)
    for split, split_name in zip((X_train, X_test, y_train, y_test),
                                 ('X_train', 'X_test', 'y_train', 'y_test'),
                                 ):
        pd.DataFrame(split).to_csv(DATASET_PATH_PATTERN.format(split_name=split_name), index=False)
    logger.info('Успешно сохранили датасеты!')

    mlflow.log_params({"dataset_name": DATASET_NAME,
                       "n_features": len(columns),
                       "features": ",".join(columns),
                       "train_size_rows": int(len(y_train)),
                       "test_size_rows": int(len(y_test)),
                       "cat_features": ",".join(cat_features),
                       "num_features": ",".join(num_features),
                       "cats_encoder_name": params['cats_encoder']['name'],
                       "cats_encoder_params": str(params['cats_encoder']['params']),
                       "num_scaler_name": params['num_scalers']['name'],
                       "num_scaler_params": str(params['num_scalers']['params']),
                       "random_state": RANDOM_STATE})

if __name__ == '__main__':
    process_data()

2026-02-17 08:24:44,943 : INFO : process_data : Начали скачивать данные
2026-02-17 08:24:46,552 : INFO : process_data : Успешно скачали данные!
2026-02-17 08:24:46,552 : INFO : process_data : Делаем предобработку данных
2026-02-17 08:24:46,569 : INFO : process_data :     Используемые фичи: ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country']
2026-02-17 08:24:46,709 : INFO : process_data :     Размер тренировочного датасета: 18233
2026-02-17 08:24:46,710 : INFO : process_data :     Размер тестового датасета: 9769
2026-02-17 08:24:46,710 : INFO : process_data : Начали сохранять датасеты
2026-02-17 08:24:48,586 : INFO : process_data : Успешно сохранили датасеты!


In [1]:
from scripts import process_data, train, evaluate

process_data()
train()
evaluate()

  from .autonotebook import tqdm as notebook_tqdm
2026-02-17 08:32:02,078 : INFO : process_data : Начали скачивать данные
2026-02-17 08:32:04,055 : INFO : process_data : Успешно скачали данные!
2026-02-17 08:32:04,056 : INFO : process_data : Делаем предобработку данных
2026-02-17 08:32:04,073 : INFO : process_data :     Используемые фичи: ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country']
2026-02-17 08:32:04,213 : INFO : process_data :     Размер тренировочного датасета: 18233
2026-02-17 08:32:04,214 : INFO : process_data :     Размер тестового датасета: 9769
2026-02-17 08:32:04,215 : INFO : process_data : Начали сохранять датасеты
2026-02-17 08:32:06,061 : INFO : process_data : Успешно сохранили датасеты!
2026-02-17 08:32:06,121 : INFO : train : Начали считывать датасеты
2026-02-17 08:32:06,409 : INFO : train : Успешно считали датасеты!
2026-02-1

In [None]:
from typing import Callable
from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                             roc_auc_score,
                             average_precision_score)
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore",
                        category=UndefinedMetricWarning)


def find_best_tresh(y_true: np.ndarray, 
                    y_probas: np.ndarray,
                    scorer: Callable,
                    ) -> tuple[float, float]:
    '''-> (best_score, best_trash)''' 
    # Буду через сетку перебирать, хоть и не очень правильно
    grid_n = 1_000
    grid = np.arange(0, grid_n) / grid_n
    bool_preds = y_probas[None,:] <= grid[:,None]
    scores = [scorer(y_true, bool_preds[i]) for i in range(grid_n)]
    best_tresh = np.argmax(scores)
    
    return scores[best_tresh], best_tresh / grid_n

def get_scores(scorers_with_tresh: list[callable],
               scorers_without_tresh: list[callable],
               y_true: np.ndarray,
               y_probas: np.ndarray,
               ) -> dict[str, dict[float, float|None]]:
    '''-> {scorer_name: {best_score: float,
                         best_tresh, float|None}}
    '''
    scores = {scorer.__name__: {"best_score": scorer(y_true, y_probas),
                                'best_tresh': None
                                } 
              for scorer in scorers_without_tresh}
    for scorer in scorers_with_tresh:
        score, tresh = find_best_tresh(y_true, y_probas, scorer)
        scores[scorer.__name__]  = {'best_score': score, 'best_tresh': tresh}
    
    return scores

get_scores(SCORERS_WITH_THESH,
           SCORERS_WITHOUT_THESH,
           np.array([1, 1, 0, 1, 0]),
           np.array([3/5, 3/5, 3/5, 3/5, 3/5]))

{'roc_auc_score': {'best_score': 0.5, 'best_tresh': None},
 'average_precision_score': {'best_score': 0.6, 'best_tresh': None},
 'accuracy_score': {'best_score': 0.6, 'best_tresh': np.float64(0.6)},
 'precision_score': {'best_score': 0.6, 'best_tresh': np.float64(0.6)},
 'recall_score': {'best_score': 1.0, 'best_tresh': np.float64(0.6)},
 'f1_score': {'best_score': 0.75, 'best_tresh': np.float64(0.6)}}

In [None]:
probas = np.array([1/2 , 1/3, 0.99])
tr = np.array([1 , 1, 0])
r = probas[None,:] < (np.arange(0, 1_000) / 1_000)[:,None]
np.argmax([accuracy_score(tr, r[i]) for i in range(1_000)])

np.int64(501)

In [14]:
(np.arange(0, 10_000) / 10_000)[None,:]

array([[0.000e+00, 1.000e-04, 2.000e-04, ..., 9.997e-01, 9.998e-01,
        9.999e-01]])

In [None]:
# accuracy
# precision
# recall
# f1-score
# ROC-AUC
# PR-AUC
#      6. артефакты (один из следующих типов):

# classification report
# confusion matrix
# feature importances
# csv с ошибками модели
# PR-кривая