<a href="https://colab.research.google.com/github/Resia05/supervised-learning/blob/main/%D0%B1%D0%B5%D0%B7_%D0%BF%D0%B0%D0%B9%D0%BF%D0%BB%D0%B0%D0%B9%D0%BD%D1%96%D0%B2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def select_columns(df: pd.DataFrame) -> tuple:
    """
    Return the column groups used by the preprocessing steps.

    The ``df`` argument is accepted for interface symmetry but is not
    inspected; the column names are fixed.

    Returns a 5-tuple:
    ``(input_cols, target_col, numeric_cols, categorical_cols, label_cols)``
    where ``input_cols`` is the numeric, categorical and label columns
    concatenated in that order.
    """
    label_cols = ['Gender']
    categorical_cols = ['Geography']
    numeric_cols = [
        'CreditScore',
        'Age',
        'Tenure',
        'Balance',
        'NumOfProducts',
        'HasCrCard',
        'IsActiveMember',
        'EstimatedSalary',
    ]
    target_col = 'Exited'

    # The model inputs are exactly the three feature groups, in this order.
    input_cols = [*numeric_cols, *categorical_cols, *label_cols]
    return input_cols, target_col, numeric_cols, categorical_cols, label_cols

def split_data(df: pd.DataFrame, target_col: str) -> tuple:
    """
    Split the raw data into training and validation parts.

    Holds out 20% of the rows for validation, stratifying on
    ``df[target_col]`` and using a fixed random seed so the split is
    reproducible. Returns the result of ``train_test_split`` unchanged
    (training part first, validation part second).
    """
    split_options = {
        'test_size': 0.2,
        'stratify': df[target_col],
        'random_state': 42,
    }
    return train_test_split(df, **split_options)

def create_transformers(numeric_cols: list, categorical_cols: list, label_cols: list, scaler_numeric: bool) -> ColumnTransformer:
    """
    Build the (unfitted) column-wise preprocessor.

    Steps, in output order:
      * ``'scaler'`` - ``MinMaxScaler`` over ``numeric_cols`` when
        ``scaler_numeric`` is true; otherwise the numeric columns are passed
        through unchanged (they are never dropped).
      * ``'onehot'`` - dense one-hot encoding of ``categorical_cols``;
        categories unseen during fit are encoded as all zeros.
      * ``'gender'`` - dense encoding of ``label_cols`` with the fixed
        categories ``['Female', 'Male']`` and the first category dropped,
        i.e. a single 0/1 column.

    Returns the assembled ``ColumnTransformer``.
    """
    import inspect

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output`, and 1.4 removed the old name. Use whichever keyword the
    # installed version accepts so the encoders always return dense arrays.
    encoder_params = inspect.signature(OneHotEncoder).parameters
    dense_kwarg = 'sparse_output' if 'sparse_output' in encoder_params else 'sparse'
    dense_output = {dense_kwarg: False}

    # Without an explicit step ColumnTransformer drops unlisted columns
    # (remainder='drop'), so unscaled numeric features must be passed through.
    # The step keeps the name 'scaler' in both modes so that lookups by name
    # and generated feature names do not depend on `scaler_numeric`.
    numeric_step = MinMaxScaler() if scaler_numeric else 'passthrough'

    onehot_encoder = OneHotEncoder(handle_unknown='ignore', **dense_output)
    gender_encoder = OneHotEncoder(categories=[['Female', 'Male']], drop='first', **dense_output)

    transformers = [
        ('scaler', numeric_step, numeric_cols),
        ('onehot', onehot_encoder, categorical_cols),
        ('gender', gender_encoder, label_cols),
    ]
    return ColumnTransformer(transformers=transformers)

def preprocess_data(raw_df: pd.DataFrame, scaler_numeric: bool = True) -> dict:
    """
    Preprocess the raw data end to end.

    Selects the working columns, splits the rows into training and
    validation parts, fits the preprocessor on the training inputs only and
    applies it to both parts.

    Args:
        raw_df: Raw data containing the input columns and the target column.
        scaler_numeric: Forwarded to ``create_transformers``; controls
            whether numeric columns are scaled.

    Returns:
        A dict with keys ``'X_train'``, ``'train_targets'``, ``'X_val'``,
        ``'val_targets'``, ``'input_cols'`` and ``'preprocessor'`` (the
        fitted transformer). The feature frames keep the row index of the
        corresponding split, so they stay aligned with the target Series.
    """
    input_cols, target_col, numeric_cols, categorical_cols, label_cols = select_columns(raw_df)
    train_df, val_df = split_data(raw_df, target_col)

    X_train = train_df[input_cols]
    X_val = val_df[input_cols]
    train_targets = train_df[target_col]
    val_targets = val_df[target_col]

    preprocessor = create_transformers(numeric_cols, categorical_cols, label_cols, scaler_numeric)
    # Fit on the training part only so nothing leaks from validation data.
    preprocessor.fit(X_train)

    X_train_transformed = preprocessor.transform(X_train)
    X_val_transformed = preprocessor.transform(X_val)

    fitted_steps = preprocessor.named_transformers_
    # Numeric columns are present in the output only if the preprocessor has
    # a 'scaler' step; naming them unconditionally would make the DataFrame
    # constructor fail on a column-count mismatch.
    numeric_out = list(numeric_cols) if 'scaler' in fitted_steps else []
    transformed_cols = (
        numeric_out
        + list(fitted_steps['onehot'].get_feature_names_out())
        + list(fitted_steps['gender'].get_feature_names_out())
    )

    # Reuse the split's index: the targets keep the original (shuffled) row
    # labels, and a fresh RangeIndex would break index-aligned operations
    # between features and targets.
    X_train_df = pd.DataFrame(X_train_transformed, columns=transformed_cols, index=X_train.index)
    X_val_df = pd.DataFrame(X_val_transformed, columns=transformed_cols, index=X_val.index)

    return {
        'X_train': X_train_df,
        'train_targets': train_targets,
        'X_val': X_val_df,
        'val_targets': val_targets,
        'input_cols': input_cols,
        'preprocessor': preprocessor
    }

def preprocess_new_data(new_df: pd.DataFrame, preprocessor: ColumnTransformer, input_cols: list) -> pd.DataFrame:
    """
    Preprocess new data for prediction or model evaluation.

    Args:
        new_df: Data containing at least the columns in ``input_cols``.
        preprocessor: An already fitted ``ColumnTransformer``.
        input_cols: Names of the input columns to feed to the preprocessor.

    Returns:
        A DataFrame of transformed features that keeps the row index of
        ``new_df``. Column names are the preprocessor's output feature names
        with the ``"<step name>__"`` prefix removed (e.g. ``scaler__Age``
        becomes ``Age``), matching the unprefixed names used for the
        training features in ``preprocess_data``.
    """
    features = new_df[input_cols]
    transformed = preprocessor.transform(features)

    # ColumnTransformer prefixes each output name with "<step name>__" by
    # default. Strip only a genuine step prefix so names that are already
    # unprefixed are left untouched.
    step_prefixes = tuple(f"{step_name}__" for step_name, _, _ in preprocessor.transformers_)
    columns = []
    for feature_name in preprocessor.get_feature_names_out():
        prefix = next((p for p in step_prefixes if feature_name.startswith(p)), '')
        columns.append(feature_name[len(prefix):])

    return pd.DataFrame(transformed, columns=columns, index=features.index)
