<a href="https://colab.research.google.com/github/Resia05/supervised-learning/blob/main/process_bank_churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from typing import Tuple, List, Dict, Any

def select_columns(df: pd.DataFrame) -> Tuple[List[str], str, List[str], List[str], List[str]]:
    """
    Describe the role of each column used by the preprocessing code.

    The column names are fixed; ``df`` is accepted but not inspected.

    Args:
        df: Raw dataset (currently unused).

    Returns:
        A tuple ``(input_cols, target_col, numeric_cols, categorical_cols,
        label_cols)`` where ``input_cols`` lists every model input,
        ``target_col`` is the name of the target column, and the remaining
        three lists split the inputs into numeric, categorical and label
        columns.
    """
    numeric_cols = [
        'CreditScore',
        'Age',
        'Tenure',
        'Balance',
        'NumOfProducts',
        'HasCrCard',
        'IsActiveMember',
        'EstimatedSalary',
    ]
    categorical_cols = ['Geography']
    label_cols = ['Gender']

    # The model inputs are the three groups concatenated in this order.
    input_cols = [*numeric_cols, *categorical_cols, *label_cols]

    return input_cols, 'Exited', numeric_cols, categorical_cols, label_cols

def split_data(
    df: pd.DataFrame,
    target_col: str,
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the raw data into training and validation sets.

    The split is stratified on ``target_col``.

    Args:
        df: Raw dataset; must contain ``target_col``.
        target_col: Name of the column used for stratification.
        test_size: Size of the validation part, forwarded to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        ``(train_df, val_df)``.
    """
    train_df, val_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[target_col],
        random_state=random_state,
    )
    # Return a real tuple, as the annotation promises (train_test_split
    # itself returns a list).
    return train_df, val_df

def create_transformers(numeric_cols: List[str], categorical_cols: List[str], label_cols: List[str], scaler_numeric: bool) -> ColumnTransformer:
    """
    Build the (unfitted) column preprocessor.

    Args:
        numeric_cols: Columns handled by the ``'num'`` entry.
        categorical_cols: Columns one-hot encoded by the ``'cat'`` entry;
            categories unseen during fit are ignored at transform time.
        label_cols: Columns encoded by the ``'lbl'`` entry with the fixed
            categories ``['Female', 'Male']`` and the first category dropped.
        scaler_numeric: When true, numeric columns are scaled with
            ``MinMaxScaler``; when false, they are passed through unchanged.

    Returns:
        A ``ColumnTransformer`` whose entries are always ``'num'``, ``'cat'``
        and ``'lbl'``, in that order.
    """
    # Always register the numeric columns. ColumnTransformer drops every
    # column that is not listed, so leaving them out when scaling is disabled
    # would silently discard all numeric features and shift the positions of
    # the remaining entries.
    if scaler_numeric:
        numeric_transformer = Pipeline(steps=[
            ('scaler', MinMaxScaler())
        ])
    else:
        numeric_transformer = 'passthrough'

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ])

    label_transformer = Pipeline(steps=[
        ('map_gender', OneHotEncoder(categories=[['Female', 'Male']], drop='first', sparse_output=False))
    ])

    transformers = [
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('lbl', label_transformer, label_cols),
    ]

    # Without the "<name>__" prefix, get_feature_names_out() yields the same
    # plain names ("Age", "Geography_France", ...) that preprocess_data uses
    # for its frames, so frames built from either source line up.
    preprocessor = ColumnTransformer(
        transformers=transformers,
        verbose_feature_names_out=False,
    )
    return preprocessor

def preprocess_data(raw_df: pd.DataFrame, scaler_numeric: bool = True) -> Dict[str, Any]:
    """
    Preprocess the raw data: split, fit the preprocessor, transform.

    The preprocessor is fitted on the training part only and then applied to
    both the training and the validation part.

    Args:
        raw_df: Raw dataset containing the input columns and the target.
        scaler_numeric: Forwarded to ``create_transformers``; controls whether
            numeric columns are scaled.

    Returns:
        A dict with the keys ``'X_train'``, ``'train_targets'``, ``'X_val'``,
        ``'val_targets'``, ``'input_cols'`` and ``'preprocessor'`` (the fitted
        ``ColumnTransformer``). The feature frames keep the row index of the
        corresponding split, so they stay aligned with the target Series.
    """
    input_cols, target_col, numeric_cols, categorical_cols, label_cols = select_columns(raw_df)
    train_df, val_df = split_data(raw_df, target_col)
    X_train, train_targets = train_df[input_cols], train_df[target_col]
    X_val, val_targets = val_df[input_cols], val_df[target_col]

    preprocessor = create_transformers(numeric_cols, categorical_cols, label_cols, scaler_numeric)
    preprocessor.fit(X_train)

    # Take the output names from the fitted preprocessor itself instead of
    # indexing transformers_ by position; the positional lookup breaks as soon
    # as the set of registered transformers changes (e.g. scaling disabled).
    # A leading "<transformer name>__" prefix, if present, is removed so the
    # columns keep their plain names ("Age", "Geography_France", ...).
    prefixes = tuple(f"{name}__" for name, _, _ in preprocessor.transformers_)
    transformed_cols = []
    for feature in preprocessor.get_feature_names_out():
        prefix = next((p for p in prefixes if feature.startswith(p)), '')
        transformed_cols.append(feature[len(prefix):])

    # Preserve the original row index; the targets are slices of the split
    # frames and still carry it.
    X_train_df = pd.DataFrame(
        preprocessor.transform(X_train),
        columns=transformed_cols,
        index=X_train.index,
    )
    X_val_df = pd.DataFrame(
        preprocessor.transform(X_val),
        columns=transformed_cols,
        index=X_val.index,
    )

    return {
        'X_train': X_train_df,
        'train_targets': train_targets,
        'X_val': X_val_df,
        'val_targets': val_targets,
        'input_cols': input_cols,
        'preprocessor': preprocessor
    }

def preprocess_new_data(new_df: pd.DataFrame, preprocessor: ColumnTransformer, input_cols: List[str]) -> pd.DataFrame:
    """
    Preprocess new data for prediction or model evaluation.

    Args:
        new_df: New data; must contain every column in ``input_cols``.
        preprocessor: An already fitted ``ColumnTransformer``.
        input_cols: Columns of ``new_df`` to feed into the preprocessor.

    Returns:
        The transformed features as a DataFrame that keeps the row index of
        ``new_df``. Column names are the preprocessor's output feature names
        with any leading ``"<transformer name>__"`` prefix removed, which is
        the naming ``preprocess_data`` uses for its training frame.
    """
    transformed = preprocessor.transform(new_df[input_cols])

    # Strip the per-transformer prefix so the columns match the names used at
    # training time; names without a known prefix are kept unchanged.
    prefixes = tuple(f"{name}__" for name, _, _ in preprocessor.transformers_)
    columns = []
    for feature in preprocessor.get_feature_names_out():
        prefix = next((p for p in prefixes if feature.startswith(p)), '')
        columns.append(feature[len(prefix):])

    return pd.DataFrame(transformed, columns=columns, index=new_df.index)

