## Setup & Imports

In [19]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell execution, so edits to the project modules take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import sys
import os
import pandas as pd
import numpy as np
import warnings

In [21]:
# Make the parent of the current working directory importable so that `src.*` resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries to sys.path.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import src.config as cfg
from src.preprocessing import DataPreprocessor
from src.encoders import GlobalEncoder
from src.segmentation import SegmentManager
from src.modeling import ModelTrainer
from src.evaluation import ModelEvaluator
from src.interpretation import ShapAnalyzer
from src.deploy import ModelDeployer

In [22]:
# Notebook-wide settings: never truncate DataFrame columns when displaying,
# and suppress every warning for the rest of the session.
pd.options.display.max_columns = None
warnings.filterwarnings(action='ignore')

## Data Loading

In [23]:
# Show the configured data directory (it is joined with '..' when the CSVs are read).
print(f"Loading data from: {cfg.DATA_PATH}")

Loading data from: dataset/


In [24]:
# Resolve the data directory one level above the current working directory,
# then read the train and test CSV files from it.
data_dir = os.path.join('..', cfg.DATA_PATH)
df_train = pd.read_csv(os.path.join(data_dir, cfg.TRAIN_FILE))
df_test = pd.read_csv(os.path.join(data_dir, cfg.TEST_FILE))

In [25]:
# Sanity check: report (rows, columns) of both splits.
train_shape, test_shape = df_train.shape, df_test.shape
print(f"Train shape: {train_shape}")
print(f"Test shape:  {test_shape}")

Train shape: (103904, 25)
Test shape:  (25976, 25)


In [26]:
# Preview the first three rows of the raw training data.
df_train.head(3)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied


## Global Preprocessing

In [27]:
# Preprocessing helper from src.preprocessing; used for every cleaning step in this section.
dp = DataPreprocessor()

In [28]:
# Display the columns the config lists as identifiers (they are dropped in the next step).
cfg.ID_COLS

['Unnamed: 0', 'id']

In [29]:
# Remove the identifier columns from both splits (train first, then test).
df_train, df_test = (dp.drop_columns(frame, cfg.ID_COLS) for frame in (df_train, df_test))

Dropped columns: ['Unnamed: 0', 'id']
Dropped columns: ['Unnamed: 0', 'id']


In [30]:
# Display the raw delay columns that the following NaN-filling and log-sum steps operate on.
cfg.DELAY_INPUT_COLS

['Departure Delay in Minutes', 'Arrival Delay in Minutes']

In [31]:
# Fill missing delay values with 0 in both splits (train first, then test).
df_train, df_test = (
    dp.fill_na(frame, cfg.DELAY_INPUT_COLS, value=0)
    for frame in (df_train, df_test)
)

Filled NaNs with 0 in 1 columns.
Filled NaNs with 0 in 1 columns.


In [32]:
# Combine the raw delay columns into the single cfg.DELAY_OUTPUT_COL feature and drop the
# inputs (drop_input=True). The exact formula lives in DataPreprocessor.transform_log_sum.
df_train, df_test = (
    dp.transform_log_sum(frame, cfg.DELAY_INPUT_COLS, cfg.DELAY_OUTPUT_COL, drop_input=True)
    for frame in (df_train, df_test)
)

Log-transform applied to 'Total Delay Log'. Dropped: ['Departure Delay in Minutes', 'Arrival Delay in Minutes']
Log-transform applied to 'Total Delay Log'. Dropped: ['Departure Delay in Minutes', 'Arrival Delay in Minutes']


In [33]:
# Report how many zeros each service-rating column contains in the training split.
# NOTE(review): the return value is kept in zero_stats but not used later in this notebook.
zero_stats = dp.analyze_zeros(df_train, columns=cfg.SERVICE_COLS)


=== Zero Values Analysis ===
                                   Zeros Count  Zeros %
Departure/Arrival time convenient         5300     5.10
Ease of Online booking                    4487     4.32
Inflight wifi service                     3103     2.99
Online boarding                           2428     2.34
Leg room service                           472     0.45
Food and drink                             107     0.10
Inflight entertainment                      14     0.01
Cleanliness                                 12     0.01
Gate location                                1     0.00
Seat comfort                                 1     0.00
On-board service                             3     0.00
Checkin service                              1     0.00
Inflight service                             3     0.00

Total columns checked: 14


In [34]:
# Display the service-rating columns used by the zero analysis and zero replacement steps.
cfg.SERVICE_COLS

['Inflight wifi service',
 'Departure/Arrival time convenient',
 'Ease of Online booking',
 'Gate location',
 'Food and drink',
 'Online boarding',
 'Seat comfort',
 'Inflight entertainment',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Inflight service',
 'Cleanliness']

In [35]:
# Turn 0 ratings in the service columns into NaN for both splits
# (presumably 0 means "not rated / not applicable" - confirm against the dataset docs).
df_train, df_test = (
    dp.replace_values(frame, cfg.SERVICE_COLS, old_value=0, new_value=np.nan)
    for frame in (df_train, df_test)
)

Replaced '0' with 'nan' in 13 columns.
Replaced '0' with 'nan' in 10 columns.


## Global Encoding

In [None]:
# Build the shared encoder from the project config: explicit mappings for some columns plus
# a list of columns to encode automatically (semantics are defined in src.encoders).
encoder = GlobalEncoder(
    manual_mappings=cfg.ENCODER_MANUAL_CONFIG,
    auto_cols=cfg.AUTO_ENCODING_COLS
)

In [None]:
# Fit the encoder on the training split only, then apply the already-fitted encoder
# to the test split so that both splits share the same encoding.
df_train = encoder.fit_transform(df_train)
df_test = encoder.transform(df_test)

In [None]:
# Convert the target labels to their encoded values via cfg.TARGET_MAP.
# Series.map yields NaN for every label that is missing from the mapping, which also happens
# when this cell is re-run on an already-mapped column. Validate both splits first and only
# then assign, so a failure never leaves the frames half-converted or with NaN targets.
_mapped_targets = {}
for _split_name, _split_df in (('train', df_train), ('test', df_test)):
    _mapped = _split_df[cfg.TARGET_COL].map(cfg.TARGET_MAP)
    _missing = _mapped.isna()
    if _missing.any():
        _unmapped = _split_df.loc[_missing, cfg.TARGET_COL].unique().tolist()
        raise ValueError(
            f"Values in '{cfg.TARGET_COL}' ({_split_name}) not covered by cfg.TARGET_MAP: "
            f"{_unmapped}. If this cell was already executed, reload the data before re-running it."
        )
    _mapped_targets[_split_name] = _mapped

df_train[cfg.TARGET_COL] = _mapped_targets['train']
df_test[cfg.TARGET_COL] = _mapped_targets['test']

In [None]:
# Spot-check a few categorical columns after encoding; the trailing expression is
# rendered as the cell output.
print("Encoding complete. Example data:")
df_train[['Class', 'Class_Encoded', 'Type of Travel', 'Gender']].head()

## The Core Loop: Segmentation -> Modeling -> Analysis

In [None]:
# Pipeline components, all configured from the project config.
# Yields per-segment train/test splits (consumed by the segment loop).
seg_manager = SegmentManager(cfg.SEGMENT_CONFIGS, cfg.TARGET_COL, min_samples=cfg.MIN_SEGMENT_SIZE)
# Trains a model per segment with fixed XGBoost params and a fixed random state.
trainer = ModelTrainer(fixed_params=cfg.XGB_FIXED_PARAMS, random_state=cfg.RANDOM_STATE)
# Computes evaluation metrics from predictions.
evaluator = ModelEvaluator()
# Saves trained models under cfg.MODELS_SAVE_PATH.
deployer = ModelDeployer(save_path=cfg.MODELS_SAVE_PATH)

In [None]:
# Metrics collected by the segment loop, keyed by segment name.
all_metrics = {}

In [None]:
# Core loop: for every segment train a model, save it, validate it on the segment's test
# split (when one exists) and produce SHAP-based interpretation plots.
for data in seg_manager.iterate_segments(df_train, df_test):
    seg_name = data['name']
    X_train, y_train = data['X_train'], data['y_train']
    X_test, y_test = data['X_test'], data['y_test']
    has_test = X_test is not None

    print(f"\nTraining model for: {seg_name} ...")

    # 1. Training (GridSearch + auto balancing)
    best_model = trainer.train(X_train, y_train, param_grid=cfg.XGB_PARAM_GRID, scoring='f1')

    # 2. Saving
    deployer.save_model(best_model, filename=seg_name)

    # 3. Validation (only possible when the segment has a test split)
    if has_test:
        y_pred = best_model.predict(X_test)
        y_proba = best_model.predict_proba(X_test)[:, 1]

        # Compute the metrics
        metrics = evaluator.calculate_metrics(y_test, y_pred, y_proba)
        all_metrics[seg_name] = metrics

        print(f"Metrics: {metrics}")

        # Draw the confusion matrix (optional, can stay commented out to save space)
        # evaluator.plot_confusion_matrix(y_test, y_pred, title=f"CM: {seg_name}")

    # 4. Interpretation (SHAP)
    # The validation step above allows X_test to be None, so the SHAP plots must not assume
    # it exists: fall back to the training data instead of crashing on a missing test split.
    X_explain = X_test if has_test else X_train
    if not has_test:
        print("No test data for this segment; SHAP plots use the training data instead.")

    # Initialise the analyzer (expensive operation, done once per segment)
    shap_analyzer = ShapAnalyzer(best_model, X_train)

    print(f"--- SHAP Insights for {seg_name} ---")

    # A. Summary plot (top drivers)
    shap_analyzer.plot_summary(X_explain, max_display=10)

    # B. Saturation analysis (look for a plateau in the top-3 features)
    top_df = shap_analyzer.get_top_features(X_explain, top_n=3)
    top_feats = top_df['Feature'].tolist()

    for feat in top_feats:
        # Skip binary features (e.g. Gender); plot only features with more than two values
        if X_train[feat].nunique() > 2:
            shap_analyzer.plot_dependence(X_explain, feature=feat)

    # C. Delay interaction check (does service quality compensate for a delay?)
    # Colour the delay dependence by the strongest top feature that is not the delay itself;
    # skip the plot when there is no such feature (empty list or only the delay column).
    if cfg.DELAY_OUTPUT_COL in X_explain.columns:
        top_service = next((f for f in top_feats if f != cfg.DELAY_OUTPUT_COL), None)
        if top_service is not None:
            shap_analyzer.plot_interaction(X_explain, feature_x=cfg.DELAY_OUTPUT_COL, feature_color=top_service)

    print("-" * 50)

## Final Comparison & Outcomes

In [None]:
print("=== FINAL MODEL COMPARISON ===")
df_results = ModelEvaluator.compare_segments(all_metrics)

# Styled output: shade the key metric columns with a green gradient.
highlight_cols = ['F1-Score', 'Specificity']
df_results.style.background_gradient(cmap='Greens', subset=highlight_cols)