## Setup & Imports

In [58]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
import sys
import os
import pandas as pd
import numpy as np
import warnings

In [60]:
# Put the parent directory on the import path ahead of the `src.*` imports below.
# The membership check keeps the cell idempotent: re-running it no longer appends
# the same entry to sys.path again and again.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

import src.config as cfg
from src.preprocessing import DataPreprocessor
from src.encoders import GlobalEncoder
from src.segmentation import SegmentManager
from src.modeling import ModelTrainer
from src.evaluation import ModelEvaluator
from src.interpretation import ShapAnalyzer
from src.deploy import ModelDeployer
from src.pipeline import TrainingPipeline

In [61]:
# Session-wide notebook settings.
# Ignore every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Render all columns when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

## Data Loading

In [62]:
# Echo the data directory configured in cfg before reading from it.
print(f"Loading data from: {cfg.DATA_PATH}")

Loading data from: dataset/


In [63]:
# Read the train and test CSVs (in that order) from the configured data
# directory; the leading '..' makes the path relative to the parent directory.
df_train, df_test = (
    pd.read_csv(os.path.join('..', cfg.DATA_PATH, csv_name))
    for csv_name in (cfg.TRAIN_FILE, cfg.TEST_FILE)
)

In [64]:
# Report the (rows, columns) of both splits with a single print call.
print(
    f"Train shape: {df_train.shape}",
    f"Test shape:  {df_test.shape}",
    sep="\n",
)

Train shape: (103904, 25)
Test shape:  (25976, 25)


In [65]:
# Preview the first three raw training rows.
df_train.head(n=3)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied


## Global Preprocessing

In [66]:
# Preprocessing helper used by the following cells (drop_columns, fill_na,
# transform_log_sum, analyze_zeros, replace_values).
dp = DataPreprocessor()

In [67]:
# Display the identifier columns from the config; they are dropped in the next cell.
cfg.ID_COLS

['Unnamed: 0', 'id']

In [68]:
# Remove the identifier columns from both splits (train first, then test).
df_train, df_test = (
    dp.drop_columns(split, cfg.ID_COLS)
    for split in (df_train, df_test)
)

Dropped columns: ['Unnamed: 0', 'id']
Dropped columns: ['Unnamed: 0', 'id']


In [69]:
# Display the delay columns that the next cells fill and then combine.
cfg.DELAY_INPUT_COLS

['Departure Delay in Minutes', 'Arrival Delay in Minutes']

In [70]:
# Replace missing values in the delay columns with 0, for train and then test.
df_train, df_test = (
    dp.fill_na(split, cfg.DELAY_INPUT_COLS, value=0)
    for split in (df_train, df_test)
)

Filled NaNs with 0 in 1 columns.
Filled NaNs with 0 in 1 columns.


In [71]:
# Build the combined delay feature (cfg.DELAY_OUTPUT_COL) from the delay input
# columns and drop those inputs afterwards; applied to train, then test.
df_train, df_test = (
    dp.transform_log_sum(split, cfg.DELAY_INPUT_COLS, cfg.DELAY_OUTPUT_COL, drop_input=True)
    for split in (df_train, df_test)
)

Log-transform applied to 'Total Delay Log'. Dropped: ['Departure Delay in Minutes', 'Arrival Delay in Minutes']
Log-transform applied to 'Total Delay Log'. Dropped: ['Departure Delay in Minutes', 'Arrival Delay in Minutes']


In [72]:
# Summarise how often each service-rating column holds the value 0 in the
# training split; the helper prints its report and the result is kept in zero_stats.
zero_stats = dp.analyze_zeros(
    df_train,
    columns=cfg.SERVICE_COLS,
)


=== Zero Values Analysis ===
                                   Zeros Count  Zeros %
Departure/Arrival time convenient         5300     5.10
Ease of Online booking                    4487     4.32
Inflight wifi service                     3103     2.99
Online boarding                           2428     2.34
Leg room service                           472     0.45
Food and drink                             107     0.10
Inflight entertainment                      14     0.01
Cleanliness                                 12     0.01
Gate location                                1     0.00
Seat comfort                                 1     0.00
On-board service                             3     0.00
Checkin service                              1     0.00
Inflight service                             3     0.00

Total columns checked: 14


In [73]:
# Display the service-rating columns analysed above and cleaned in the next cell.
cfg.SERVICE_COLS

['Inflight wifi service',
 'Departure/Arrival time convenient',
 'Ease of Online booking',
 'Gate location',
 'Food and drink',
 'Online boarding',
 'Seat comfort',
 'Inflight entertainment',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Inflight service',
 'Cleanliness']

In [74]:
# Turn 0 entries in the service-rating columns into NaN, for train and then test.
df_train, df_test = (
    dp.replace_values(split, cfg.SERVICE_COLS, old_value=0, new_value=np.nan)
    for split in (df_train, df_test)
)

Replaced '0' with 'nan' in 13 columns.
Replaced '0' with 'nan' in 10 columns.


## Global Encoding

In [75]:
# Display the hand-written category -> integer mappings passed to the encoder below.
cfg.ENCODER_MANUAL_CONFIG

{'Class': {'Map': {'Eco': 0, 'Eco Plus': 1, 'Business': 2},
  'Suffix': '_Encoded'}}

In [76]:
# Display the columns handed to the encoder as `auto_cols` below.
cfg.AUTO_ENCODING_COLS

['Gender', 'Customer Type']

In [77]:
# One encoder instance shared by both splits: manual mappings come from
# cfg.ENCODER_MANUAL_CONFIG, automatically handled columns from cfg.AUTO_ENCODING_COLS.
encoder = GlobalEncoder(manual_mappings=cfg.ENCODER_MANUAL_CONFIG, auto_cols=cfg.AUTO_ENCODING_COLS)

In [78]:
# First five training rows before encoding (head() defaults to five rows).
df_train.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,satisfaction,Total Delay Log
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3.0,4.0,3.0,1.0,5.0,3.0,5.0,5.0,4.0,3.0,4,4.0,5.0,5.0,neutral or dissatisfied,3.78419
1,Male,disloyal Customer,25,Business travel,Business,235,3.0,2.0,3.0,3.0,1.0,3.0,1.0,1.0,1.0,5.0,3,1.0,4.0,1.0,neutral or dissatisfied,2.079442
2,Female,Loyal Customer,26,Business travel,Business,1142,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,4.0,3.0,4,4.0,4.0,5.0,satisfied,0.0
3,Female,Loyal Customer,25,Business travel,Business,562,2.0,5.0,5.0,5.0,2.0,2.0,2.0,2.0,2.0,5.0,3,1.0,4.0,2.0,neutral or dissatisfied,3.044522
4,Male,Loyal Customer,61,Business travel,Business,214,3.0,3.0,3.0,3.0,4.0,5.0,5.0,3.0,3.0,4.0,4,3.0,3.0,3.0,satisfied,0.0


In [79]:
# Fit the encoder on the training split only, then reuse the fitted encoder to
# transform the test split. The tuple is evaluated left to right, so the fit
# happens before the test transform.
df_train, df_test = (
    encoder.fit_transform(df_train),
    encoder.transform(df_test),
)

GlobalEncoder fitted on 2 auto-columns.
Created column 'Class_Encoded' using manual mapping.
Created column 'Class_Encoded' using manual mapping.


In [80]:
# First five training rows after encoding (head() defaults to five rows).
df_train.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,satisfaction,Total Delay Log,Class_Encoded
0,1,0,13,Personal Travel,Eco Plus,460,3.0,4.0,3.0,1.0,5.0,3.0,5.0,5.0,4.0,3.0,4,4.0,5.0,5.0,neutral or dissatisfied,3.78419,1
1,1,1,25,Business travel,Business,235,3.0,2.0,3.0,3.0,1.0,3.0,1.0,1.0,1.0,5.0,3,1.0,4.0,1.0,neutral or dissatisfied,2.079442,2
2,0,0,26,Business travel,Business,1142,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,4.0,3.0,4,4.0,4.0,5.0,satisfied,0.0,2
3,0,0,25,Business travel,Business,562,2.0,5.0,5.0,5.0,2.0,2.0,2.0,2.0,2.0,5.0,3,1.0,4.0,2.0,neutral or dissatisfied,3.044522,2
4,1,0,61,Business travel,Business,214,3.0,3.0,3.0,3.0,4.0,5.0,5.0,3.0,3.0,4.0,4,3.0,3.0,3.0,satisfied,0.0,2


In [81]:
# Display the label -> integer mapping applied to the target column in the next cell.
cfg.TARGET_MAP

{'neutral or dissatisfied': 0, 'satisfied': 1}

In [82]:
# Convert the text labels in the target column to the integers in cfg.TARGET_MAP.
# Series.map turns every value that is not a key of the mapping into NaN, so the
# plain two-line version wiped the target when the cell was executed a second
# time (labels already 0/1) and hid unexpected labels as NaN. This version is
# safe to re-run and fails loudly on labels the mapping does not know.
for _split in (df_train, df_test):
    _target = _split[cfg.TARGET_COL]
    if _target.isin(list(cfg.TARGET_MAP.values())).all():
        continue  # already encoded by an earlier run of this cell
    _unknown = set(_target.dropna().unique()) - set(cfg.TARGET_MAP)
    if _unknown:
        raise ValueError(f"Unmapped target labels: {sorted(map(repr, _unknown))}")
    # Assigning through the loop variable mutates the same DataFrame object that
    # df_train / df_test refer to.
    _split[cfg.TARGET_COL] = _target.map(cfg.TARGET_MAP)

In [83]:
# Spot-check: the raw 'Class' column next to its encoded counterpart, plus two
# other categorical columns, for the first five training rows.
print("Encoding complete. Example data:")
df_train.loc[:, ['Class', 'Class_Encoded', 'Type of Travel', 'Gender']].head(5)

Encoding complete. Example data:


Unnamed: 0,Class,Class_Encoded,Type of Travel,Gender
0,Eco Plus,1,Personal Travel,1
1,Business,2,Business travel,1
2,Business,2,Business travel,0
3,Business,2,Business travel,0
4,Business,2,Business travel,1


## The Core Loop: Segmentation -> Modeling -> Analysis

In [84]:
# Build the objects used by the per-segment loop further down.
# Segment definitions, target column and minimum segment size come from cfg.
seg_manager = SegmentManager(
    cfg.SEGMENT_CONFIGS,
    cfg.TARGET_COL,
    min_samples=cfg.MIN_SEGMENT_SIZE,
)
# Trainer configured with the fixed XGB parameters and the random state from cfg.
trainer = ModelTrainer(
    fixed_params=cfg.XGB_FIXED_PARAMS,
    random_state=cfg.RANDOM_STATE,
)
evaluator = ModelEvaluator()
# The deployer is given the relative base path "artifacts".
deployer = ModelDeployer(
    base_path="artifacts",
)

In [85]:
# Bundle trainer, evaluator and deployer into one pipeline object.
# TrainingPipeline (src.pipeline) is imported with the other project modules in
# the import cell, so the notebook's dependencies are declared in one place.
pipeline = TrainingPipeline(trainer, evaluator, deployer)

In [57]:
# Empty container for per-segment metrics, keyed by segment name.
all_metrics = dict()

In [None]:
# Run the pipeline on every segment yielded by seg_manager and collect the
# returned metrics under the segment's name.
# The results dict is (re)created here so this cell does not depend on a
# separately executed initialisation cell, and so re-running it cannot keep
# entries produced by an earlier run.
all_metrics = {}
for data in seg_manager.iterate_segments(df_train, df_test):
    metrics = pipeline.run_segment(data)

    # Falsy results (e.g. None or an empty container) are not recorded.
    if metrics:
        all_metrics[data['name']] = metrics

## Final Comparison & Outcomes

In [None]:
# Build the cross-segment comparison table, save it as CSV and render it with a
# green background gradient.
print("\n=== FINAL COMPARISON ===")
df_results = ModelEvaluator.compare_segments(all_metrics)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# report folder exists before writing (`os` is imported in the import cell).
report_path = "artifacts/reports/final_comparison.csv"
os.makedirs(os.path.dirname(report_path), exist_ok=True)
df_results.to_csv(report_path)

display(df_results.style.background_gradient(cmap='Greens'))