In [2]:
import sys
from pathlib import Path

# Add project root to path
root = Path.cwd()
while not (root / "src").exists() and root != root.parent:
    root = root.parent
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

print(f"Project root: {root}")
print(f"src exists: {(root / 'src').is_dir()}")

# Imports
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from src.pipelines import HotelCancellationClassification
from src.utils import ModelMetadata

# Configuration
DATA_PATH = root / "data" / "processed" / "hotel_bookings_processed.csv"
TARGET_COL = "is_canceled"
RANDOM_STATE = 42

# Verify data exists
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Processed data not found: {DATA_PATH}")

print(f"\nData path: {DATA_PATH}")
print(f"File size: {DATA_PATH.stat().st_size / 1024 / 1024:.2f} MB")

Project root: d:\Uni\LUND\2. Intro to Programming\Project\DABN13---Project
src exists: True

Data path: d:\Uni\LUND\2. Intro to Programming\Project\DABN13---Project\data\processed\hotel_bookings_processed.csv
File size: 33.00 MB


In [3]:
df = pd.read_csv(DATA_PATH)

# DON'T one-hot encode here - let the pipeline handle it
print(f"Data shape: {df.shape}")
print(f"\nTarget distribution:\n{df[TARGET_COL].value_counts(normalize=True)}")
print(f"\nFeature types:\n{df.dtypes.value_counts()}")
df.head()

Data shape: (86544, 69)

Target distribution:
is_canceled
0    0.724025
1    0.275975
Name: proportion, dtype: float64

Feature types:
bool       46
int64      17
float64     6
Name: count, dtype: int64


Unnamed: 0,is_canceled,children,babies,is_repeated_guest,required_car_parking_spaces,total_of_special_requests,stays_in_weekend_nights,stays_in_week_nights,previous_cancellations,previous_bookings_not_canceled,...,arrival_date_month_November,arrival_date_month_October,arrival_date_month_September,adults_cat_2,adults_cat_3+,total_of_special_requests_cat_1,total_of_special_requests_cat_2+,stays_in_weekend_nights_cat_1,stays_in_weekend_nights_cat_2,stays_in_weekend_nights_cat_3+
0,0,0.0,0,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,False
1,0,0.0,0,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,False
2,0,0.0,0,0,0,0,0,1,0,0,...,False,False,False,False,False,False,False,False,False,False
3,0,0.0,0,0,0,0,0,1,0,0,...,False,False,False,False,False,False,False,False,False,False
4,0,0.0,0,0,0,1,0,2,0,0,...,False,False,False,True,False,True,False,False,False,False


In [4]:
analysis = HotelCancellationClassification(
    data_path=DATA_PATH,
    target_col=TARGET_COL,
    features=None,  # Use all columns except target
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=True,
    experiment_name="hotel_cancellation_v1"
)

# Create train/val/test split (70% train, 15% val, 15% test)
analysis.prepare_split(val_size=0.15)

Features: 22 numeric, 0 categorical
Split: train=56253 val=12982 test=17309 | TrainPos=0.276 ValPos=0.276 TestPos=0.276


In [8]:
print("=" * 70)
print("GRID SEARCH OPTIMIZATION")
print("=" * 70)

# Logistic Regression Grid Search
print("\n1. Logistic Regression GridSearch")
analysis.grid_search(
    estimator=LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    param_grid={
        "C": [0.01, 0.1, 1.0, 10.0, 100.0],
        "penalty": ["l2"],
        "solver": ["lbfgs", "saga"]
    },
    scoring="f1",
    cv=5,
    use_validation=True
)

# Decision Tree Grid Search
print("\n2. Decision Tree GridSearch")
analysis.grid_search(
    estimator=DecisionTreeClassifier(random_state=RANDOM_STATE),
    param_grid={
        "max_depth": [3, 4, 5, 6, 8, 10, None],
        "min_samples_split": [2, 5, 10, 20],
        "min_samples_leaf": [1, 2, 4]
    },
    scoring="f1",
    cv=5,
    use_validation=True
)

print("\n" + "=" * 70)
print("GRID SEARCH COMPLETE")
print("=" * 70)

GRID SEARCH OPTIMIZATION

1. Logistic Regression GridSearch
Fitting 5 folds for each of 10 candidates, totalling 50 fits

2. Decision Tree GridSearch
Fitting 5 folds for each of 84 candidates, totalling 420 fits

GRID SEARCH COMPLETE


In [5]:
# Random Forest Grid Search
print("\n3. Random Forest GridSearch")
from sklearn.ensemble import RandomForestClassifier

analysis.grid_search(
    estimator=RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    param_grid={
        "n_estimators": [100, 200, 300],
        "max_depth": [5, 10, 15, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2"]
    },
    scoring="f1",
    cv=5,
    use_validation=True
)

print("\n" + "=" * 70)
print("GRID SEARCH COMPLETE")
print("=" * 70)


3. Random Forest GridSearch
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 

In [6]:
try:
    from xgboost import XGBClassifier
    
    print("=" * 70)
    print("XGBOOST GRID SEARCH")
    print("=" * 70)
    
    analysis.grid_search(
        estimator=XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",
            random_state=RANDOM_STATE,
            verbosity=0
        ),
        param_grid={
            "n_estimators": [100, 200, 300],
            "max_depth": [3, 4, 6],
            "learning_rate": [0.01, 0.05, 0.1],
            "subsample": [0.8, 1.0],
            "colsample_bytree": [0.8, 1.0]
        },
        scoring="f1",
        cv=3,  # Fewer folds due to large param space
        use_validation=True
    )
    
    print("\n✓ XGBoost training complete")
    
    # Update best model check
    best_xgb = analysis.metadata.best(metric="test_f1")
    print(f"\nBest model after XGBoost: {best_xgb['algorithm']}")
    
except ImportError:
    print("XGBoost not installed. Skipping.")
    print("Install with: pip install xgboost")

XGBOOST GRID SEARCH
Fitting 3 folds for each of 108 candidates, totalling 324 fits

✓ XGBoost training complete

Best model after XGBoost: XGBClassifier(GridSearch)


In [11]:
print("\n" + "=" * 70)
print("EXPERIMENT SUMMARY")
print("=" * 70)

analysis.metadata.summary()

# Find best models by different metrics
print("\n" + "-" * 70)
print("BEST MODELS BY METRIC")
print("-" * 70)

for metric in ["test_f1", "test_roc_auc", "test_accuracy"]:
    best = analysis.metadata.best(metric=metric)
    if best:
        print(f"\nBest by {metric}:")
        print(f"  Algorithm: {best['algorithm']}")
        print(f"  Params: {best['hyperparameters']}")
        print(f"  Score: {best['results'].get(metric, 'N/A'):.4f}")


EXPERIMENT SUMMARY

EXPERIMENT SUMMARY [hotel_cancellation_v1]
----------------------------------------------------------------------


ValueError: Invalid format specifier '.3f if acc else 0:.3f' for object of type 'float'

In [12]:
# Save metadata to JSON
analysis.metadata.save()  # Saves to models/experiments/hotel_cancellation_v1.json

# Save best model by F1 score
analysis.metadata.save_best_model(metric="test_f1")

print("✓ Metadata saved to: models/experiments/hotel_cancellation_v1.json")
print("✓ Best model saved to: models/best_models/")

Metadata written: models\experiments\hotel_cancellation_v1.json (total 3 experiments)
Best model copied: xgbclassifier_grid.joblib -> models\best_models\best_xgbclassifier.joblib


ValueError: Unknown format code 'f' for object of type 'str'

In [None]:
best = analysis.metadata.best(metric="test_f1")

if best:
    print(f"Evaluating best model: {best['algorithm']}")
    print(f"Hyperparameters: {best['hyperparameters']}")
    
    # Comprehensive evaluation report
    analysis.plot_evaluation_report(
        experiment=best,
        figsize=(16, 10),
        save_path="reports/figures/best_model_evaluation.png"
    )
else:
    print("No experiments found.")

In [None]:
best_tree = analysis.metadata.best(metric="test_f1", algorithm="DecisionTreeClassifier")

if best_tree:
    print("Decision Tree Feature Importance Analysis")
    analysis.display_feature_importance(
        experiment=best_tree,
        top_n=20,
        figsize=(12, 8)
    )
else:
    print("No decision tree experiments found.")