In [1]:
import os
import json
import pandas as pd
import numpy as np
import joblib
import shap
import lime.lime_tabular
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the raw bookings table and discard columns that are not used as features
# (errors='ignore' keeps this working if any of them is already absent).
df = pd.read_csv("../data/H1.csv").drop(
    columns=['Company', 'Agent', 'ReservationStatusDate'],
    errors='ignore',
)

# Label-encode every object-typed column in place; one fitted encoder is kept
# per column so the same mapping can be reused later.
cat_cols = df.select_dtypes(include='object').columns.tolist()
encoders = {column: LabelEncoder() for column in cat_cols}
for column, encoder in encoders.items():
    # Cast to str first so every value in the column is encoded as a string.
    df[column] = encoder.fit_transform(df[column].astype(str))

In [6]:
# Split target & features: 'IsCanceled' is the label, every other column is an input
target = df['IsCanceled']
features = df.drop(columns=['IsCanceled'])

# Create artifacts directory (no-op when it already exists)
os.makedirs('artifacts', exist_ok=True)

# Save encoders and feature names.
# The JSON file is written through a context manager so the handle is flushed
# and closed before anything else tries to read it.
joblib.dump(encoders, 'artifacts/encoders.joblib')
with open('artifacts/feature_names.json', 'w') as feature_names_file:
    json.dump(features.columns.tolist(), feature_names_file)

# Scale data: fit the scaler on all feature rows and keep the scaled matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)
joblib.dump(scaler, 'artifacts/scaler.joblib')

# Fit PCA (2 and 3 components). Only the fitted estimators are needed here, so
# call fit() directly instead of assigning a discarded projection to `_`, which
# would also shadow IPython's last-output variable.
pca2 = PCA(n_components=2)
pca3 = PCA(n_components=3)
pca2.fit(X_scaled)
pca3.fit(X_scaled)
joblib.dump(pca2, 'artifacts/pca2.joblib')
joblib.dump(pca3, 'artifacts/pca3.joblib')

['artifacts/pca3.joblib']

In [7]:
# Hold out 30% of the scaled rows (for model evaluation), stratified on the target
split_options = dict(test_size=0.3, random_state=42, stratify=target)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, **split_options)

# Fit the Isolation Forest on the training portion only, then persist it
model = IsolationForest(
    random_state=42,
    contamination=0.05,
    n_estimators=100,
).fit(X_train)
joblib.dump(model, 'artifacts/iso_forest.joblib')



['artifacts/iso_forest.joblib']

In [8]:
# Compute SHAP explainer and values on full dataset
# The explainer wraps the `model` object fitted in the previous cell.
shap_explainer = shap.TreeExplainer(model)
# Values are computed for every row of X_scaled (the whole scaled matrix, not
# only the X_train portion the model was fitted on).
shap_values = shap_explainer.shap_values(X_scaled)
# Persist both the explainer object and the computed values under artifacts/.
# NOTE(review): the explainer is pickled after shap_values() has been called;
# keep this order unless it is confirmed the call leaves no state on it.
joblib.dump(shap_explainer, 'artifacts/shap_explainer.joblib')
np.save('artifacts/shap_values.npy', shap_values)


In [13]:
import dill

# Configure the LIME tabular explainer over the full scaled feature matrix,
# labelling columns with the original feature names.
lime_options = {
    'training_data': X_scaled,
    'feature_names': features.columns.tolist(),
    'class_names': ['Normal', 'Anomaly'],
    'mode': 'classification',
    'verbose': False,
    'random_state': 42,
}
lime_explainer = lime.lime_tabular.LimeTabularExplainer(**lime_options)

# Serialize with dill rather than joblib: per the original note, the explainer
# contains lambdas that need dill to be pickled.
with open('artifacts/lime_explainer.pkl', 'wb') as explainer_file:
    dill.dump(lime_explainer, explainer_file)