In [10]:
import os
import json
import pandas as pd
import numpy as np
import joblib
import shap
import lime.lime_tabular
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split


# Data Preparation

In [11]:
from sklearn.preprocessing import OrdinalEncoder

# Load the raw CSV and discard three columns in one chained expression;
# errors='ignore' keeps this working when any of them is absent from the file.
df = pd.read_csv("../data/H1.csv").drop(
    columns=['Company', 'Agent', 'ReservationStatusDate'],
    errors='ignore',
)

# Every object-dtype column is cast to str and replaced in place by the
# numeric codes produced by a freshly fitted OrdinalEncoder.
encoder = OrdinalEncoder()
cat_cols = df.select_dtypes(include='object').columns.tolist()
categorical_as_text = df[cat_cols].astype(str)
df[cat_cols] = encoder.fit_transform(categorical_as_text)


In [12]:
# Create the artifacts directory FIRST: every write in this cell (and in the
# later cells) targets it, and DataFrame.to_csv does not create missing
# parent directories, so on a fresh checkout the write would otherwise fail.
os.makedirs('artifacts', exist_ok=True)

# Persist the encoded frame (label column included) for downstream consumers.
df.to_csv('artifacts/processed_data.csv', index=False)

# Split target & features
target = df['IsCanceled']
features = df.drop(columns=['IsCanceled'])

# Rows labelled IsCanceled == 0; kept here because later cells may rely on
# this name being defined after this cell runs.
normal_data = df[df['IsCanceled'] == 0]

# Save the feature column order. The context manager guarantees the handle is
# flushed and closed, which the bare open() passed to json.dump did not.
with open('artifacts/feature_names.json', 'w') as f:
    json.dump(features.columns.tolist(), f)


In [None]:
# Partition the rows by label value.
normal_data = df.loc[df['IsCanceled'] == 0]
abnormal_data = df.loc[df['IsCanceled'] == 1]

# Feature matrices (label column removed) for each partition.
X_normal = normal_data.drop(columns=['IsCanceled'])
X_abnormal_test = abnormal_data.drop(columns=['IsCanceled'])

# Train-test split (for model evaluation): only the IsCanceled == 0 rows are
# split; 30% of them are held out.
X_normal_train, X_normal_test = train_test_split(
    X_normal, test_size=0.3, random_state=42
)

# Evaluation set = held-out normal rows followed by all IsCanceled == 1 rows.
X_test = pd.concat([X_normal_test, X_abnormal_test])

# Labels are rebuilt positionally to match the concat order above:
# 0 for each held-out normal row, then 1 for each abnormal row.
n_normal_test = len(X_normal_test)
n_abnormal_test = len(X_abnormal_test)
y_test = pd.Series([0] * n_normal_test + [1] * n_abnormal_test, index=X_test.index)

['artifacts/iso_forest.joblib']

# Modeling

In [None]:
# Train Isolation Forest
# Hyperparameters are gathered in one place so they are easy to spot and tune.
# NOTE(review): the forest is fitted on X_normal_train only, yet
# contamination=0.30 sets the decision threshold so that roughly 30% of the
# training samples are scored as anomalies - confirm this is intended.
iso_forest_params = dict(n_estimators=150, contamination=0.30, random_state=104)

# fit() returns the estimator itself, so construction and training chain.
model = IsolationForest(**iso_forest_params).fit(X_normal_train)

# Persist the fitted model for later use.
joblib.dump(model, 'artifacts/iso_forest.joblib')


# Evaluation

In [14]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

import matplotlib.pyplot as plt

# Score the evaluation set. IsolationForest.predict returns -1 for anomalies
# and 1 for normal points; remap so that 1 = anomaly, 0 = normal to line up
# with y_test.
y_pred = model.predict(X_test)
is_anomaly = y_pred == -1
y_pred_bin = is_anomaly.astype(int)

# Rows of the matrix are actual labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred_bin)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair,
# write it to disk, then close the figure so nothing is rendered inline.
class_labels = ['Normal', 'Anomaly']
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.tight_layout()
fig.savefig('artifacts/confusion_matrix.png')
plt.close(fig)

In [15]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Map each reported metric name to the scikit-learn scorer that computes it;
# insertion order fixes the order in the JSON file and the printout.
scorers = {
    "balanced_accuracy": balanced_accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1_score": f1_score,
}

# Evaluate every scorer on the same (true labels, binarised predictions) pair.
metrics = {name: scorer(y_test, y_pred_bin) for name, scorer in scorers.items()}

# Persist the scores as JSON alongside the other artifacts.
with open('artifacts/model_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=4)

# Echo the scores, rounded to four decimals.
print("Model performance metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

Model performance metrics:
balanced_accuracy: 0.4898
precision: 0.5435
recall: 0.2696
f1_score: 0.3604


In [16]:
# Compute SHAP explainer and values on full dataset
# The explainer is built from the fitted `model`; SHAP values are computed for
# every row of `features` (all rows of the frame, not just the test split).
shap_explainer = shap.TreeExplainer(model)
shap_values = shap_explainer.shap_values(features)
# Persist both: the explainer object via joblib, the value array via NumPy's
# .npy format.
joblib.dump(shap_explainer, 'artifacts/shap_explainer.joblib')
np.save('artifacts/shap_values.npy', shap_values)


In [17]:
import dill

# Settings for the LIME tabular explainer: it is given the raw values of
# every row in `features` as background data, the matching column names,
# and the two class labels used for reporting.
lime_config = dict(
    training_data=features.values,
    feature_names=features.columns.tolist(),
    class_names=['Normal', 'Anomaly'],
    mode='classification',
    verbose=False,
    random_state=42,
)
lime_explainer = lime.lime_tabular.LimeTabularExplainer(**lime_config)

# Serialise with dill rather than pickle/joblib; per the original note the
# explainer holds lambdas, which dill is able to pickle.
with open('artifacts/lime_explainer.pkl', 'wb') as f:
    dill.dump(lime_explainer, f)

# Deployment via Dashboard