1. Project Setup

# Make the project's own packages importable by putting the project root
# at the front of the module search path.
import sys
from pathlib import Path

# The root is taken to be the parent of the current working directory,
# i.e. the notebook is assumed to run from a folder such as notebook/.
project_root = Path.cwd().parent
search_path_entry = str(project_root)
if search_path_entry not in sys.path:
    sys.path.insert(0, search_path_entry)


2. Import Required Modules

In [None]:
import pandas as pd
import joblib
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

# Your custom project modules
from vehicle_insurance_fraud_detection.dataset import load_clean_data
from vehicle_insurance_fraud_detection.features import split_features_targets
from vehicle_insurance_fraud_detection.modeling.tune_and_save_best_xgb import XGBoostTuner
from vehicle_insurance_fraud_detection.plots import (
    plot_class_distribution,
    plot_feature_importance,
    plot_confusion_matrix,
    plot_roc_curve
)
from vehicle_insurance_fraud_detection.config import MODELS_DIR


3. Load and Explore Dataset

In [None]:
# Load the cleaned dataset and take a first look at it.
df = load_clean_data()

# A bare `df.head()` in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is rendered), so print it explicitly.
print(df.head())

# info() writes its summary to stdout itself; no wrapping needed.
df.info()

# Last expression of the cell: rendered by the notebook as the cell output.
df.describe()


4. Visualize Class Imbalance

# Only the second value returned by split_features_targets (the target) is
# needed for this plot; the first one is ignored here.
_, y = split_features_targets(df)

# Build the class-distribution figure and display it right away.
plot_class_distribution(y).show()


5. Split Features and Targets

# Unpack both values returned by split_features_targets: X (features) and
# y (target). Both names are reused by the resampling, feature-importance
# and evaluation cells further down.
X, y = split_features_targets(df)


6. Train Best Model with SMOTE + Hyperparameter Tuning

# Oversample with SMOTE before tuning; random_state is fixed at 42 so the
# resampling is repeatable between runs.
# NOTE(review): the whole dataset is resampled before it reaches the tuner.
# If XGBoostTuner.tune() cross-validates internally, its validation folds
# would contain synthetic points derived from training rows — confirm how
# tune() splits the data, or move the resampling inside the pipeline.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# NOTE(review): best_model is not referenced by the later cells, which
# reload the model from MODELS_DIR instead; presumably tune() also persists
# the fitted model there — verify against the tuner implementation.
tuner = XGBoostTuner()
best_model = tuner.tune(X_resampled, y_resampled)


7. Feature Importance

# Reload the persisted pipeline and read the importances from its "xgb" step.
model_path = MODELS_DIR / "best_xgb_model.pkl"
model = joblib.load(model_path)
xgb_step = model.named_steps["xgb"]
importances = xgb_step.feature_importances_

# Column labels of X serve as the feature names (also written to disk later).
# NOTE(review): assumes the "xgb" step was trained on exactly X's columns, in
# the same order, so both sequences have equal length — TODO confirm.
feature_names = X.columns

# One row per feature: its name next to its importance value.
importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
)

plot_feature_importance(importance_df).show()


8. Evaluation on Training Data (for now)

# Score the loaded model on X: hard labels plus the probability of the
# positive class (second column of predict_proba).
# NOTE(review): X is the same data the training cells above start from, so
# these are presumably in-sample metrics — confirm before reporting them.
y_pred = model.predict(X)
y_proba = model.predict_proba(X)[:, 1]

# Compute every metric first ...
cm = confusion_matrix(y, y_pred)
fpr, tpr, _ = roc_curve(y, y_proba)
roc_score = roc_auc_score(y, y_proba)

# ... then render the plots in the same order as before:
# confusion matrix, then ROC curve.
plot_confusion_matrix(cm).show()
fig = plot_roc_curve(fpr, tpr, roc_score)
fig.show()


9. Save Artifacts 

# Save feature names, one per line, next to the model artifacts.
# The encoding is pinned to UTF-8 so the bytes written do not depend on the
# platform's locale default; for pure-ASCII names the output is unchanged.
with open(MODELS_DIR / "feature_names.txt", "w", encoding="utf-8") as f:
    # Single buffered call instead of one write() per name.
    f.writelines(f"{name}\n" for name in feature_names)


10. Conclusion

- Model: XGBoost with SMOTE
- Best F1 Score from CV: (check tuner output)
- The model predicts fraudulent insurance claims with reasonable confidence.
- The model can be served through Streamlit; that deployment has already been set up.


##### The entire notebook is available on my GitHub