In [2]:
# 0) Setup (Python 3.9)
# If needed, install:
# !pip install pandas numpy scikit-learn xgboost plotly matplotlib seaborn joblib shap xlrd openpyxl streamlit

import os, json, math, textwrap, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import streamlit as st
from datetime import datetime, time as dtime
from typing import Tuple, Dict, List

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss, log_loss,
    confusion_matrix, precision_recall_curve, roc_curve
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import joblib
import xgboost as xgb


In [3]:
# 1) Paths & Config
DATA_PATH = "ncr_ride_bookings.csv"   # change if needed
ARTIFACTS_DIR = "artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

SEED = 42

LABEL_COL = "is_cancelled"
DATE_COL = "Date"
TIME_COL = "Time"

# Columns present in your dataset (for reference)
RAW_COLS = [
    "Date","Time","Booking ID","Booking Status","Customer ID","Vehicle Type",
    "Pickup Location","Drop Location","Avg VTAT","Avg CTAT",
    "Cancelled Rides by Customer","Reason for cancelling by Customer",
    "Cancelled Rides by Driver","Driver Cancellation Reason",
    "Incomplete Rides","Incomplete Rides Reason",
    "Booking Value","Ride Distance","Driver Ratings","Customer Rating","Payment Method"
]

# Columns that leak the outcome or occur after the event — DROP from features
LEAKY_COLS = [
    "Booking Status",                    # becomes our label
    "Cancelled Rides by Customer",
    "Reason for cancelling by Customer",
    "Cancelled Rides by Driver",
    "Driver Cancellation Reason",
    "Incomplete Rides",
    "Incomplete Rides Reason",
    "Booking ID",                        # identifier (not predictive by itself)
]

# Cost settings for threshold search (you can tune)
COST_TP = +5.0   # benefit of catching a would-be cancellation
COST_FP = -1.0   # cost of unnecessary intervention


In [4]:
# 2) Load & quick browse
df_raw = pd.read_csv(DATA_PATH)

print("Shape:", df_raw.shape)
print(df_raw.head(3))
print("\nNull counts:\n", df_raw.isna().sum())
print("\nDtypes:\n", df_raw.dtypes)

Shape: (150000, 21)
         Date      Time    Booking ID   Booking Status   Customer ID  \
0  2024-03-23  12:29:38  "CNR5884300"  No Driver Found  "CID1982111"   
1  2024-11-29  18:01:39  "CNR1326809"       Incomplete  "CID4604802"   
2  2024-08-23  08:56:10  "CNR8494506"        Completed  "CID9202816"   

  Vehicle Type Pickup Location      Drop Location  Avg VTAT  Avg CTAT  ...  \
0        eBike     Palam Vihar            Jhilmil       NaN       NaN  ...   
1     Go Sedan   Shastri Nagar  Gurgaon Sector 56       4.9      14.0  ...   
2         Auto         Khandsa      Malviya Nagar      13.4      25.8  ...   

   Reason for cancelling by Customer Cancelled Rides by Driver  \
0                                NaN                       NaN   
1                                NaN                       NaN   
2                                NaN                       NaN   

   Driver Cancellation Reason Incomplete Rides  Incomplete Rides Reason  \
0                         NaN         

In [5]:
# 3) Label + Booking-time feature engineering (no leakage)

def _parse_date_series(s: pd.Series) -> pd.Series:
    # Tries multiple formats robustly
    return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)

def _parse_time_series(s: pd.Series) -> pd.Series:
    # If "HH:MM:SS" or "HH:MM" – otherwise coerce
    return pd.to_datetime(s, errors="coerce").dt.time

def add_booking_time_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # Label: cancellation if Booking Status contains 'Cancelled'
    out[LABEL_COL] = out["Booking Status"].astype(str).str.contains("cancel", case=False, na=False).astype(int)

    # Parse date/time
    out["_Date_dt"] = _parse_date_series(out[DATE_COL])
    # Time may be a string or datetime-like
    tparsed = pd.to_datetime(out[TIME_COL], errors="coerce")
    out["_hour"] = tparsed.dt.hour

    # is_weekend (Sat=5, Sun=6)
    out["is_weekend"] = out["_Date_dt"].dt.weekday.isin([5, 6]).astype(int)

    # is_late: 00:00–05:59
    out["is_late"] = out["_hour"].between(0, 5, inclusive="both").astype(int)

    # Basic distance and fare safety features
    out["fare_per_km"] = np.where(out["Ride Distance"].fillna(0) > 0,
                                  out["Booking Value"] / out["Ride Distance"].replace(0, np.nan),
                                  np.nan)

    # Simple location relationship
    out["same_area"] = (out["Pickup Location"].astype(str).str.lower()
                        == out["Drop Location"].astype(str).str.lower()).astype(int)

    # Keep only booking-time-safe features (drop leakage)
    drop_cols = set(LEAKY_COLS)
    out = out.drop(columns=[c for c in out.columns if c in drop_cols], errors="ignore")

    return out

df = add_booking_time_features(df_raw)

print("Engineered shape:", df.shape)
print(df[[DATE_COL, TIME_COL, "is_weekend", "is_late", "fare_per_km", "same_area", LABEL_COL]].head(5))

Engineered shape: (150000, 20)
         Date      Time  is_weekend  is_late  fare_per_km  same_area  \
0  2024-03-23  12:29:38           1        0          NaN          0   
1  2024-11-29  18:01:39           0        0    41.361257          0   
2  2024-08-23  08:56:10           0        0    46.170839          0   
3  2024-10-21  17:17:25           0        0    12.228101          0   
4  2024-09-16  22:08:00           0        0    15.287285          0   

   is_cancelled  
0             0  
1             0  
2             0  
3             0  
4             0  


In [6]:
# 4) Train/Val/Test split by time (chronological quantiles)

def time_based_split(df: pd.DataFrame, date_col: str = "_Date_dt",
                     train_q: float = 0.7, val_q: float = 0.85) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    d = df.copy()
    d = d.sort_values(date_col)
    # If some dates are NaT, push them to the beginning to avoid leakage
    d[date_col] = d[date_col].fillna(d[date_col].min())
    cut1 = d[date_col].quantile(train_q)
    cut2 = d[date_col].quantile(val_q)
    tr = d[d[date_col] <= cut1]
    va = d[(d[date_col] > cut1) & (d[date_col] <= cut2)]
    te = d[d[date_col] > cut2]
    return tr, va, te

train_df, val_df, test_df = time_based_split(df)

for name, part in [("train", train_df), ("val", val_df), ("test", test_df)]:
    print(name, part.shape, "pos rate=", part[LABEL_COL].mean().round(4))


train (105340, 20) pos rate= 0.2509
val (22517, 20) pos rate= 0.245
test (22143, 20) pos rate= 0.251


In [7]:
# 5) Columns: numeric & categorical

NUM_COLS = [
    "Avg VTAT", "Avg CTAT", "Booking Value", "Ride Distance",
    "Driver Ratings", "Customer Rating", "fare_per_km", "is_weekend", "is_late", "same_area", "_hour"
]
CAT_COLS = [
    "Customer ID", "Vehicle Type", "Pickup Location", "Drop Location", "Payment Method"
]

# Keep only cols that actually exist
NUM_COLS = [c for c in NUM_COLS if c in train_df.columns]
CAT_COLS = [c for c in CAT_COLS if c in train_df.columns]

TARGET = LABEL_COL
FEATURE_COLS = NUM_COLS + CAT_COLS


In [8]:
# 6) Helper: reduce high-cardinality categories to Top-K + "Other"
from collections import Counter

class TopKCategorical(BaseEstimator, TransformerMixin):
    def __init__(self, top_k: int = 30):
        self.top_k = top_k
        self.keep_maps_: Dict[str, set] = {}

    def fit(self, X: pd.DataFrame, y=None):
        self.keep_maps_ = {}
        for col in X.columns:
            vc = Counter(X[col].astype(str).fillna("nan"))
            keep = set([c for c, _ in vc.most_common(self.top_k)])
            self.keep_maps_[col] = keep
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in X.columns:
            keep = self.keep_maps_.get(col, set())
            X[col] = X[col].astype(str).where(X[col].astype(str).isin(keep), other="Other")
        return X


In [9]:
# 7) Build preprocessing + model pipeline

# Numeric pipeline
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical pipeline
cat_pipe = Pipeline(steps=[
    ("topk", TopKCategorical(top_k=40)),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, NUM_COLS),
        ("cat", cat_pipe, CAT_COLS),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# Compute class imbalance to set scale_pos_weight
pos_rate = train_df[TARGET].mean()
neg_rate = 1 - pos_rate
scale_pos_weight = (neg_rate / pos_rate) if pos_rate > 0 else 1.0
print("Estimated scale_pos_weight:", round(scale_pos_weight, 3))

xgb_clf = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=2,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=SEED,
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds= 50
)

model = Pipeline(steps=[
    ("prep", preprocess),
    ("xgb", xgb_clf)
])

Estimated scale_pos_weight: 2.986


In [10]:
# 8) Train with early stopping on validation

X_tr, y_tr = train_df[FEATURE_COLS], train_df[TARGET]
X_va, y_va = val_df[FEATURE_COLS], val_df[TARGET]

# To use early stopping, we temporarily fit preprocess, transform arrays, then fit XGB directly
X_tr_enc = preprocess.fit_transform(X_tr, y_tr)
X_va_enc = preprocess.transform(X_va)

xgb_clf.fit(
    X_tr_enc, y_tr,
    eval_set=[(X_va_enc, y_va)],
    verbose=50,
)

# Wrap the fitted preprocess + fitted booster back into a single pipeline-like dict
trained = {
    "preprocess": preprocess,
    "booster": xgb_clf
}

# Quick validation metrics (uncalibrated)
va_pred = xgb_clf.predict_proba(X_va_enc)[:, 1]
print("Val AUC:", roc_auc_score(y_va, va_pred))
print("Val PR-AUC:", average_precision_score(y_va, va_pred))
print("Val Brier:", brier_score_loss(y_va, va_pred))
print("Val LogLoss:", log_loss(y_va, va_pred))


[0]	validation_0-auc:0.99944
[50]	validation_0-auc:0.99944
[51]	validation_0-auc:0.99944
Val AUC: 0.9994477763916878
Val PR-AUC: 0.997722921691302
Val Brier: 0.18518442975339489
Val LogLoss: 0.5626819036451683


In [11]:
# 9) Probability calibration (Isotonic via CalibratedClassifierCV with prefit model)
# Calibrating often improves probability quality for banding and cost-based decisions.

calibrator = CalibratedClassifierCV(
    estimator=xgb_clf, method="isotonic", cv="prefit"
)
calibrator.fit(X_va_enc, y_va)

# Store a thin wrapper to keep consistent API with preprocessing
class CalibratedModel:
    def __init__(self, preprocess, calibrator):
        self.preprocess = preprocess
        self.calibrator = calibrator
    def predict_proba(self, X):
        Xenc = self.preprocess.transform(X)
        return self.calibrator.predict_proba(Xenc)
    def predict(self, X, threshold=0.5):
        p = self.predict_proba(X)[:, 1]
        return (p >= threshold).astype(int)

calibrated_model = CalibratedModel(preprocess, calibrator)

# Check calibrated val metrics
va_pred_cal = calibrated_model.predict_proba(X_va)[:, 1]
print("Val AUC (cal):", roc_auc_score(y_va, va_pred_cal))
print("Val PR-AUC (cal):", average_precision_score(y_va, va_pred_cal))
print("Val Brier (cal):", brier_score_loss(y_va, va_pred_cal))
print("Val LogLoss (cal):", log_loss(y_va, va_pred_cal))


Val AUC (cal): 0.9994716917762212
Val PR-AUC (cal): 0.9975615423941916
Val Brier (cal): 0.002768512272185855
Val LogLoss (cal): 0.012122059277611723


In [12]:
# 10) Risk threshold (Low/High) via cost-sensitive search on validation
# Interpretations:
#  - Predict High (intervene) when p >= t_cut
#  - Utility = +COST_TP for each true cancel flagged High,  -COST_FP for each non-cancel flagged High

def utility_two_band(y_true, p, t_cut, cost_tp=COST_TP, cost_fp=COST_FP):
    y = np.asarray(y_true); p = np.asarray(p)
    H = p >= t_cut
    # High + true cancel -> reward; High + non-cancel -> penalty
    util = np.sum((y == 1) & H) * cost_tp + np.sum((y == 0) & H) * cost_fp
    return float(util)

def search_single_threshold(y_true, p, grid=None):
    if grid is None:
        grid = np.linspace(0.05, 0.95, 181)  # 0.05 to 0.95 step 0.005
    best = (-1e18, 0.5)
    for t in grid:
        u = utility_two_band(y_true, p, t)
        if u > best[0]:
            best = (u, float(t))
    return {"utility": best[0], "t_cut": best[1]}

thr = search_single_threshold(y_va, va_pred_cal)  # use calibrated probs if you kept calibration
thr



{'utility': np.float64(27260.0), 't_low': 0.1, 't_high': 0.5}

In [13]:
# 11) Final evaluation on TEST with the calibrated model and found cutoff
X_te, y_te = test_df[FEATURE_COLS], test_df[TARGET]
te_pred_cal = calibrated_model.predict_proba(X_te)[:, 1]

def summarize_two_band_perf(y_true, p, t_cut):
    H = (p >= t_cut).astype(int)     # High label
    L = (p < t_cut).astype(int)      # Low label
    auc = roc_auc_score(y_true, p)
    ap = average_precision_score(y_true, p)
    brier = brier_score_loss(y_true, p)
    lg = log_loss(y_true, p)
    return {
        "AUC": round(auc, 4), "PR_AUC": round(ap, 4),
        "Brier": round(brier, 4), "LogLoss": round(lg, 4),
        "Share_High": round(H.mean(), 4),
        "Share_Low": round(L.mean(), 4),
        "Utility": round(utility_two_band(y_true, p, t_cut), 2)
    }

test_summary = summarize_two_band_perf(y_te, te_pred_cal, thr["t_cut"])
test_summary



{'AUC': 0.9997,
 'PR_AUC': 0.9986,
 'Brier': 0.002,
 'LogLoss': 0.0091,
 'Share_High': np.float64(0.249),
 'Share_Med': np.float64(0.0),
 'Share_Low': np.float64(0.751),
 'Utility': np.float64(27564.75)}

In [14]:
# 12) Plots (ROC, PR, Calibration curve) – saved to artifacts

def plot_roc_pr(y_true, p, tag="val"):
    fpr, tpr, _ = roc_curve(y_true, p)
    prec, rec, _ = precision_recall_curve(y_true, p)

    plt.figure(figsize=(5,4))
    plt.plot(fpr, tpr, lw=2)
    plt.plot([0,1], [0,1], ls="--")
    plt.title(f"ROC ({tag})")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.tight_layout()
    plt.savefig(os.path.join(ARTIFACTS_DIR, f"roc_{tag}.png"), dpi=160)
    plt.close()

    plt.figure(figsize=(5,4))
    plt.plot(rec, prec, lw=2)
    plt.title(f"Precision-Recall ({tag})")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.tight_layout()
    plt.savefig(os.path.join(ARTIFACTS_DIR, f"pr_{tag}.png"), dpi=160)
    plt.close()

plot_roc_pr(y_va, va_pred_cal, "val")
plot_roc_pr(y_te, te_pred_cal, "test")
print("Saved ROC/PR plots to", ARTIFACTS_DIR)


Saved ROC/PR plots to artifacts


In [15]:
# 15) Save artifacts for deployment (two-band)
joblib.dump(trained["preprocess"], os.path.join(ARTIFACTS_DIR, "preprocess.joblib"))
joblib.dump(trained["booster"], os.path.join(ARTIFACTS_DIR, "xgb_model.joblib"))

with open(os.path.join(ARTIFACTS_DIR, "risk_thresholds.json"), "w", encoding="utf-8") as f:
    json.dump({"t_cut": thr["t_cut"]}, f, indent=2)

with open(os.path.join(ARTIFACTS_DIR, "model_card.json"), "w", encoding="utf-8") as f:
    json.dump({
        "feature_columns": FEATURE_COLS,
        "label_column": LABEL_COL,
        "metrics_test": test_summary
    }, f, indent=2)



Artifacts saved under: artifacts


In [16]:

# 16) Batch scoring for Excel files: read xlsx/csv → output scored file with RiskBand
# Usage: score_file("new_orders.xlsx")

def load_artifacts():
    prep = joblib.load(os.path.join(ARTIFACTS_DIR, "preprocess.joblib"))
    booster = joblib.load(os.path.join(ARTIFACTS_DIR, "xgb_model.joblib"))
    with open(os.path.join(ARTIFACTS_DIR, "risk_thresholds.json"), encoding="utf-8") as f:
        thr = json.load(f)  # expects {"t_cut": ...}
    class _Wrap:
        def __init__(self, preprocess, booster):
            self.preprocess = preprocess
            self.booster = booster
        def predict_proba(self, X):
            Xenc = self.preprocess.transform(X)
            return self.booster.predict_proba(Xenc)
    return _Wrap(prep, booster), thr

def classify_two_band(p: np.ndarray, t_cut: float) -> list:
    return ["High" if prob >= t_cut else "Low" for prob in p]

def score_dataframe(df_in: pd.DataFrame) -> pd.DataFrame:
    model_like, thr = load_artifacts()
    dfx = add_booking_time_features(df_in)
    X = dfx[FEATURE_COLS]
    p = model_like.predict_proba(X)[:, 1]
    out = df_in.copy()
    out["CancelRisk_Prob"] = p
    out["CancelRisk_Band"] = classify_two_band(p, thr["t_cut"])
    return out

def score_file(path_in: str, path_out: str = None):
    ext = os.path.splitext(path_in)[1].lower()
    if ext in [".xlsx", ".xls"]:
        df_in = pd.read_excel(path_in)
    elif ext in [".csv"]:
        df_in = pd.read_csv(path_in)
    else:
        raise ValueError("Unsupported input format; use .xlsx or .csv")

    out = score_dataframe(df_in)
    if not path_out:
        base = os.path.splitext(path_in)[0]
        path_out = base + "_scored.xlsx"
    out.to_excel(path_out, index=False)
    print("Saved:", path_out)

# Example:
# score_file("incoming_orders.xlsx")


In [17]:
# 18) Lightweight Streamlit dashboard (save as app.py and run: streamlit run app.py)
STREAMLIT_APP = r'''
import os
import json
import pandas as pd
import numpy as np
import plotly.express as px
import joblib
import streamlit as st

ARTIFACTS_DIR = "artifacts"
DATA_PATH = "/mnt/data/ncr_ride_bookings.csv"  # change as needed

st.set_page_config(page_title="Uber Ride Cancellation Risk", layout="wide")

@st.cache_resource
def load_artifacts():
    prep = joblib.load(os.path.join(ARTIFACTS_DIR, "preprocess.joblib"))
    booster = joblib.load(os.path.join(ARTIFACTS_DIR, "xgb_model.joblib"))
    with open(os.path.join(ARTIFACTS_DIR, "risk_thresholds.json")) as f:
        thr = json.load(f)
    return prep, booster, thr

def add_booking_time_features(df):
    import pandas as pd, numpy as np
    from datetime import datetime

    out = df.copy()
    out["is_cancelled"] = out["Booking Status"].astype(str).str.contains("cancel", case=False, na=False).astype(int)
    out["_Date_dt"] = pd.to_datetime(out["Date"], errors="coerce", infer_datetime_format=True)
    tparsed = pd.to_datetime(out["Time"], errors="coerce")
    out["_hour"] = tparsed.dt.hour
    out["is_weekend"] = out["_Date_dt"].dt.weekday.isin([5,6]).astype(int)
    out["is_late"] = out["_hour"].between(0,5, inclusive="both").astype(int)
    out["fare_per_km"] = np.where(out["Ride Distance"].fillna(0) > 0,
                                  out["Booking Value"] / out["Ride Distance"].replace(0, np.nan),
                                  np.nan)
    out["same_area"] = (out["Pickup Location"].astype(str).str.lower()
                        == out["Drop Location"].astype(str).str.lower()).astype(int)
    DROP = set([
        "Booking Status","Cancelled Rides by Customer","Reason for cancelling by Customer",
        "Cancelled Rides by Driver","Driver Cancellation Reason","Incomplete Rides",
        "Incomplete Rides Reason","Booking ID"
    ])
    out = out.drop(columns=[c for c in out.columns if c in DROP], errors="ignore")
    return out

NUM_COLS = ["Avg VTAT","Avg CTAT","Booking Value","Ride Distance",
            "Driver Ratings","Customer Rating","fare_per_km","is_weekend","is_late","same_area","_hour"]
CAT_COLS = ["Customer ID","Vehicle Type","Pickup Location","Drop Location","Payment Method"]
FEATURE_COLS = NUM_COLS + CAT_COLS

st.title("Uber Ride Cancellation Risk – Driver Plugin Prototype")

uploaded = st.file_uploader("Upload Excel/CSV with upcoming bookings", type=["xlsx","xls","csv"])
prep, booster, thr = load_artifacts()

def predict_proba(df_in):
    dfx = add_booking_time_features(df_in)
    X = dfx[FEATURE_COLS]
    Xenc = prep.transform(X)
    p = booster.predict_proba(Xenc)[:,1]
    return p

def band(p):
    if p >= thr["t_high"]:
        return "High"
    elif p >= thr["t_low"]:
        return "Medium"
    else:
        return "Low"

if uploaded is not None:
    ext = os.path.splitext(uploaded.name)[1].lower()
    if ext in [".xlsx",".xls"]:
        df_in = pd.read_excel(uploaded)
    else:
        df_in = pd.read_csv(uploaded)

    p = predict_proba(df_in)
    df_out = df_in.copy()
    df_out["CancelRisk_Prob"] = p
    df_out["CancelRisk_Band"] = [band(v) for v in p]

    st.subheader("Scored Orders")
    st.dataframe(df_out.head(50))

    st.download_button("Download Scored File", df_out.to_csv(index=False).encode(), file_name="scored_orders.csv")

    st.subheader("Risk Distribution")
    fig = px.histogram(df_out, x="CancelRisk_Prob", nbins=30, color="CancelRisk_Band")
    st.plotly_chart(fig, use_container_width=True)

else:
    st.info("Upload an Excel/CSV file to score cancellation risk.")
'''

with open(os.path.join(ARTIFACTS_DIR, "app.py"), "w") as f:
    f.write(STREAMLIT_APP)
print("Streamlit app saved to artifacts/app.py (run: streamlit run artifacts/app.py)")

Streamlit app saved to artifacts/app.py (run: streamlit run artifacts/app.py)


In [18]:
files = [os.path.join(ARTIFACTS_DIR, f) for f in os.listdir(ARTIFACTS_DIR)]
print(files)

['artifacts\\app.py', 'artifacts\\model_card.json', 'artifacts\\preprocess.joblib', 'artifacts\\pr_test.png', 'artifacts\\pr_val.png', 'artifacts\\risk_thresholds.json', 'artifacts\\roc_test.png', 'artifacts\\roc_val.png', 'artifacts\\xgb_model.joblib']
