1) Define the objective + guard against leakage

Decide what you can know at prediction time. If you’re quoting a price at purchase time, drop any features that peek into the future (e.g., COVID around departure if not known yet).

In [None]:
import numpy as np
import pandas as pd

# Regression target. The model is fit on log1p(price): the log helps linear
# models and stabilizes the variance of the target.
TARGET = "mean_net_ticket_price"
df = df.copy()
df["y_log"] = np.log1p(df[TARGET])

# Candidate numeric predictors (edit this list to match the data you have).
num_feats = [
    "lead_time_days",
    "Culmulative_sales",
    "route_sales_rank",
    "sales_diff_1",
    "price_to_route_month_mean",
    "route_month_mean_price",
    # COVID indicators as of the purchase date: keep ONLY if they are known at quote time.
    "covid_cases_at_purchase",
    "stringency_at_purchase",
    # OPTIONAL departure-time COVID features. They look into the future relative to the
    # purchase date, so enable them only if future information is acceptable at train time:
    # "covid_cases_at_departure","covid_7d_sum_before_departure","covid_14d_sum_before_departure","covid_30d_sum_before_departure",
]

# Candidate 0/1 flags.
bool_feats = [
    "isReturn",
    "isOneway",
    "isNormCabin",
    "is_weekend_dept",
    "is_weekend_purchase",
    # Original note: "SG earlier" (presumably the earlier Singapore-only holiday flags; confirm).
    "is_holiday_dept",
    "is_holiday_purchase",
    # Per-country holiday flags on the departure date.
    "is_sg_holiday_dept",
    "is_us_holiday_dept",
    "is_cn_holiday_dept",
    "is_fr_holiday_dept",
    "is_in_holiday_dept",
    "is_ru_holiday_dept",
    # Per-country holiday flags on the purchase date.
    "is_sg_holiday_purchase",
    "is_us_holiday_purchase",
    "is_cn_holiday_purchase",
    "is_fr_holiday_purchase",
    "is_in_holiday_purchase",
    "is_ru_holiday_purchase",
    # Interaction flags.
    "return_x_norm",
    "oneway_x_norm",
    "return_x_weekend",
    "oneway_x_weekend",
]

# Candidate categorical predictors (one-hot encoded later).
cat_feats = [
    "dept_weekday",
    "purchase_weekday",
    "dept_season",
    "Train_Number_All",
    "Customer_Cat",
]

# Silently discard any candidate that is not a column of df.
num_feats = [name for name in num_feats if name in df.columns]
bool_feats = [name for name in bool_feats if name in df.columns]
cat_feats = [name for name in cat_feats if name in df.columns]

# Final feature order: numeric, then flags, then categoricals.
X_cols = [*num_feats, *bool_feats, *cat_feats]
print("Using features:", X_cols)


Time-aware train/test split

Don’t shuffle randomly when there’s time involved.

In [1]:
# Parse the date columns; values that cannot be parsed become NaT instead of raising.
for c in ["Purchase_Date", "Dept_Date"]:
    if c in df.columns:
        df[c] = pd.to_datetime(df[c], errors="coerce")

# Rows without a usable purchase date cannot be placed on the timeline.
# sort_values() would put their NaT values last, i.e. silently into the test
# set, so drop them explicitly and report how many were removed.
has_purchase_date = df["Purchase_Date"].notna()
n_undated = int((~has_purchase_date).sum())
if n_undated:
    print(f"Dropping {n_undated} rows with missing/unparseable Purchase_Date")

# A stable sort keeps rows with identical purchase dates in their original
# relative order, so the split below is reproducible from run to run.
df_sorted = (
    df.loc[has_purchase_date]
    .sort_values("Purchase_Date", kind="mergesort")
    .reset_index(drop=True)
)

# Chronological hold-out: earliest 80% of rows train, latest 20% test.
# NOTE: the cut is by row count, so purchases sharing the boundary date can
# fall on both sides of the split.
cut = int(len(df_sorted) * 0.8)
train = df_sorted.iloc[:cut].copy()
test = df_sorted.iloc[cut:].copy()

X_train, y_train = train[X_cols], train["y_log"]
X_test, y_test = test[X_cols], test["y_log"]

# Show the date range covered by each side as a sanity check.
print(train["Purchase_Date"].min(), "→", train["Purchase_Date"].max(), "| TRAIN")
print(test["Purchase_Date"].min(),  "→", test["Purchase_Date"].max(),  "| TEST")


NameError: name 'df' is not defined

Preprocess + LASSO (with CV)

Scale numeric/boolean, one-hot categoricals, and run LassoCV.

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LassoCV, ElasticNetCV, LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.model_selection import TimeSeriesSplit  # forward-chaining CV folds

# Columns sent through the scaler: continuous features plus the 0/1 flags.
numeric = num_feats + bool_feats
categorical = cat_feats

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back
# to the old one so the cell also runs on older installs.
try:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse=False)

pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=True, with_std=True), numeric),
        ("cat", one_hot, categorical),
    ],
    remainder="drop",
)

alphas = np.logspace(-4, 1.5, 80)  # 80 candidates from 1e-4 to ~31.6

# X_train comes from the frame sorted by Purchase_Date, so plain K-fold (cv=5)
# would validate on rows that are older than part of their training fold.
# TimeSeriesSplit always trains on earlier rows and validates on later ones.
lasso = LassoCV(
    alphas=alphas,
    cv=TimeSeriesSplit(n_splits=5),
    random_state=42,
    n_jobs=-1,
    max_iter=20000,
)

pipe_lasso = Pipeline([
    ("pre", pre),
    ("model", lasso)
])

pipe_lasso.fit(X_train, y_train)

print("Best alpha (LASSO):", pipe_lasso.named_steps["model"].alpha_)
print("Non-zero coefs:", np.sum(pipe_lasso.named_steps["model"].coef_ != 0))


ModuleNotFoundError: No module named 'sklearn'

Evaluate (log space + back-transform)

Report MAE / RMSE / MAPE on the original price scale; R² is reported in log space.

In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate(pipe, X_tr, y_tr, X_te, y_te, label="Model"):
    """Score a fitted pipeline on the train and test sets and print the results.

    The pipeline predicts log1p(price). MAE, RMSE and MAPE are computed after
    mapping predictions and targets back to the price scale with expm1; R² is
    computed directly on the log-scale values.

    Parameters
    ----------
    pipe : fitted estimator exposing ``predict``.
    X_tr, X_te : feature matrices for the train and test sets.
    y_tr, y_te : log1p-transformed targets for the train and test sets.
    label : str, name printed in front of the metrics.

    Returns
    -------
    dict
        Keys ``MAE_*``, ``RMSE_*``, ``MAPE_*_%`` and ``R2_*`` with suffix
        ``tr`` (train) or ``te`` (test).
    """
    pred_tr = pipe.predict(X_tr)
    pred_te = pipe.predict(X_te)

    # Undo the log1p transform so the error metrics are in price units.
    y_tr_hat = np.expm1(pred_tr)
    y_te_hat = np.expm1(pred_te)
    y_tr_true = np.expm1(y_tr)
    y_te_true = np.expm1(y_te)

    def mape(y_true, y_pred):
        # Clip the denominator so a true price of 0 cannot cause a division by zero.
        return np.mean(np.abs((y_true - y_pred) / np.clip(y_true, 1e-8, None))) * 100

    def rmse(y_true, y_pred):
        # mean_squared_error(..., squared=False) is deprecated and removed in
        # recent scikit-learn releases; taking the root here works on every version.
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

    metrics = {
        "MAE_tr": mean_absolute_error(y_tr_true, y_tr_hat),
        "RMSE_tr": rmse(y_tr_true, y_tr_hat),
        "MAPE_tr_%": mape(y_tr_true, y_tr_hat),
        "R2_tr": r2_score(y_tr, pred_tr),  # R^2 in log space

        "MAE_te": mean_absolute_error(y_te_true, y_te_hat),
        "RMSE_te": rmse(y_te_true, y_te_hat),
        "MAPE_te_%": mape(y_te_true, y_te_hat),
        "R2_te": r2_score(y_te, pred_te),  # R^2 in log space
    }
    print(label, metrics)
    return metrics

# Score the fitted LASSO pipeline on the chronological train/test split.
m_lasso = evaluate(
    pipe_lasso,
    X_train, y_train,
    X_test, y_test,
    label="LASSO",
)
# Optional (needs a fitted `pipe_enet`, which none of the cells shown here creates):
# m_enet  = evaluate(pipe_enet,  X_train, y_train, X_test, y_test, "ElasticNet")


ModuleNotFoundError: No module named 'sklearn'

Which features did LASSO keep?

Get feature names after preprocessing and list non-zero coefficients (standardized).

In [4]:
# Names of the one-hot columns produced by the fitted encoder (empty when there
# are no categorical features).
if categorical:
    cat_names = (
        pipe_lasso.named_steps["pre"]
        .named_transformers_["cat"]
        .get_feature_names_out(categorical)
    )
else:
    cat_names = np.array([])

# Column order of the design matrix: scaled numeric/flag columns, then one-hot columns.
feat_names = np.concatenate([numeric, cat_names])

# Coefficients refer to standardized inputs; a zero means LASSO dropped the feature.
coefs = pipe_lasso.named_steps["model"].coef_
mask = coefs != 0

# Surviving features, largest absolute coefficient first.
selected = pd.DataFrame(
    {"feature": feat_names[mask], "coef_std": coefs[mask]}
).sort_values("coef_std", key=lambda col: col.abs(), ascending=False)

print(f"LASSO kept {mask.sum()} / {len(feat_names)} features")
selected.head(30)


NameError: name 'categorical' is not defined

Human-readable formula

Coefficients above are for standardized inputs. To get a simple linear formula in original units, refit a plain LinearRegression on only the selected columns using the same one-hot, but without scaling numeric features.

In [5]:
# Build a 'no-scale' preprocessor (numeric columns passed through unchanged,
# categoricals one-hot encoded), then restrict to the columns LASSO selected.

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; fall back to the old spelling on older installs.
try:
    one_hot_noscale = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot_noscale = OneHotEncoder(handle_unknown="ignore", sparse=False)

# `numeric` is already num_feats + bool_feats, so it must not be extended with
# bool_feats again: that would feed every flag into the design matrix twice and
# give the regression duplicated, perfectly collinear columns.
pre_noscale = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric),
        ("cat", one_hot_noscale, categorical),
    ],
    remainder="drop"
)

# Fit on TRAIN to get transformed matrix & names
Xtr_n = pre_noscale.fit_transform(X_train)
cat_names_n = pre_noscale.named_transformers_["cat"].get_feature_names_out(categorical) if categorical else np.array([])
feat_names_n = np.concatenate([numeric, cat_names_n])

# Keep only the features LASSO selected (intersection by name)
keep_set = set(selected["feature"])
keep_idx = [i for i, n in enumerate(feat_names_n) if n in keep_set]
if not keep_idx:
    raise ValueError("LASSO selected no features; there is nothing to refit.")

Xtr_sel = Xtr_n[:, keep_idx]
Xte_sel = pre_noscale.transform(X_test)[:, keep_idx]
feat_sel_names = [feat_names_n[i] for i in keep_idx]

# Plain least squares on unscaled inputs, so coefficients are per original unit
# of each feature (still on the log1p-price scale of y_train).
ols = LinearRegression()
ols.fit(Xtr_sel, y_train)

coef = pd.DataFrame({"feature": feat_sel_names, "coef": ols.coef_}).sort_values("coef", key=lambda s: np.abs(s), ascending=False)
intercept = ols.intercept_

print("Linear model (log-price) with selected features")
print("Intercept:", intercept)
coef.head(30)


NameError: name 'ColumnTransformer' is not defined

Your formula in log space is:
log(1 + price) = Intercept + Σ coef_i * feature_i
To predict price: price_hat = exp(Intercept + Σ coef_i * feature_i) - 1.

(If you want statistical p-values / confidence intervals, swap LinearRegression with statsmodels.api.OLS on Xtr_sel and y_train.)

Which ones to keep? (beyond LASSO)

Permutation importance (model-agnostic): keeps features that truly affect predictions on the holdout.

Stability selection (bootstrap): keeps features repeatedly chosen across resamples.

In [6]:
from sklearn.inspection import permutation_importance

# Shuffle one input column of X_test at a time and measure how much the
# pipeline's score on the hold-out drops.
perm = permutation_importance(pipe_lasso, X_test, y_test, n_repeats=40, random_state=42, n_jobs=-1)

# The pipeline is permuted on its raw input columns, so there is one importance
# per column of X_test. `feat_names` holds the one-hot-expanded names and has a
# different length, so labelling with it makes the DataFrame constructor fail.
perm_imp = pd.DataFrame({
    "feature": list(X_test.columns),
    "import_mean": perm.importances_mean,
    "import_std": perm.importances_std
}).sort_values("import_mean", ascending=False)

perm_imp.head(20)


ModuleNotFoundError: No module named 'sklearn'