In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV,RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge,Lasso,ElasticNetCV,BayesianRidge
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer, PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.compose import TransformedTargetRegressor
import xgboost as xgb

In [None]:
# Read the labelled training table and the unlabelled test table from the
# current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Keep only training rows whose target is strictly positive. Rows with a NaN
# target are removed as well, because the comparison evaluates to False for NaN.
train = train.loc[train['Transport_Cost'].gt(0)].copy()

# Capture the test identifiers before the column is dropped; they are needed
# again when the submission file is assembled.
hospital_id_test = test["Hospital_Id"]

# These columns are removed from both frames and never reach the model.
drop_cols = ["Hospital_Id", "Supplier_Name", "Hospital_Location"]
train, test = (frame.drop(columns=drop_cols) for frame in (train, test))

In [None]:
# Column the models are trained to predict; rows lacking a value are discarded.
target = "Transport_Cost"
train = train[train[target].notna()]

In [None]:
# Column groups; each group is routed to its own preprocessing branch.

# Numeric columns sent through the imputing numeric branch.
numeric_features = [
    'Equipment_Height',
    'Equipment_Weight',
    'Supplier_Reliability',
]

# Categorical columns sent through the imputing categorical branch.
categorical_unknown = [
    'Equipment_Type',
    'Transport_Method',
    'Rural_Hospital',
]

# Numeric columns that are only scaled (the name implies no missing values).
no_missing_numerical = [
    'Equipment_Value',
    'Base_Transport_Fee',
]

# Categorical columns that are only encoded (the name implies no missing values).
no_missing_categorical = [
    'Fragile_Equipment',
    'Hospital_Info',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
]

# Raw date columns, in the order (order date, delivery date).
d_features = [
    'Order_Placed_Date',
    'Delivery_Date',
]

In [None]:
def compute_date_features(dates):
    """Derive numeric features from an (order date, delivery date) pair.

    Parameters
    ----------
    dates : pandas.DataFrame
        Two columns, positionally (order date, delivery date), holding
        strings in ``%m/%d/%y`` format. Values that cannot be parsed are
        treated as missing.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``dates`` with the columns ``delivery_days``,
        ``order_dow_sin``, ``order_dow_cos``, ``order_month_sin``,
        ``order_month_cos``, ``order_is_weekend`` and
        ``delivery_is_weekend``.

    Notes
    -----
    A missing date yields NaN for ``delivery_days`` and for the cyclic
    order-date encodings, so a downstream imputer can fill them. Previously
    a missing order date was encoded exactly like a Monday / a December
    order, making it indistinguishable from real data. The weekend flags
    are 0.0 when the corresponding date is missing.
    """
    parsed = dates.apply(pd.to_datetime, format="%m/%d/%y", errors="coerce")
    order = parsed.iloc[:, 0]
    delivery = parsed.iloc[:, 1]

    def cycle(values, period):
        # Project a periodic quantity onto the unit circle; NaN propagates.
        angle = 2 * np.pi * values.astype(float) / period
        return np.sin(angle), np.cos(angle)

    order_dow = order.dt.dayofweek  # Monday=0 ... Sunday=6, NaN when missing
    o_dow_sin, o_dow_cos = cycle(order_dow, 7)
    o_month_sin, o_month_cos = cycle(order.dt.month, 12)

    features = pd.DataFrame({
        "delivery_days": (delivery - order).dt.days,
        "order_dow_sin": o_dow_sin,
        "order_dow_cos": o_dow_cos,
        "order_month_sin": o_month_sin,
        "order_month_cos": o_month_cos,
        # isin() is False for NaN, so missing dates count as "not weekend".
        "order_is_weekend": order_dow.isin([5, 6]).astype(float),
        "delivery_is_weekend": delivery.dt.dayofweek.isin([5, 6]).astype(float),
    })
    features.index = dates.index
    return features

In [None]:
class EquipmentFeatureAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends engineered shipping features.

    ``transform`` returns a copy of its input with the columns listed in
    ``new_cols`` added:

    * ``Value_Per_Kg``      = Equipment_Value    / (Equipment_Weight + 1)
    * ``Base_Cost_Per_Kg``  = Base_Transport_Fee / (Equipment_Weight + 1)
    * ``CrossBorder_Urgent``, ``Fragile_Urgent``, ``Rural_CrossBorder``:
      products of the two named binary flags
    * ``Complex_Shipping``: sum of the cross-border, urgent, fragile and
      installation flags

    Every new column is always present in the output. A ratio feature is
    NaN where it cannot be computed (source column absent, unparseable
    value, or a zero denominator). A flag column that is absent counts as 0.

    The three numeric source columns are coerced to numbers in the output.
    The flag columns themselves are passed through unchanged, so later
    encoders still see their original values.
    """

    # Lower-cased text spellings understood as binary flag values.
    _FLAG_TEXT = {
        'yes': 1.0, 'y': 1.0, 'true': 1.0,
        'no': 0.0, 'n': 0.0, 'false': 0.0,
    }

    def __init__(self):
        # Names of every column this transformer creates.
        self.new_cols = [
            'Value_Per_Kg', 'Base_Cost_Per_Kg', 'CrossBorder_Urgent',
            'Fragile_Urgent', 'Rural_CrossBorder', 'Complex_Shipping'
        ]

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def _flag(self, df, col):
        """Return ``col`` as a float Series of flag values, or 0.0 if absent.

        Numeric values are kept as they are. For non-numeric columns the
        spellings in ``_FLAG_TEXT`` (case-insensitive) are mapped to 1/0.
        Anything still unrecognised, including missing values, counts as 0.
        """
        if col not in df.columns:
            return 0.0
        raw = df[col]
        values = pd.to_numeric(raw, errors='coerce')
        if not pd.api.types.is_numeric_dtype(raw):
            text = raw.astype(str).str.strip().str.lower()
            values = values.where(values.notna(), text.map(self._FLAG_TEXT))
        return values.fillna(0).astype(float)

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        Emits a ``RuntimeWarning`` (instead of failing silently) when a
        numeric source column needed for a ratio feature is absent.
        """
        import warnings  # function-scope import: only needed on this path

        df = X.copy()

        # Create every output column up front so they exist on all paths.
        for col in self.new_cols:
            df[col] = np.nan

        numeric_cols = ['Equipment_Value', 'Equipment_Weight', 'Base_Transport_Fee']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # --- Ratio features -------------------------------------------------
        if 'Equipment_Weight' in df.columns:
            # +1 keeps the ratio defined for zero-weight rows.
            per_kg = df['Equipment_Weight'] + 1
            ratio_sources = {
                'Value_Per_Kg': 'Equipment_Value',
                'Base_Cost_Per_Kg': 'Base_Transport_Fee',
            }
            for new_col, source in ratio_sources.items():
                if source in df.columns:
                    # A weight of -1 would give +/-inf, which downstream
                    # imputers reject; treat it as missing instead.
                    df[new_col] = (df[source] / per_kg).replace(
                        [np.inf, -np.inf], np.nan
                    )

        absent = [col for col in numeric_cols if col not in df.columns]
        if absent:
            warnings.warn(
                f"EquipmentFeatureAdder: missing column(s) {absent}; "
                "dependent ratio features are left as NaN",
                RuntimeWarning,
            )

        # --- Flag interactions ----------------------------------------------
        # Computed on local copies so the input flag columns are not clobbered.
        cross_border = self._flag(df, 'CrossBorder_Shipping')
        urgent = self._flag(df, 'Urgent_Shipping')
        fragile = self._flag(df, 'Fragile_Equipment')
        rural = self._flag(df, 'Rural_Hospital')
        installation = self._flag(df, 'Installation_Service')

        df['CrossBorder_Urgent'] = cross_border * urgent
        df['Fragile_Urgent'] = fragile * urgent
        df['Rural_CrossBorder'] = rural * cross_border
        df['Complex_Shipping'] = cross_border + urgent + fragile + installation

        return df

In [None]:
# Numeric branch: fill gaps with the column median, then apply robust scaling.
t_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
)

In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (first level dropped; unseen levels are ignored at transform time).
t_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ]
)

In [None]:
# Numeric branch without imputation: robust scaling only.
t_no_missing_numerical = Pipeline(
    steps=[
        ('scaler', RobustScaler()),
    ]
)

In [None]:
# Categorical branch without imputation: one-hot encoding only
# (first level dropped; unseen levels are ignored at transform time).
t_no_missing_categorical = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ]
)

In [None]:
# Date branch: expand the two raw date columns into numeric features, fill
# gaps with the median, then scale without centering.
t_d_features = Pipeline(
    steps=[
        ('date_feat', FunctionTransformer(compute_date_features, validate=False)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler(with_centering=False)),
    ]
)

In [None]:

# Engineered columns that the numeric branch should pick up.
new_numeric_features = [
    'Value_Per_Kg',
    'Base_Cost_Per_Kg',
    'CrossBorder_Urgent',
    'Fragile_Urgent',
    'Rural_CrossBorder',
    'Complex_Shipping',
]

# Route each column group to its branch; columns not listed are discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', t_numeric, numeric_features),
        ('cat_unknown', t_categorical, categorical_unknown),
        ('date', t_d_features, d_features),
        ('num_no_missing', t_no_missing_numerical, no_missing_numerical),
        ('cat_no_missing', t_no_missing_categorical, no_missing_categorical),
        ('new_num', t_numeric, new_numeric_features),
    ],
    remainder='drop',
)

In [None]:
# Feature matrix: every column except the target.
X = train.drop(target, axis=1)

# Target vector; infinite or missing values are replaced with 0.
y = train[target].replace({np.inf: np.nan, -np.inf: np.nan}).fillna(0)

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate regressors. Each entry holds the estimator ("model") and the
# hyperparameter grid to search ("params"); an empty grid means the estimator
# is fitted as configured, with no outer grid search.
models = {
    "ElasticNet": {
        # ElasticNetCV tunes alpha and l1_ratio internally, hence no outer grid.
        # l1_ratio must lie in [0, 1]: the candidates are 20 evenly spaced
        # values from 0.05 to 0.99. (The previous list [0.05, 0.99, 20]
        # passed 20 itself as a mixing ratio, which is out of range.)
        "model": ElasticNetCV(
            max_iter=30000,
            random_state=42,
            l1_ratio=np.linspace(0.05, 0.99, 20),
            alphas=np.logspace(-5, 1, 100),
        ),
        "params": {}
    },
    "RandomForest": {
        "model": RandomForestRegressor(random_state=42, n_jobs=-1),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [10, 15, None],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "max_features": ["sqrt", "log2"]
        }
    },
    "XGBoost": {
        "model": xgb.XGBRegressor(random_state=42, n_jobs=-1, eval_metric='rmse'),
        "params": {
            "n_estimators": [200, 300],
            "learning_rate": [0.03, 0.05],
            "max_depth": [5, 6],
            "min_child_weight": [2, 3],
            # Single-valued entries are fixed settings rather than searched ones.
            "subsample": [0.8],
            "colsample_bytree": [0.8],
            "reg_alpha": [0.5],
            "reg_lambda": [2.0],
            "gamma": [0.1]
        }
    },
    "AdaBoost": {
        "model": AdaBoostRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.05, 0.1, 1.0],
        }
    },
    "Ridge": {
        "model": Ridge(random_state=42),
        "params": {
            "alpha": np.logspace(-4, 2, 20)
        }
    },
    "Lasso": {
        "model": Lasso(max_iter=20000, random_state=42),
        "params": {
            "alpha": np.logspace(-4, 1, 30)
        }
    },
    "BayesianRidge": {
        "model": BayesianRidge(),
        "params": {
            "alpha_1": [1e-6, 1e-5, 1e-4],
            "alpha_2": [1e-6, 1e-5, 1e-4],
            "lambda_1": [1e-6, 1e-5, 1e-4],
            "lambda_2": [1e-6, 1e-5, 1e-4]
        }
    }
}

In [None]:
# Fit every candidate on the training split (grid-searching those that declare
# a grid), score it on the validation split, and collect one row per model.
results = []
for name, config in models.items():
    print(f"\n Training {name}...")

    # Full pipeline: feature engineering -> preprocessing -> regressor trained
    # on a Yeo-Johnson-transformed target.
    candidate = Pipeline(steps=[
        ("feature_adder", EquipmentFeatureAdder()),
        ("preprocessor", preprocessor),
        ("regressor", TransformedTargetRegressor(
            regressor=config["model"],
            transformer=PowerTransformer(method='yeo-johnson'),
        )),
    ])

    # Grid keys are prefixed so they address the estimator nested inside the
    # TransformedTargetRegressor step.
    search_space = {
        f"regressor__regressor__{key}": values
        for key, values in config["params"].items()
    }

    if not search_space:
        best_model = candidate.fit(X_train, y_train)
    else:
        grid = GridSearchCV(
            candidate,
            search_space,
            scoring="r2",
            cv=3,
            n_jobs=-1,
            verbose=1,
        )
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        print(f"Best Params for {name}: {grid.best_params_}")

    val_pred = best_model.predict(X_val)
    r2 = r2_score(y_val, val_pred)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    results.append({"Model": name, "R2": r2, "RMSE": rmse})
    print(f"{name:15s} | R² = {r2:.4f} | RMSE = {rmse:.2f}")

# Leaderboard, best validation R² first.
results_df = pd.DataFrame(results).sort_values(by="R2", ascending=False)
print("\n Model Comparison:")
print(results_df)



 Training ElasticNet...
ElasticNet      | R² = 0.2939 | RMSE = 39576.50

 Training KernelRidge...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Params for KernelRidge: {'regressor__regressor__alpha': 1.0, 'regressor__regressor__gamma': 0.01, 'regressor__regressor__kernel': 'rbf'}
KernelRidge     | R² = 0.0347 | RMSE = 46274.15

 Training SVR...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Params for SVR: {'regressor__regressor__C': 1, 'regressor__regressor__epsilon': 0.3, 'regressor__regressor__gamma': 'scale'}
SVR             | R² = 0.3892 | RMSE = 36809.55

 Training RandomForest...
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Params for RandomForest: {'regressor__regressor__max_depth': None, 'regressor__regressor__max_features': 'log2', 'regressor__regressor__min_samples_leaf': 2, 'regressor__regressor__min_samples_split': 2, 'regressor__regressor__n_estimators': 100}
RandomForest    | R² = 0.2912 | RMSE = 39651.70

 Tr

In [None]:
# results_df is sorted by validation R² in descending order, so the best model
# is the first row. (Position 3 selected the fourth-ranked model and would
# raise IndexError with fewer than four candidates.)
best_model_name = results_df.iloc[0]["Model"]
print(f"\n Best model: {best_model_name}")

# NOTE(review): this is the estimator as configured in `models`; any
# hyperparameters chosen by the grid search are not applied to it. Confirm
# whether the tuned settings should be carried over.
best_base_model = models[best_model_name]["model"]


 Best model: ElasticNet


In [None]:
# Rebuild the full pipeline around the selected estimator: feature
# engineering -> preprocessing -> regressor on a Yeo-Johnson-transformed target.
final_pipeline = Pipeline(
    steps=[
        ("feature_adder", EquipmentFeatureAdder()),
        ("preprocessor", preprocessor),
        (
            "regressor",
            TransformedTargetRegressor(
                regressor=best_base_model,
                transformer=PowerTransformer(method='yeo-johnson'),
            ),
        ),
    ]
)

# Refit on every labelled row (training and validation splits together).
final_pipeline.fit(X, y)

In [None]:
# Predict on the test set and pair each prediction with its hospital id.
y_pred_test = final_pipeline.predict(test)
submission = pd.DataFrame({
    "Hospital_Id": hospital_id_test,
    "Transport_Cost": y_pred_test
})

# One variable drives both the write and the message, so the log always names
# the file that was actually written (it previously named a different file).
submission_path = "submission_ridge.csv"
submission.to_csv(submission_path, index=False)
print(f"\n {submission_path} created using {best_model_name}")


 submission_best_model.csv created using ElasticNet
