In [1]:
import pandas as pd

from scripts.feature_engineering import get_feature_transformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer, r2_score

In [2]:
INPUT_DATA = 'data/data_processed.csv'

In [3]:
df = pd.read_csv(INPUT_DATA)

In [4]:
target = "Ewltp (g/km)"
features = ["m (kg)", "Ft", "ec (cm3)", "ep (KW)", "age_months"] # "Country", "Mk", 

X = df[features]
y = df[target]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
models = [
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge(alpha=1.0)),
    ("RandomForest", RandomForestRegressor(n_estimators=100, max_depth=None)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3))
]

In [7]:
feature_transformer = get_feature_transformer()

scoring = {
    "neg_mse": make_scorer(mean_squared_error, greater_is_better=False),
    "r2": "r2",
}

In [8]:
for name, model in models:
    pipeline = Pipeline([
        ('features', feature_transformer),
        ('model', model)
    ])
    cv_results = cross_validate(
        pipeline, X_train, y_train,
        cv=5,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1
    )
    
    print(f"{name}:")
    print(f"  Train MSE: {cv_results['train_neg_mse'].mean():.4f}")
    print(f"  Train R2: {cv_results['train_r2'].mean():.4f}")
    print(f"  Val MSE: {cv_results['test_neg_mse'].mean():.4f}")
    print(f"  Val R2: {cv_results['test_r2'].mean():.4f}")

LinearRegression:
  Train MSE: -232.4599
  Train R2: 0.8646
  Val MSE: -232.4629
  Val R2: 0.8646
Ridge:
  Train MSE: -232.4599
  Train R2: 0.8646
  Val MSE: -232.4629
  Val R2: 0.8646
RandomForest:
  Train MSE: -12.4578
  Train R2: 0.9927
  Val MSE: -15.3424
  Val R2: 0.9911
GradientBoosting:
  Train MSE: -118.2623
  Train R2: 0.9311
  Val MSE: -118.3361
  Val R2: 0.9311
