# Baseline Model: Linear Regression

# 1 - Information

In [None]:
# Author: Pierre Oreistein

# 2 - Packages

In [None]:
# Reload the Kedro project so the notebook-level Kedro variables are refreshed.
# NOTE(review): presumably this is what provides the `catalog` object used in
# the cells below — confirm against the Kedro version in use.
%reload_kedro

In [None]:
# Math Packages
import numpy as np

# Data Handling Packages
import pandas as pd

# Machine Learning Packages
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import RegressorMixin

# Logger
import logging

# Typing
from typing import Dict, Union, List

# Prevent unnecessary warnings
from warnings import filterwarnings
filterwarnings("ignore", ".*`should_run_async`.*")

# 3 - Baseline Model: Linear Regression

In [None]:
def create_baseline_model() -> RegressorMixin:
    """Build the untrained baseline regressor.

    Returns:
        A new ``LinearRegression`` instance constructed with its default
        arguments. The model has not been fitted yet.
    """
    return LinearRegression()

In [None]:
def train_baseline_model(model: RegressorMixin, train_df: pd.DataFrame) -> RegressorMixin:
    """Fit the baseline model on a frame holding both features and target.

    The ``RUL`` column is used as the regression target and every other
    column as a feature. The input frame is left untouched: features are
    taken from a new frame without ``RUL`` instead of popping the column in
    place, so the caller can keep using ``train_df`` (including its target
    column) after this call.

    Args:
        model: Estimator exposing ``fit(X, y)``.
        train_df: Training data containing a ``RUL`` column.

    Returns:
        The same ``model`` object, after ``fit`` has been called on it.

    Raises:
        KeyError: If ``train_df`` has no ``RUL`` column.
    """
    # Select the target first so a missing column fails before any copying
    y_train_df = train_df['RUL']

    # drop() returns a new frame, leaving the caller's DataFrame intact
    X_train_df = train_df.drop(columns='RUL')

    # Train the baseline model
    model.fit(X_train_df, y_train_df)

    return model

In [None]:
def evaluate(
    y_true: Union[np.ndarray, pd.DataFrame],
    y_hat: Union[np.ndarray, pd.DataFrame],
    label: str='test'
) -> Dict[str, Union[float, List[float]]]:
    """Compute regression metrics for a set of predictions and log them.

    Args:
        y_true: Ground-truth target values.
        y_hat: Predicted target values, comparable element-wise to ``y_true``.
        label: Name of the evaluated split; only used as a prefix in the
            log message.

    Returns:
        A dictionary with the keys ``"mse"``, ``"rmse"`` and ``"r2_score"``.
    """
    # Compute the different metrics
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    r2_metric = r2_score(y_true, y_hat)

    # Save the metrics
    metrics_dct = {
        "mse": mse,
        "rmse": rmse,
        "r2_score": r2_metric
    }

    # Log through the named logger that is fetched here; calling
    # ``logging.info`` would send the record to the root logger instead and
    # leave this logger unused.
    log = logging.getLogger("kedro.custom")
    log.info(f"{label} set RMSE:{rmse}, R2:{r2_metric}")

    return metrics_dct

In [None]:
# Fetch the preprocessed training frame from the data catalog
train_df = catalog.load("train_preprocessed_df")

# Build a fresh baseline model and fit it on the training frame
model = train_baseline_model(model=create_baseline_model(), train_df=train_df)

In [None]:
# Fetch the test features and the matching ground-truth targets
X_test_df = catalog.load("X_test_preprocessed_df")
y_test_true = catalog.load("y_test_raw")

# Predict on the test features, then reshape the output into a single column
y_test_pred = model.predict(X_test_df)
y_test_pred = y_test_pred.reshape(-1, 1)

# Score the predictions against the ground truth
metrics = evaluate(y_true=y_test_true, y_hat=y_test_pred, label="test")