# Load Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error,mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Paths to datasets


In [2]:
paths = {
    "Raw": "../../data/processed/training_numeric_cleaned.csv",
    "IQR": "../../data/processed/training_outlier_removed.csv",
    "Winsorized": "../../data/processed/training_winsorized.csv"
}

# Models


In [3]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# Run Methods

In [4]:
for version, path in paths.items():
    print(f"\n📁 Dataset version: {version}")
    
    # Wczytanie danych
    df = pd.read_csv(path,sep="\t")
    target = "ActualTOW"
    X = df.drop(columns=target)
    y = df[target]

    # Podział na trening/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Identyfikacja cech
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()

    # Transformery
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    for model_name, model in models.items():
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        rmse = root_mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        print(f"{model_name} RMSE: {rmse:.2f} | MAE: {mae:.2f}")
        print(f"{model_name} RMSE: {rmse:.2f}")
    print(f"🔍 RMSE: {rmse:.2f} | MAE: {mae:.2f}")


📁 Dataset version: Raw
Linear Regression RMSE: 754.83 | MAE: 514.25
Linear Regression RMSE: 754.83
Random Forest RMSE: 825.93 | MAE: 601.18
Random Forest RMSE: 825.93
XGBoost RMSE: 770.70 | MAE: 562.27
XGBoost RMSE: 770.70
🔍 RMSE: 770.70 | MAE: 562.27

📁 Dataset version: IQR
Linear Regression RMSE: 709.64 | MAE: 496.39
Linear Regression RMSE: 709.64
Random Forest RMSE: 808.87 | MAE: 590.43
Random Forest RMSE: 808.87
XGBoost RMSE: 749.78 | MAE: 551.08
XGBoost RMSE: 749.78
🔍 RMSE: 749.78 | MAE: 551.08

📁 Dataset version: Winsorized
Linear Regression RMSE: 713.34 | MAE: 497.75
Linear Regression RMSE: 713.34
Random Forest RMSE: 814.14 | MAE: 589.98
Random Forest RMSE: 814.14
XGBoost RMSE: 763.65 | MAE: 552.81
XGBoost RMSE: 763.65
🔍 RMSE: 763.65 | MAE: 552.81
