# Research Question 2: How does the inclusion of additional engineered features affect the predictive performance of regression models for defect rates in synthetic manufacturing data? 

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

#### Loading Data

In [13]:
# Load the training dataset (original and engineered features)
X_train_original = pd.read_csv('Train_Test_Data/X_train_original.csv')
y_train_original = pd.read_csv('Train_Test_Data/y_train_original.csv')

X_train_engineered = pd.read_csv('Train_Test_Data/X_train_engineered.csv')
y_train_engineered = pd.read_csv('Train_Test_Data/y_train_engineered.csv')

#### Model Definition, Training and Evaluation

In [9]:
# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
    'LightGBM': lgb.LGBMRegressor(random_state=42)
}

In [14]:
# Function to train and evaluate models using cross-validation on the training set only
def train_and_evaluate(models, X_train, y_train, cv=5):
    results = {}
    for model_name, model in models.items():
        # Train the model using cross-validation
        print(f"Training {model_name} with cross-validation...")
        mae_scores = -cross_val_score(model, X_train, y_train.values.ravel(), cv=cv, scoring='neg_mean_absolute_error')
        rmse_scores = -cross_val_score(model, X_train, y_train.values.ravel(), cv=cv, scoring='neg_root_mean_squared_error')
        r2_scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=cv, scoring='r2')
        
        # Calculate average metrics
        mae = mae_scores.mean()
        rmse = rmse_scores.mean()
        r2 = r2_scores.mean()
        
        # Store results
        results[model_name] = {
            'MAE': mae,
            'RMSE': rmse,
            'R²': r2
        }
    return results

In [17]:
# Train and evaluate models on original features
print("Evaluating models on original features...")
results_original = train_and_evaluate(models, X_train_original, y_train_original)

# Train and evaluate models on engineered features
print("Evaluating models on engineered features...")
results_engineered = train_and_evaluate(models, X_train_engineered, y_train_engineered)

Evaluating models on original features...
Training Linear Regression with cross-validation...
Training Random Forest with cross-validation...
Training Support Vector Regressor with cross-validation...
Training LightGBM with cross-validation...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3122
[LightGBM] [Info] Number of data points in the train set: 2073, number of used features: 16
[LightGBM] [Info] Start training from score 2.722736
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3121
[LightGBM] [Info] Number of data points in the train set: 2073, number of used features: 16
[LightGBM] [Info] Start training from score 2.723380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of 

#### Results

In [18]:
# Display results
print("\nResults on Original Features:")
for model_name, metrics in results_original.items():
    print(f"{model_name}: MAE={metrics['MAE']:.4f}, RMSE={metrics['RMSE']:.4f}, R²={metrics['R²']:.4f}")

print("\nResults on Engineered Features:")
for model_name, metrics in results_engineered.items():
    print(f"{model_name}: MAE={metrics['MAE']:.4f}, RMSE={metrics['RMSE']:.4f}, R²={metrics['R²']:.4f}")


Results on Original Features:
Linear Regression: MAE=1.1401, RMSE=1.3147, R²=-0.0070
Random Forest: MAE=1.1521, RMSE=1.3353, R²=-0.0388
Support Vector Regressor: MAE=1.1399, RMSE=1.3132, R²=-0.0048
LightGBM: MAE=1.1751, RMSE=1.3783, R²=-0.1070

Results on Engineered Features:
Linear Regression: MAE=1.1439, RMSE=1.3205, R²=-0.0159
Random Forest: MAE=1.1550, RMSE=1.3375, R²=-0.0422
Support Vector Regressor: MAE=1.1420, RMSE=1.3163, R²=-0.0097
LightGBM: MAE=1.1821, RMSE=1.3814, R²=-0.1119
