# Initial Model Building
This file is responsible for Train-test split. Training basic models to get a baseline performance. Evaluating models using chosen metrics (e.g., accuracy, F1-score, RMSE).
Our evaluation is based on MAE.

In [1]:
import pandas as pd
from src.data.data_fetcher import get_all_features, get_raw_data
from src.features.feature_engineering import prepare_data

In [2]:
train_a, train_b, train_c, X_train_estimated_a, X_train_estimated_b, X_train_estimated_c, X_train_observed_a, X_train_observed_b, X_train_observed_c, X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = get_raw_data()

In [3]:
targets = train_a
features_observed = X_train_observed_a
features_estimated = X_train_estimated_a
features_test = X_test_estimated_a
targets.head(), features_observed.head(), features_estimated.head(), features_test.head()

(                 time  pv_measurement
 0 2019-06-02 22:00:00            0.00
 1 2019-06-02 23:00:00            0.00
 2 2019-06-03 00:00:00            0.00
 3 2019-06-03 01:00:00            0.00
 4 2019-06-03 02:00:00           19.36,
         date_forecast  absolute_humidity_2m:gm3  air_density_2m:kgm3  \
 0 2019-06-02 22:00:00                       7.7                1.230   
 1 2019-06-02 22:15:00                       7.7                1.229   
 2 2019-06-02 22:30:00                       7.7                1.228   
 3 2019-06-02 22:45:00                       7.7                1.226   
 4 2019-06-02 23:00:00                       7.7                1.225   
 
    ceiling_height_agl:m  clear_sky_energy_1h:J  clear_sky_rad:W  \
 0           1744.900024                    0.0              0.0   
 1           1734.000000                    0.0              0.0   
 2           1723.500000                    0.0              0.0   
 3           1713.400024                    0.0      

Preprocessing steps:

In [4]:

# Call the function with the file paths
X_train, y_train, X_val, y_val, X_test, train_data, val_data = prepare_data(targets, 
                                                                            features_observed, 
                                                                            features_estimated, 
                                                                            features_test)

# Check the prepared DataFrames
X_train.head(), y_train.head(), X_val.head(), y_val.head(), X_test.head(), train_data.head(), val_data.head()


(                     absolute_humidity_2m:gm3  air_density_2m:kgm3  \
 2019-12-01 09:00:00                     4.800              1.27200   
 2019-12-01 10:00:00                     4.825              1.27300   
 2019-12-01 11:00:00                     4.800              1.27350   
 2019-12-01 12:00:00                     4.750              1.27525   
 2019-12-01 13:00:00                     4.650              1.27700   
 
                      ceiling_height_agl:m  clear_sky_energy_1h:J  \
 2019-12-01 09:00:00            456.250000           27331.500000   
 2019-12-01 10:00:00            581.299988           98679.046875   
 2019-12-01 11:00:00            704.500000          142199.703125   
 2019-12-01 12:00:00            922.450012          125991.281250   
 2019-12-01 13:00:00           1301.649902           61200.398438   
 
                      clear_sky_rad:W  cloud_base_agl:m  dew_or_rime:idx  \
 2019-12-01 09:00:00        17.549999        456.250000              0.0   
 201

### Linear regression model

In [7]:
%pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error


# Train a linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
X_val.head(), y_val.head()

Collecting scikit-learn
  Using cached scikit_learn-1.3.1-cp39-cp39-win_amd64.whl (9.3 MB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
Collecting scipy>=1.5.0
  Using cached scipy-1.11.2-cp39-cp39-win_amd64.whl (44.1 MB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.1 scipy-1.11.2 threadpoolctl-3.2.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Bruker\OneDrive\NTNU semester 05\TDT4173 Maskinlæring\ml_power_predictor\venv\Scripts\python.exe -m pip install --upgrade pip' command.


Linear regression predictions

In [8]:
# Predict on the validation set
y_val_pred = lr_model.predict(X_val)

# Evaluate the model using MAE
mae = mean_absolute_error(y_val, y_val_pred)
print('MAE:', mae)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- date_calc


# Fine-tuning the model

In [None]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

# Create polynomial features
poly = PolynomialFeatures(degree=2)

# Scale features
scaler = StandardScaler()

# Initialize Ridge Regression model with regularization strength alpha
ridge_model = Ridge(alpha=1.0)

# Create a pipeline with polynomial feature creation, scaling, and Ridge Regression
model = make_pipeline(poly, scaler, ridge_model)

# Train the model with training data
model.fit(X_train, y_train)

# Make predictions on validation data
y_val_pred = model.predict(X_val)

# Calculate Mean Absolute Error on validation data
mae_val = mean_absolute_error(y_val, y_val_pred)
