# Initial Model Building
This notebook performs the train/validation split, trains basic models to establish a baseline performance, and evaluates those models using a chosen metric (e.g., accuracy, F1-score, RMSE).
Our evaluation is based on the mean absolute error (MAE).

In [10]:
import pandas as pd
from src.data.data_fetcher import get_all_features, get_raw_data
from src.features.feature_engineering import prepare_data

%pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline


Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Bruker\OneDrive\NTNU semester 05\TDT4173 Maskinlæring\ml_power_predictor\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [11]:
# Load every raw table with a single call. The unpacking order below must
# match the order in which get_raw_data() returns its twelve objects:
# targets, then estimated/observed training features, then test features,
# each for locations a, b and c.
(
    train_a, train_b, train_c,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
) = get_raw_data()

In [12]:
# Work with location "a" only: bind its targets and its observed, estimated
# and test feature tables to the generic names used by the rest of the notebook.
targets, features_observed, features_estimated, features_test = (
    train_a,
    X_train_observed_a,
    X_train_estimated_a,
    X_test_estimated_a,
)

# Preview the first rows of each table (displayed as a tuple).
tuple(
    frame.head()
    for frame in (targets, features_observed, features_estimated, features_test)
)

(                 time  pv_measurement
 0 2019-06-02 22:00:00            0.00
 1 2019-06-02 23:00:00            0.00
 2 2019-06-03 00:00:00            0.00
 3 2019-06-03 01:00:00            0.00
 4 2019-06-03 02:00:00           19.36,
         date_forecast  absolute_humidity_2m:gm3  air_density_2m:kgm3  \
 0 2019-06-02 22:00:00                       7.7                1.230   
 1 2019-06-02 22:15:00                       7.7                1.229   
 2 2019-06-02 22:30:00                       7.7                1.228   
 3 2019-06-02 22:45:00                       7.7                1.226   
 4 2019-06-02 23:00:00                       7.7                1.225   
 
    ceiling_height_agl:m  clear_sky_energy_1h:J  clear_sky_rad:W  \
 0           1744.900024                    0.0              0.0   
 1           1734.000000                    0.0              0.0   
 2           1723.500000                    0.0              0.0   
 3           1713.400024                    0.0      

Preprocessing steps:

In [13]:

# Build the modelling sets from the in-memory tables selected for location "a".
# prepare_data (src.features.feature_engineering) receives the objects
# themselves, not file paths; how it splits train/validation is defined in
# that module and is not visible from this notebook.
X_train, y_train, X_val, y_val, X_test, train_data, val_data = prepare_data(
    targets, features_observed, features_estimated, features_test
)

# Show the first row of every returned object as a quick sanity check.
tuple(
    part.head(1)
    for part in (X_train, y_train, X_val, y_val, X_test, train_data, val_data)
)


(                     absolute_humidity_2m:gm3  air_density_2m:kgm3  \
 2019-12-01 09:00:00                       4.8                1.272   
 
                      ceiling_height_agl:m  clear_sky_energy_1h:J  \
 2019-12-01 09:00:00                456.25                27331.5   
 
                      clear_sky_rad:W  cloud_base_agl:m  dew_or_rime:idx  \
 2019-12-01 09:00:00        17.549999            456.25              0.0   
 
                      dew_point_2m:K  diffuse_rad:W  diffuse_rad_1h:J  ...  \
 2019-12-01 09:00:00      273.150024          5.175       9751.150391  ...   
 
                      sun_azimuth:d  sun_elevation:d  \
 2019-12-01 09:00:00     155.780746          2.32725   
 
                      super_cooled_liquid_water:kgm2  t_1000hPa:K  \
 2019-12-01 09:00:00                            0.25   274.350006   
 
                      total_cloud_cover:p  visibility:m  wind_speed_10m:ms  \
 2019-12-01 09:00:00                100.0   7155.975098               4.

### Linear regression model

In [14]:
# Baseline: ordinary least-squares regression fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Inspect the validation features and targets before predicting on them.
(X_val.head(), y_val.head())

(                              date_calc  absolute_humidity_2m:gm3  \
 2022-12-15 03:00:00 2022-12-14 08:00:04                     2.700   
 2022-12-15 04:00:00 2022-12-14 08:00:04                     2.850   
 2022-12-15 05:00:00 2022-12-14 08:00:04                     3.050   
 2022-12-15 06:00:00 2022-12-14 08:00:04                     3.200   
 2022-12-15 07:00:00 2022-12-14 08:00:04                     3.325   
 
                      air_density_2m:kgm3  ceiling_height_agl:m  \
 2022-12-15 03:00:00              1.30475           3000.000000   
 2022-12-15 04:00:00              1.30150           2487.949951   
 2022-12-15 05:00:00              1.29825           2234.199951   
 2022-12-15 06:00:00              1.29575           2787.399902   
 2022-12-15 07:00:00              1.29400           2269.899902   
 
                      clear_sky_energy_1h:J  clear_sky_rad:W  cloud_base_agl:m  \
 2022-12-15 03:00:00                    0.0              0.0        360.349976   
 2022-12-1

Linear regression predictions

In [15]:
# Predict on the validation set.
# X_val contains a `date_calc` column that is not present in X_train, so the
# fitted model cannot predict on it as-is; the column is removed first.
# errors='ignore' makes this cell safe to re-run: once the column has been
# dropped, a second execution is a no-op instead of raising KeyError.
X_val = X_val.drop(columns=['date_calc'], errors='ignore')

y_val_pred = lr_model.predict(X_val)

# Sanity check: the targets and the predictions should have matching lengths.
print(y_val.shape, y_val_pred.shape)

# Evaluate the model using MAE (mean absolute error) on the validation split.
mae = mean_absolute_error(y_val, y_val_pred)
print('MAE:', mae)


(372,) (372,)
MAE: 109.61028441070229


# Fine-tuning the model

In [18]:

# Second model: degree-2 polynomial expansion -> standardisation -> Ridge.
# NOTE: this cell uses X_val as left by the linear-regression prediction cell,
# i.e. with the `date_calc` column already removed.
poly = PolynomialFeatures(degree=2)  # polynomial feature expansion, degree 2
scaler = StandardScaler()            # feature scaling applied after the expansion
ridge_model = Ridge(alpha=1.0)       # alpha is the regularization strength

# Chain the three stages into one pipeline and fit it on the training data;
# fit() returns the pipeline itself, so `model` is the fitted pipeline.
model = make_pipeline(poly, scaler, ridge_model).fit(X_train, y_train)

# Predict on the validation data and report the mean absolute error.
y_val_pred = model.predict(X_val)
mae_val = mean_absolute_error(y_val, y_val_pred)
print('MAE:', mae_val)


MAE: 372.5174423915083
