# Initial Model Building
This notebook is responsible for the train-test split, for training basic models to get a baseline performance, and for evaluating those models using chosen metrics (e.g., accuracy, F1-score, RMSE).
Our evaluation is based on mean absolute error (MAE).

In [24]:
import pandas as pd
from src.data.data_fetcher import get_all_features, get_raw_data

In [25]:
# Unpack the twelve objects returned by get_raw_data(), in the order it returns them.
# NOTE(review): the names suggest targets plus estimated/observed/test feature frames
# for locations a, b and c — confirm against src.data.data_fetcher.
(
    train_a, train_b, train_c,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
) = get_raw_data()

Preprocessing steps:

In [26]:
# CSV locations for every site: one file per feature dataset and location.
# Paths are relative to the notebook's working directory.
file_paths = {
    f'location_{site}': {
        dataset: f'data/raw/location_{site}_{dataset}.csv'
        for dataset in ('X_train_observed', 'X_train_estimated', 'X_test_estimated')
    }
    for site in ('a', 'b', 'c')
}

# Read every feature CSV into data[location][dataset_name].
data = {
    location: {
        dataset_name: pd.read_csv(csv_path)
        for dataset_name, csv_path in paths.items()
    }
    for location, paths in file_paths.items()
}

# Show which datasets were loaded for location A as an example.
print(data['location_a'].keys())

# Target (pv_measurement) CSVs, one per location.
target_file_paths = {
    'location_a': 'data/raw/A_train_targets.csv',
    'location_b': 'data/raw/B_train_targets.csv',
    'location_c': 'data/raw/C_train_targets.csv',
}

# Store each location's targets next to its feature frames under 'train_targets'.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, which looks like a
# saved index; it is kept here and will travel into any merge — confirm it is dropped
# before modelling.
for location, targets_csv in target_file_paths.items():
    data[location]['train_targets'] = pd.read_csv(targets_csv)

# Display the first few rows of the target dataset for location A as an example.
data['location_a']['train_targets'].head()

dict_keys(['X_train_observed', 'X_train_estimated', 'X_test_estimated'])


Unnamed: 0,time,pv_measurement
0,2019-06-02 22:00:00,0.0
1,2019-06-02 23:00:00,0.0
2,2019-06-03 00:00:00,0.0
3,2019-06-03 01:00:00,0.0
4,2019-06-03 02:00:00,19.36


In [32]:
from datetime import datetime  # NOTE(review): unused in this cell; kept in case later cells rely on it

# Data for Location A (these are the same frame objects stored in `data`)
X_train_observed_A = data['location_a']['X_train_observed']
X_train_estimated_A = data['location_a']['X_train_estimated']
train_targets_A = data['location_a']['train_targets']

# Convert date columns to datetime format.
# This assigns in place, so the frames inside `data` are updated as well.
X_train_observed_A['date_forecast'] = pd.to_datetime(X_train_observed_A['date_forecast'])
X_train_estimated_A['date_forecast'] = pd.to_datetime(X_train_estimated_A['date_forecast'])
train_targets_A['time'] = pd.to_datetime(train_targets_A['time'])

# Aggregate the 15-minute weather data to hourly by taking the mean.
# 'h' is the current spelling of the hourly alias ('H' is deprecated in recent pandas).
X_train_observed_A_hourly = X_train_observed_A.resample('h', on='date_forecast').mean()
X_train_estimated_A_hourly = X_train_estimated_A.resample('h', on='date_forecast').mean()

# Combine observed and estimated weather data into one chronologically sorted frame
X_train_A_combined = pd.concat([X_train_observed_A_hourly, X_train_estimated_A_hourly]).sort_index()

# Merge weather data with the target data.
# Without a tolerance, direction='nearest' pairs EVERY weather hour with the closest
# target row no matter how far away it is, so hours without a measurement would
# silently borrow a label from another time. A zero tolerance restricts the match to
# identical timestamps; merge_asof also requires the right-hand key to be sorted.
merged_data_A = pd.merge_asof(
    X_train_A_combined,
    train_targets_A.sort_values('time'),
    left_index=True,
    right_on='time',
    direction='nearest',
    tolerance=pd.Timedelta(0),
)

# Weather hours with no target at the same timestamp (or a missing measurement)
# cannot be used for supervised training, so drop them.
merged_data_A = merged_data_A.dropna(subset=['pv_measurement'])

# Check the first few rows of the merged dataset
merged_data_A.head()


                     absolute_humidity_2m:gm3  air_density_2m:kgm3  \
date_forecast                                                        
2019-06-02 22:00:00                     7.700              1.22825   
2019-06-02 23:00:00                     7.700              1.22350   
2019-06-03 00:00:00                     7.875              1.21975   
2019-06-03 01:00:00                     8.425              1.21800   
2019-06-03 02:00:00                     8.950              1.21800   

                     ceiling_height_agl:m  clear_sky_energy_1h:J  \
date_forecast                                                      
2019-06-02 22:00:00              1728.950                   0.00   
2019-06-02 23:00:00              1689.825                   0.00   
2019-06-03 00:00:00              1563.225                   0.00   
2019-06-03 01:00:00              1283.425                 208.65   
2019-06-03 02:00:00              1003.500               32468.15   

                     clear_sky_r

In [None]:

# Extract calendar features from the hourly forecast timestamp, i.e. the frame's
# `date_forecast` index. The test-set cell derives the same features from its
# `date_forecast` index, so using the index here keeps train and test features
# consistent even where the merged target `time` differs from the weather timestamp.
forecast_time_A = merged_data_A.index
merged_data_A['hour'] = forecast_time_A.hour
merged_data_A['day_of_week'] = forecast_time_A.dayofweek
merged_data_A['month'] = forecast_time_A.month

### Linear regression model

In [None]:
# NOTE(review): X_train, y_train, X_val, y_val, lr_model and mean_absolute_error are not
# defined or imported in any visible cell; they must already exist in the kernel for
# this cell to run.
# Assumes y_train / y_val are pandas Series sharing the index of X_train / X_val (the
# previous label-based lookup relied on the same thing) — TODO confirm.

# Keep only rows where every feature AND the target are present. Dropping NaNs from
# the features alone would let a NaN target reach fit() and the MAE computation.
train_complete = X_train.notna().all(axis=1) & y_train.notna()
X_train_cleaned = X_train.loc[train_complete]
y_train_cleaned = y_train.loc[train_complete]

val_complete = X_val.notna().all(axis=1) & y_val.notna()
X_val_cleaned = X_val.loc[val_complete]
y_val_cleaned = y_val.loc[val_complete]

# Train a linear regression model on the cleaned data
lr_model.fit(X_train_cleaned, y_train_cleaned)

# Predict on the cleaned validation set
y_val_pred_cleaned = lr_model.predict(X_val_cleaned)

# Evaluate the model using MAE on the cleaned validation set
mae_cleaned = mean_absolute_error(y_val_cleaned, y_val_pred_cleaned)
mae_cleaned


### Linear regression predictions

In [None]:
# NOTE(review): `lr_model` is not defined in any visible cell; it must already be
# fitted in the kernel before this cell runs.

# Data preprocessing for X_test_estimated.
# The date column is converted in place, so the frame stored in `data` is updated too.
X_test_A = data['location_a']['X_test_estimated']
X_test_A['date_forecast'] = pd.to_datetime(X_test_A['date_forecast'])
# One row per hour; 'h' is the current spelling of the hourly alias ('H' is deprecated
# in recent pandas).
X_test_A_hourly = X_test_A.resample('h', on='date_forecast').mean()

# Feature engineering for X_test_estimated: calendar features from the hourly index
X_test_A_hourly['hour'] = X_test_A_hourly.index.hour
X_test_A_hourly['day_of_week'] = X_test_A_hourly.index.dayofweek
X_test_A_hourly['month'] = X_test_A_hourly.index.month

# Align the test columns with the features the model was fitted on (when the model
# recorded them), so extra columns are dropped and the column order matches.
# Doing this before dropna() also means rows are only discarded for NaNs in features
# the model actually uses.
expected_features = getattr(lr_model, 'feature_names_in_', None)
if expected_features is None:
    X_test_A_features = X_test_A_hourly
else:
    missing_features = [name for name in expected_features if name not in X_test_A_hourly.columns]
    if missing_features:
        raise KeyError(f"Test data is missing features the model was trained on: {missing_features}")
    X_test_A_features = X_test_A_hourly[list(expected_features)]

# Predict using the linear regression model.
# NOTE: hours with any missing feature are dropped, so they receive no prediction.
X_test_A_cleaned = X_test_A_features.dropna()
y_test_pred_A = lr_model.predict(X_test_A_cleaned)

# Create a DataFrame for predictions, keyed by the hourly forecast timestamp
predictions_A = pd.DataFrame({
    'date_forecast': X_test_A_cleaned.index,
    'predicted_pv_measurement': y_test_pred_A
})

predictions_A.head()
