In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [13]:
# Read the preprocessed dataset (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='data_preprocessed_for_qnn.csv')

In [14]:
# Show the header of the loaded frame so the feature names selected below
# can be checked against what the file actually contains
print("Columns in the dataset:", data.columns, sep="\n")

Columns in the dataset:
Index(['generation biomass', 'generation fossil brown coal/lignite',
       'generation fossil coal-derived gas', 'generation fossil gas',
       'generation fossil hard coal', 'generation fossil oil',
       'generation fossil oil shale', 'generation fossil peat',
       'generation geothermal', 'generation hydro pumped storage consumption',
       'generation hydro run-of-river and poundage',
       'generation hydro water reservoir', 'generation marine',
       'generation nuclear', 'generation other', 'generation other renewable',
       'generation solar', 'generation waste', 'generation wind offshore',
       'generation wind onshore', 'forecast solar day ahead',
       'forecast wind onshore day ahead', 'total load forecast',
       'total load actual', 'price day ahead', 'price actual', 'temp',
       'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
       'wind_deg', 'rain_1h', 'rain_3h', 'snow_3h', 'clouds_all', 'weather_id',
       'time'],
      dtype='object')

In [15]:
# Input columns for the model: all 20 generation series followed by the
# day-ahead forecasts and the day-ahead price. The names must match the
# dataset's column labels exactly.
features = [
    f'generation {source}'
    for source in (
        'biomass',
        'fossil brown coal/lignite',
        'fossil coal-derived gas',
        'fossil gas',
        'fossil hard coal',
        'fossil oil',
        'fossil oil shale',
        'fossil peat',
        'geothermal',
        'hydro pumped storage consumption',
        'hydro run-of-river and poundage',
        'hydro water reservoir',
        'marine',
        'nuclear',
        'other',
        'other renewable',
        'solar',
        'waste',
        'wind offshore',
        'wind onshore',
    )
] + [
    'forecast solar day ahead',
    'forecast wind onshore day ahead',
    'total load forecast',
    'price day ahead',
]

In [16]:
# Guard: stop early, naming the offenders, when any selected feature is not a
# column of the loaded data (order of `features` is kept in the message)
missing_features = list(filter(lambda name: name not in data.columns, features))
if len(missing_features) > 0:
    raise KeyError(f"Missing columns in the dataset: {missing_features}")

In [17]:
# Regression target and model inputs, both taken from the loaded frame
y = data.loc[:, 'total load actual']
X = data.loc[:, features]

# Min-max scale each feature column.
# NOTE(review): the scaler is fit on every row of `data`, i.e. before any
# train/validation/test split is made.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

In [18]:
# Abort when the frame has fewer than 35,000 rows, since the split sizes used
# in this notebook are fixed row counts
if data.shape[0] < 35000:
    raise ValueError("The dataset must have at least 35,000 rows to create training and testing sets with the desired sizes.")


In [19]:
# Split the data into training, validation, and testing sets.
#
# The raw feature frame `X` is split (not a pre-scaled copy) so that the
# scaler below can be fit on the training rows alone. Fitting it on all rows
# would let the min/max of the validation and test rows leak into the
# training features.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=5260, random_state=42)  # 15% for testing
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, train_size=24544, random_state=42)  # 70% for training

# Learn each feature's min/max from the training rows only, then apply that
# same transformation to the held-out rows. Validation/test values that lie
# outside the training range will scale to slightly outside [0, 1].
# `scaler` is rebound here so it always matches the data written out below.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [20]:
# Wrap every split in a labelled DataFrame: feature splits get the feature
# names as columns, target splits get the single target column
X_train_df, X_val_df, X_test_df = (
    pd.DataFrame(split, columns=features)
    for split in (X_train, X_val, X_test)
)
y_train_df, y_val_df, y_test_df = (
    pd.DataFrame(target, columns=['total load actual'])
    for target in (y_train, y_val, y_test)
)

In [21]:
# Write each split to its own CSV file (row index omitted), in the order
# features first, then targets
for _frame, _filename in (
    (X_train_df, 'X_train.csv'),
    (X_val_df, 'X_val.csv'),
    (X_test_df, 'X_test.csv'),
    (y_train_df, 'y_train.csv'),
    (y_val_df, 'y_val.csv'),
    (y_test_df, 'y_test.csv'),
):
    _frame.to_csv(_filename, index=False)

In [22]:
# Report (rows, columns) for every split, one line each, as a sanity check
print(
    f"X_train shape: {X_train_df.shape}",
    f"X_val shape: {X_val_df.shape}",
    f"X_test shape: {X_test_df.shape}",
    f"y_train shape: {y_train_df.shape}",
    f"y_val shape: {y_val_df.shape}",
    f"y_test shape: {y_test_df.shape}",
    sep="\n",
)

X_train shape: (24544, 24)
X_val shape: (5260, 24)
X_test shape: (5260, 24)
y_train shape: (24544, 1)
y_val shape: (5260, 1)
y_test shape: (5260, 1)
