In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
color_pal = sns.color_palette()


from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


## Import Data Files for Analysis

In [2]:
from pathlib import Path

# Single place to point the notebook at the raw CSV extracts: every file below is
# read from this directory, so relocating the data only means editing this constant.
DATA_DIR = Path("C:/Users/aryan2/Assessment Data")

# Temperature tables, one per state
temperature_vic = pd.read_csv(DATA_DIR / "temperature_vic.csv")
temperature_qld = pd.read_csv(DATA_DIR / "temperature_qld.csv")
temperature_sa = pd.read_csv(DATA_DIR / "temperature_sa.csv")

# Forecast-demand tables, one per state
forecastdemand_vic = pd.read_csv(DATA_DIR / "forecastdemand_vic.csv")
forecastdemand_qld = pd.read_csv(DATA_DIR / "forecastdemand_qld.csv")
forecastdemand_sa = pd.read_csv(DATA_DIR / "forecastdemand_sa.csv")

# Total-demand tables, one per state
totaldemand_vic = pd.read_csv(DATA_DIR / "totaldemand_vic.csv")
totaldemand_qld = pd.read_csv(DATA_DIR / "totaldemand_qld.csv")
totaldemand_sa = pd.read_csv(DATA_DIR / "totaldemand_sa.csv")


## Inspect and Clean Data

In [23]:
# Victoria: for each table, count the rows whose DATETIME repeats an earlier row
# and report that against the table's total row count.

# Forecast demand
duplicate_count_demand_vic = forecastdemand_vic.duplicated(subset='DATETIME').sum()
total_records_demand_vic = forecastdemand_vic.shape[0]
print("Duplicate count in demand_vic = {} out of {} records".format(duplicate_count_demand_vic, total_records_demand_vic))

# Temperature
duplicate_count_temp_vic = temperature_vic.duplicated(subset='DATETIME').sum()
total_records_temp_vic = temperature_vic.shape[0]
print("Duplicate count in temperature_vic = {} out of {} records".format(duplicate_count_temp_vic, total_records_temp_vic))

# Total demand
duplicate_count_totaldemand_vic = totaldemand_vic.duplicated(subset='DATETIME').sum()
total_records_totaldemand_vic = totaldemand_vic.shape[0]
print("Duplicate count in totaldemand_vic = {} out of {} records".format(duplicate_count_totaldemand_vic, total_records_totaldemand_vic))



Duplicate count in demand_vic = 4021759 out of 4095592 records
Duplicate count in temperature_vic = 0 out of 141681 records
Duplicate count in totaldemand_vic = 0 out of 196513 records


In [24]:
# South Australia: for each table, count the rows whose DATETIME repeats an earlier
# row and report that against the table's total row count.

# Forecast demand
duplicate_count_demand_sa = forecastdemand_sa.duplicated(subset='DATETIME').sum()
total_records_demand_sa = forecastdemand_sa.shape[0]
print("Duplicate count in demand_sa = {} out of {} records".format(duplicate_count_demand_sa, total_records_demand_sa))

# Temperature
duplicate_count_temp_sa = temperature_sa.duplicated(subset='DATETIME').sum()
total_records_temp_sa = temperature_sa.shape[0]
print("Duplicate count in temperature_sa = {} out of {} records".format(duplicate_count_temp_sa, total_records_temp_sa))

# Total demand
duplicate_count_totaldemand_sa = totaldemand_sa.duplicated(subset='DATETIME').sum()
total_records_totaldemand_sa = totaldemand_sa.shape[0]
print("Duplicate count in totaldemand_sa = {} out of {} records".format(duplicate_count_totaldemand_sa, total_records_totaldemand_sa))


Duplicate count in demand_sa = 0 out of 73833 records
Duplicate count in temperature_sa = 0 out of 208085 records
Duplicate count in totaldemand_sa = 0 out of 196512 records


In [25]:
# Queensland: for each table, count the rows whose DATETIME repeats an earlier row
# and report that against the table's total row count.

# Forecast demand
duplicate_count_demand_qld = forecastdemand_qld.duplicated(subset='DATETIME').sum()
total_records_demand_qld = forecastdemand_qld.shape[0]
print("Duplicate count in demand_qld = {} out of {} records".format(duplicate_count_demand_qld, total_records_demand_qld))

# Temperature
duplicate_count_temp_qld = temperature_qld.duplicated(subset='DATETIME').sum()
total_records_temp_qld = temperature_qld.shape[0]
print("Duplicate count in temperature_qld = {} out of {} records".format(duplicate_count_temp_qld, total_records_temp_qld))

# Total demand
duplicate_count_totaldemand_qld = totaldemand_qld.duplicated(subset='DATETIME').sum()
total_records_totaldemand_qld = totaldemand_qld.shape[0]
print("Duplicate count in totaldemand_qld = {} out of {} records".format(duplicate_count_totaldemand_qld, total_records_totaldemand_qld))


Duplicate count in demand_qld = 4021759 out of 4095592 records
Duplicate count in temperature_qld = 0 out of 208085 records
Duplicate count in totaldemand_qld = 0 out of 196513 records


# Remove Duplicates

In [31]:
# Rows that have a later row with the same DATETIME; only the final occurrence of
# each timestamp (in file order) is kept.
# NOTE(review): "last in file order" is assumed to be the forecast we want - confirm.
superseded_rows_qld = forecastdemand_qld.duplicated(subset='DATETIME', keep='last')

# De-duplicated table, and the discarded rows kept aside for auditing
forecastdemand_qld_no_duplicates = forecastdemand_qld.drop_duplicates(subset='DATETIME', keep='last')
duplicates_forecastdemand_qld = forecastdemand_qld[superseded_rows_qld]

# Persist the discarded rows
duplicates_forecastdemand_qld.to_csv("forecastdemand_qld_duplicates.csv", index=False)

# Before/after shapes as a sanity check
print("Original forecastdemand_qld shape:", forecastdemand_qld.shape)
print("forecastdemand_qld shape after removing duplicates:", forecastdemand_qld_no_duplicates.shape)


Original forecastdemand_qld shape: (4095592, 6)
forecastdemand_qld shape after removing duplicates: (73833, 6)


In [32]:
# Rows that have a later row with the same DATETIME; only the final occurrence of
# each timestamp (in file order) is kept.
# NOTE(review): "last in file order" is assumed to be the forecast we want - confirm.
superseded_rows_vic = forecastdemand_vic.duplicated(subset='DATETIME', keep='last')

# De-duplicated Victorian table, and the discarded rows kept aside for auditing
forecastdemand_vic_no_duplicates = forecastdemand_vic.drop_duplicates(subset='DATETIME', keep='last')
duplicates_forecastdemand_vic = forecastdemand_vic[superseded_rows_vic]

# Persist the discarded rows
duplicates_forecastdemand_vic.to_csv("forecastdemand_vic_duplicates.csv", index=False)

# Before/after shapes as a sanity check for forecastdemand_vic
print("Original forecastdemand_vic shape:", forecastdemand_vic.shape)
print("forecastdemand_vic shape after removing duplicates:", forecastdemand_vic_no_duplicates.shape)


Original forecastdemand_vic shape: (4095592, 6)
forecastdemand_vic shape after removing duplicates: (73833, 6)


## Copy the de-duplicated tables back to the master variables

In [34]:
# Replace the master table with an independent copy of the de-duplicated rows.
# drop_duplicates returns a filtered selection of the original frame; copying it makes
# later column assignments (e.g. the DATETIME conversion) write to a frame we own
# rather than risking a SettingWithCopyWarning.
forecastdemand_qld = forecastdemand_qld_no_duplicates.copy()

In [35]:
# Summary statistics of the numeric columns of forecastdemand_qld
forecastdemand_qld.describe()

Unnamed: 0,PREDISPATCHSEQNO,PERIODID,FORECASTDEMAND
count,73833.0,73833.0,73833.0
mean,2018690000.0,1.021765,6212.168522
std,1214770.0,0.902165,896.399377
min,2016123000.0,1.0,3764.77
25%,2018012000.0,1.0,5505.05
50%,2019021000.0,1.0,6102.77
75%,2020023000.0,1.0,6815.45
max,2021032000.0,57.0,9964.84


In [33]:
# Replace the master table with an independent copy of the de-duplicated rows.
# drop_duplicates returns a filtered selection of the original frame; copying it makes
# later column assignments write to a frame we own rather than risking a
# SettingWithCopyWarning.
forecastdemand_vic = forecastdemand_vic_no_duplicates.copy()

In [37]:
# Summary statistics of the numeric columns of forecastdemand_vic
forecastdemand_vic.describe()

Unnamed: 0,PREDISPATCHSEQNO,PERIODID,FORECASTDEMAND
count,73833.0,73833.0,73833.0
mean,2018690000.0,1.021765,4889.051609
std,1214770.0,0.902165,869.253987
min,2016123000.0,1.0,2354.24
25%,2018012000.0,1.0,4238.98
50%,2019021000.0,1.0,4788.08
75%,2020023000.0,1.0,5398.37
max,2021032000.0,57.0,9580.89


In [38]:
# Summary statistics of the numeric columns of forecastdemand_sa
forecastdemand_sa.describe()

Unnamed: 0,PREDISPATCHSEQNO,PERIODID,FORECASTDEMAND
count,73833.0,73833.0,73833.0
mean,2018687000.0,55.502377,1282.122297
std,1215063.0,13.854351,331.53848
min,2016123000.0,32.0,194.63
25%,2018012000.0,44.0,1083.04
50%,2019021000.0,56.0,1253.84
75%,2020023000.0,68.0,1448.9
max,2021032000.0,79.0,3081.02


## The forecast-demand record counts are now equal across all three states

## Part 2 Merging Tables

# Start with South Australia

In [39]:
# Make sure the timestamps are real datetimes before doing arithmetic on them
temperature_sa['DATETIME'] = pd.to_datetime(temperature_sa['DATETIME'])

# Mean gap between consecutive rows, then the overall time span
time_diff = temperature_sa['DATETIME'].diff().mean()
min_date, max_date = temperature_sa['DATETIME'].min(), temperature_sa['DATETIME'].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)
print("Average Time Between Samples:", time_diff)

Minimum Date: 2010-01-01 00:00:00
Maximum Date: 2021-03-18 00:00:00
Average Time Between Samples: 0 days 00:28:19.898118067


In [41]:
# Make sure the timestamps are real datetimes before doing arithmetic on them
totaldemand_sa['DATETIME'] = pd.to_datetime(totaldemand_sa['DATETIME'])

# Mean gap between consecutive rows, then the overall time span
time_diff = totaldemand_sa['DATETIME'].diff().mean()
min_date, max_date = totaldemand_sa['DATETIME'].min(), totaldemand_sa['DATETIME'].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)
print("Average Time Between Samples:", time_diff)

### Stopped Here Today - 

In [42]:
# Make sure the timestamps are real datetimes before doing arithmetic on them
forecastdemand_qld['DATETIME'] = pd.to_datetime(forecastdemand_qld['DATETIME'])

# Mean gap between consecutive rows, then the overall time span
time_diff = forecastdemand_qld['DATETIME'].diff().mean()
min_date, max_date = forecastdemand_qld['DATETIME'].min(), forecastdemand_qld['DATETIME'].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)
print("Average Time Between Samples:", time_diff)

Minimum Date: 2017-01-01 00:00:00
Maximum Date: 2021-03-19 04:00:00
Average Time Between Samples: 0 days 00:30:00


In [42]:
# Same check as the preceding cell, repeated for forecastdemand_qld.
forecastdemand_qld['DATETIME'] = pd.to_datetime(forecastdemand_qld['DATETIME'])

# Time span covered by the table
min_date, max_date = forecastdemand_qld['DATETIME'].min(), forecastdemand_qld['DATETIME'].max()

# Mean gap between consecutive rows
time_diff = forecastdemand_qld['DATETIME'].diff().mean()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)
print("Average Time Between Samples:", time_diff)

Minimum Date: 2017-01-01 00:00:00
Maximum Date: 2021-03-19 04:00:00
Average Time Between Samples: 0 days 00:30:00


#QLD

In [41]:
# NOTE(review): this cell sits under the "#QLD" heading but inspects totaldemand_sa;
# it may have been meant for totaldemand_qld - confirm before relying on the output.
totaldemand_sa['DATETIME'] = pd.to_datetime(totaldemand_sa['DATETIME'])

# Time span covered by the table
min_date, max_date = totaldemand_sa['DATETIME'].min(), totaldemand_sa['DATETIME'].max()

# Mean gap between consecutive rows
time_diff = totaldemand_sa['DATETIME'].diff().mean()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)
print("Average Time Between Samples:", time_diff)

In [42]:
# Queensland forecast demand: ensure datetimes, then report span and sampling gap
forecastdemand_qld['DATETIME'] = pd.to_datetime(forecastdemand_qld['DATETIME'])

min_date, max_date = forecastdemand_qld['DATETIME'].min(), forecastdemand_qld['DATETIME'].max()
time_diff = forecastdemand_qld['DATETIME'].diff().mean()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)
print("Average Time Between Samples:", time_diff)

Minimum Date: 2017-01-01 00:00:00
Maximum Date: 2021-03-19 04:00:00
Average Time Between Samples: 0 days 00:30:00


## Explore Dataset - Plot a Month worth of Temperature Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Convert 'DATETIME' column to datetime type if it's not already
temperature_sa['DATETIME'] = pd.to_datetime(temperature_sa['DATETIME'])

# Keep all of January 2010 using a half-open window [start_date, end_date).
# An upper bound of '2010-01-31' is interpreted as midnight, so `<=` against it would
# keep only the 00:00 reading of 31 January and silently drop the rest of that day.
start_date = '2010-01-01'
end_date = '2010-02-01'
temperature_sa_single_month = temperature_sa[
    (temperature_sa['DATETIME'] >= start_date) & (temperature_sa['DATETIME'] < end_date)
]

# Plotting
plt.figure(figsize=(10, 6))
plt.plot(temperature_sa_single_month['DATETIME'], temperature_sa_single_month['TEMPERATURE'], marker='o', linestyle='-')
plt.title('Temperature vs. Datetime (January 2010)')
plt.xlabel('Datetime')
plt.ylabel('Temperature')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Convert 'DATETIME' column to datetime type if it's not already
totaldemand_sa['DATETIME'] = pd.to_datetime(totaldemand_sa['DATETIME'])

# Keep January through March 2010 using a half-open window [start_date, end_date).
# An upper bound of '2010-03-31' is interpreted as midnight, so `<=` against it would
# drop every reading of 31 March after 00:00.
# (The variable keeps its historical "single_month" name even though it spans three months.)
start_date = '2010-01-01'
end_date = '2010-04-01'
totaldemand_sa_single_month = totaldemand_sa[
    (totaldemand_sa['DATETIME'] >= start_date) & (totaldemand_sa['DATETIME'] < end_date)
]

# Plotting
plt.figure(figsize=(10, 6))
plt.plot(totaldemand_sa_single_month['DATETIME'], totaldemand_sa_single_month['TOTALDEMAND'], marker='o', linestyle='-')
plt.title('TOTALDEMAND vs. Datetime (January - March 2010)')
plt.xlabel('Datetime')
plt.ylabel('TOTALDEMAND')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# All three tables must share a datetime64 join key: pandas refuses to merge a
# datetime64 column with an object (string) column. No earlier cell in this notebook
# converts forecastdemand_sa['DATETIME'], so convert all three here; pd.to_datetime
# leaves an already-converted column unchanged, which keeps this cell re-runnable.
temperature_sa['DATETIME'] = pd.to_datetime(temperature_sa['DATETIME'])
totaldemand_sa['DATETIME'] = pd.to_datetime(totaldemand_sa['DATETIME'])
forecastdemand_sa['DATETIME'] = pd.to_datetime(forecastdemand_sa['DATETIME'])

# Temperature + actual demand: keep only timestamps present in both tables
sa_df = pd.merge(temperature_sa, totaldemand_sa, on='DATETIME', how='inner')

# Add the forecast demand, again keeping only timestamps present on both sides
sa_df = pd.merge(sa_df, forecastdemand_sa, on='DATETIME', how='inner')

print(sa_df)

In [None]:
# Re-check the South Australian temperature table: span and sampling gap
temperature_sa['DATETIME'] = pd.to_datetime(temperature_sa['DATETIME'])

min_date, max_date = temperature_sa['DATETIME'].min(), temperature_sa['DATETIME'].max()
time_diff = temperature_sa['DATETIME'].diff().mean()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)
print("Average Time Between Samples:", time_diff)

In [None]:
# Convert 'DATETIME' column to datetime type if it's not already
sa_df['DATETIME'] = pd.to_datetime(sa_df['DATETIME'])

# Keep only rows whose timestamp lies in the closed range below.
# A boolean mask is used instead of slicing between the first/last matching index
# labels: the mask does not rely on the rows being sorted or on a 0-based index, gives
# the same result when the cell is re-run, and yields an empty frame (rather than an
# IndexError) when nothing matches.
in_range = sa_df['DATETIME'].between('2017-01-01 00:00:00', '2021-03-18 00:00:00')

# .copy() so that the feature columns added in later cells are written to an
# independent frame rather than to a slice of the merged table.
sa_df = sa_df[in_range].copy()

print(sa_df)

In [None]:
# Column labels of the merged South Australian table, as a plain list
column_names = list(sa_df.columns)
print(column_names)

In [None]:
# Number of missing cells in each column of sa_df
missing_per_column = sa_df.isnull().sum()

# Boolean flag per column: does it contain at least one missing value?
null_values = missing_per_column > 0

# Show only the columns that do
print("Columns with missing values:")
print(null_values[null_values])

# Grand total of missing cells across the whole frame
total_missing = missing_per_column.sum()
print("Total missing values:", total_missing)


In [None]:
# Pearson correlation (the pandas default, spelled out here) between the two series
correlation = sa_df['TEMPERATURE'].corr(other=sa_df['TOTALDEMAND'], method='pearson')

print("Correlation between TEMPERATURE and TOTALDEMAND:", correlation)


In [None]:
# Summary statistics of the numeric columns of the merged sa_df table
print(sa_df.describe())

# Further data exploration

In [None]:

# --- Scatter: demand against temperature -------------------------------------
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=sa_df, x='TEMPERATURE', y='TOTALDEMAND', ax=ax_scatter)
ax_scatter.set_title('Scatter Plot of TEMPERATURE vs TOTALDEMAND')
ax_scatter.set_xlabel('TEMPERATURE')
ax_scatter.set_ylabel('TOTALDEMAND')
plt.show()

# --- Line plot: both series against the row index, on twin y-axes --------------
fig, ax1 = plt.subplots(figsize=(10, 6))

# Demand on the left-hand axis (blue)
sns.lineplot(data=sa_df, x=sa_df.index, y='TOTALDEMAND', label='TOTALDEMAND', color='b', ax=ax1)
ax1.set_ylabel('TOTALDEMAND', color='b')
ax1.tick_params('y', colors='b')

# Temperature on a right-hand axis sharing the same x-axis (red)
ax2 = ax1.twinx()
sns.lineplot(data=sa_df, x=sa_df.index, y='TEMPERATURE', label='TEMPERATURE', color='r', ax=ax2)
ax2.set_ylabel('TEMPERATURE', color='r')
ax2.tick_params('y', colors='r')

# Title, x label and one legend per axis
plt.title('Line Plot of TOTALDEMAND and TEMPERATURE')
ax1.set_xlabel('Time')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.show()

# --- Box plot: demand distribution within five equal-width temperature bins ---
temperature_bins = pd.cut(sa_df['TEMPERATURE'], bins=5)
fig_box, ax_box = plt.subplots(figsize=(8, 6))
sns.boxplot(data=sa_df, x=temperature_bins, y='TOTALDEMAND', ax=ax_box)
ax_box.set_title('Box Plot of TOTALDEMAND by TEMPERATURE Ranges')
ax_box.set_xlabel('TEMPERATURE Range')
ax_box.set_ylabel('TOTALDEMAND')
plt.xticks(rotation=45)
plt.show()

# --- Histograms: side-by-side distributions of the two variables ---------------
fig_hist, (ax_temp, ax_demand) = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(sa_df['TEMPERATURE'], bins=20, kde=True, ax=ax_temp)
ax_temp.set_title('Distribution of TEMPERATURE')
ax_temp.set_xlabel('TEMPERATURE')
ax_temp.set_ylabel('Frequency')

sns.histplot(sa_df['TOTALDEMAND'], bins=20, kde=True, ax=ax_demand)
ax_demand.set_title('Distribution of TOTALDEMAND')
ax_demand.set_xlabel('TOTALDEMAND')
ax_demand.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


## Compare forecast demand, total demand and temperature for January 2017

In [None]:
# Filter data for January 2017 with a half-open window: `<= '2017-01-31'` would compare
# against midnight and drop every reading on 31 January after 00:00.
january_2017_df = sa_df[(sa_df['DATETIME'] >= '2017-01-01') & (sa_df['DATETIME'] < '2017-02-01')]

# Plotting
fig, ax1 = plt.subplots(figsize=(10, 6))

# Primary y-axis (left): forecast and actual demand share one scale
ax1.plot(january_2017_df['DATETIME'], january_2017_df['FORECASTDEMAND'], label='Forecast Demand', color='blue')
ax1.plot(january_2017_df['DATETIME'], january_2017_df['TOTALDEMAND'], label='Total Demand', color='green')
ax1.set_xlabel('Datetime')
ax1.set_ylabel('Demand', color='black')
ax1.tick_params('y', colors='black')
ax1.legend(loc='upper left')

# Secondary y-axis (right) for temperature, which has a different scale
ax2 = ax1.twinx()
ax2.plot(january_2017_df['DATETIME'], january_2017_df['TEMPERATURE'], label='Temperature', color='red')
ax2.set_ylabel('Temperature', color='black')
ax2.tick_params('y', colors='black')
ax2.legend(loc='upper right')

plt.title('Demand and Temperature for January 2017')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Add in additional features

In [None]:
# Define a function to map months to seasons in Australia
def get_season_australia(month):
    """Return the Australian season name for a month number.

    Month 12 and months 1-2 map to 'Summer', 3-5 to 'Autumn' and 6-8 to 'Winter'.
    Every other value (9-11, and anything outside 1-12) falls through to 'Spring'.
    """
    if month == 12 or 1 <= month <= 2:
        return 'Summer'
    if 3 <= month <= 5:
        return 'Autumn'
    if 6 <= month <= 8:
        return 'Winter'
    return 'Spring'

# Month number taken from each timestamp. The column is deliberately kept (not
# dropped after deriving SEASON) because later cells use MONTH as a model feature.
sa_df['MONTH'] = sa_df['DATETIME'].dt.month

# Season label for each row, derived element-wise from MONTH
sa_df['SEASON'] = sa_df['MONTH'].map(get_season_australia)


In [None]:
# Day of week as an integer (pandas convention: Monday=0 ... Sunday=6)
sa_df['WEEKDAY'] = sa_df['DATETIME'].dt.dayofweek

# Quick visual check of the new column
print(sa_df.head())


# Power Prices

In [None]:
# pip install kaggle

In [None]:
# import pandas as pd

# # Load power prices data from CSV file
# # Replace 'power_prices.csv' with the actual file path
# power_prices_df = pd.read_csv('power_prices.csv')

# # Assuming the DataFrame has columns 'Date' and 'Price', and 'Date' is in datetime format
# # If 'Date' is not in datetime format, you can convert it using pd.to_datetime()

# # Filter data for the periods 2017 to 2021
# start_date = '2017-01-01'
# end_date = '2021-12-31'

# power_prices_filtered_df = power_prices_df[(power_prices_df['Date'] >= start_date) & (power_prices_df['Date'] <= end_date)]

# # Display the first few rows of the filtered DataFrame
# print(power_prices_filtered_df.head())


### Add in holidays

In [None]:
# pip install holidays

In [None]:
# Distinct calendar years present in sa_df, in order of first appearance
years = pd.unique(sa_df['DATETIME'].dt.year)

print("Years contained in sa_df:", years)


In [None]:
import pandas as pd
import holidays

# Build the Australian public-holiday calendar for every year present in sa_df
# (despite its singular name, `year` holds the whole array of years).
# NOTE(review): no state/subdivision is passed to holidays.AU, so state-specific
# South Australian holidays may be missing - confirm against the holidays package docs.
year = years
australian_holidays = holidays.AU(years=year)

# One (date, holiday name) pair per entry, loaded straight into a two-column frame
holiday_dates = list(australian_holidays.items())
australian_public_holidays = pd.DataFrame(holiday_dates, columns=['Date', 'Holiday'])

print(australian_public_holidays)


In [None]:


# Convert 'Date' column to datetime type in australian_public_holidays DataFrame
australian_public_holidays['Date'] = pd.to_datetime(australian_public_holidays['Date'])

# Flag every reading whose *calendar day* is a public holiday.
# Holiday dates are midnight timestamps, whereas sa_df['DATETIME'] carries a time of
# day. Joining the two columns directly would match only the 00:00 reading of each
# holiday and leave the rest of that day flagged 0, so both sides are truncated to
# midnight and compared by day instead.
holiday_days = australian_public_holidays['Date'].dt.normalize()

# A fresh 0..n-1 index, matching what the previous merge-based version produced
sa_df = sa_df.reset_index(drop=True)
sa_df['IS_PUBLIC_HOLIDAY'] = sa_df['DATETIME'].dt.normalize().isin(holiday_days).astype(int)

# Print out a few rows to verify
print(sa_df.head())



In [None]:
# dtype of every column in sa_df, printed under a heading line
print("Column types in sa_df:")
column_types = sa_df.dtypes
print(column_types)


In [None]:
# Degrees above 24 ("Cooling") and below 20 ("Heating"), floored at zero.
# Computed with vectorised arithmetic instead of a per-row Python lambda; the result
# is the same, including the previous behaviour of mapping a missing TEMPERATURE to 0
# (max(0, nan) evaluates to 0), which the trailing fillna(0) reproduces.
# NOTE(review): treating a missing temperature as "no cooling/heating demand" is
# inherited behaviour - confirm it is intended.
sa_df['Cooling'] = (sa_df['TEMPERATURE'] - 24).clip(lower=0).fillna(0)
sa_df['Heating'] = (20 - sa_df['TEMPERATURE']).clip(lower=0).fillna(0)

# Print out a few rows to verify
print(sa_df[['Cooling', 'Heating']].head())

In [None]:
# INDEX = 1 + the number of rows *before* the current one that have a non-null
# TEMPERATURE. When no temperature is missing this is simply 1, 2, 3, ...
has_temperature = sa_df['TEMPERATURE'].notnull()
readings_before = has_temperature.cumsum().shift(fill_value=0)
sa_df['INDEX'] = readings_before + 1

# Print out a few rows to verify
print(sa_df.head(20))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Calendar features used by the model. They are derived here from DATETIME so the
# cell does not fail with a KeyError when YEAR/DAY/HOUR have not been created by
# another cell; re-deriving existing columns is harmless.
sa_df['YEAR'] = sa_df['DATETIME'].dt.year
sa_df['MONTH'] = sa_df['DATETIME'].dt.month
sa_df['DAY'] = sa_df['DATETIME'].dt.day
sa_df['HOUR'] = sa_df['DATETIME'].dt.hour

# Define categorical, numerical, and binary features.
# Note: the "binary" group is simply passed through unscaled; Cooling and Heating
# are degree amounts rather than 0/1 flags.
categorical_features = ['WEEKDAY']
numerical_features = ['YEAR', 'MONTH', 'DAY', 'HOUR', 'TEMPERATURE']
binary_features = ['IS_PUBLIC_HOLIDAY', 'Cooling', 'Heating']

# Split features and target
X = sa_df[categorical_features + numerical_features + binary_features]
y = sa_df['TOTALDEMAND']

# Define preprocessing steps for different types of features.
# The output columns follow the order of this list: num, then cat, then bin.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # Standardize numerical features
        ('cat', OneHotEncoder(), categorical_features),  # One-hot encode categorical features
        ('bin', 'passthrough', binary_features)  # Include binary features as is
    ])

# Create a pipeline with preprocessing and linear regression model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split dataset into train and test sets (random 80/20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline (preprocessing + linear regression model) on training data
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)



Check Feature Importance

In [None]:
# Names must be listed in the same order in which the ColumnTransformer emits its
# output columns, i.e. the order of its `transformers` list: numerical, then the
# one-hot encoded categorical columns, then the passthrough columns. Listing the
# categorical names first would pair every coefficient with the wrong feature.
one_hot_names = list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
feature_names = numerical_features + one_hot_names + binary_features

# Get coefficients of the linear regression model
coefficients = pipeline.named_steps['regressor'].coef_

# Create a DataFrame to display feature importance, largest absolute coefficient first
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
feature_importance_df['Absolute Coefficient'] = feature_importance_df['Coefficient'].abs()  # Absolute values
feature_importance_df = feature_importance_df.sort_values(by='Absolute Coefficient', ascending=False)

# Display feature importance
print(feature_importance_df)

In [None]:
# Integer code for each season label
season_mapping = dict(Spring=0, Summer=1, Autumn=2, Winter=3)

# Translate the SEASON labels into their integer codes
sa_df['SEASON_INT'] = sa_df['SEASON'].map(season_mapping)

# Show label and code side by side for a quick check
print(sa_df[['SEASON', 'SEASON_INT']].head())


In [None]:
# dtype of every column in sa_df, printed under a heading line
print("Column types in sa_df:")
column_types = sa_df.dtypes
print(column_types)

# LSTM Model

In [None]:
# Fully connected network (Dense layers only) trained with an explicit validation set.
# NOTE(review): this cell uses X_scaled, y_scaled, Sequential, Dense and EarlyStopping,
# none of which are defined or imported before this point in the notebook; they only
# appear in later cells, so this cell fails on a fresh top-to-bottom run. Which feature
# set X_scaled is meant to hold here cannot be determined from the notebook - confirm.

# Split dataset into train, validation, and test sets (80% / 10% / 10%)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define the neural network model: two hidden ReLU layers and a single linear output
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Define early stopping to prevent overfitting: stop after 5 epochs without
# val_loss improvement and roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with validation data
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_validation, y_validation), callbacks=[early_stopping])

# Evaluate the model (the loss is MSE, computed on whatever scale y_scaled uses)
mse = model.evaluate(X_test, y_test)
print("Test MSE:", mse)


In [None]:
# In this attempt, I used INDEX (a number increasing from 1 upwards for each record - no date / time).
# The low accuracy it produced shows that specific times are important drivers.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error

# # Extract relevant features from DATETIME column
# sa_df['YEAR'] = sa_df['DATETIME'].dt.year
# sa_df['MONTH'] = sa_df['DATETIME'].dt.month
# sa_df['DAY'] = sa_df['DATETIME'].dt.day
# sa_df['HOUR'] = sa_df['DATETIME'].dt.hour

# Feature matrix (row counter, temperature and calendar flags) and demand target
index_feature_columns = ['INDEX', 'TEMPERATURE', 'IS_PUBLIC_HOLIDAY', 'Cooling', 'Heating', 'WEEKDAY']
X = sa_df[index_feature_columns].values
y = sa_df['TOTALDEMAND'].values

# Scale every feature to [0, 1] (MinMaxScaler's default range)
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

# Scale the target to [0, 1] as well; the scaler expects a 2-D column
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Define function to create time series dataset
def create_dataset(X, y, time_steps=1):
    """Slice two aligned sequences into sliding windows for a sequence model.

    For every start position i in range(len(X) - time_steps), the sample is
    X[i:i + time_steps] and its target is y[i + time_steps].

    Returns a pair of NumPy arrays (windows, targets). Both are empty when
    len(X) <= time_steps.
    """
    window_starts = range(len(X) - time_steps)
    Xs = [X[start:start + time_steps] for start in window_starts]
    ys = [y[start + time_steps] for start in window_starts]
    return np.array(Xs), np.array(ys)

# Number of consecutive rows fed to the LSTM for each prediction
TIME_STEPS = 7

# Create time series dataset: windows of TIME_STEPS rows, target = the following row
X_ts, y_ts = create_dataset(X_scaled, y_scaled, TIME_STEPS)

# Split dataset into train and test sets: first 80% of the windows for training,
# the remainder for testing (no shuffling, so row order is preserved)
split = int(0.8 * len(X_ts))
X_train, X_test = X_ts[:split], X_ts[split:]
y_train, y_test = y_ts[:split], y_ts[split:]

# Define LSTM model
model = Sequential([
    LSTM(units=50, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(units=1)
])

# Compile model
model.compile(optimizer='adam', loss='mean_squared_error')

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

# Evaluate model (loss is MSE on the scaled target)
mse = model.evaluate(X_test, y_test)
print("Test MSE:", mse)

# Make predictions and map both predictions and targets back to TOTALDEMAND units
y_pred_scaled = model.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_actual = scaler_y.inverse_transform(y_test)

# Calculate RMSE in the original TOTALDEMAND units. Previously the inverse-transformed
# predictions were computed but never used, and the RMSE was taken on the scaled
# values, which cannot be read as a demand error.
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
print("Test RMSE:", rmse)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions for the held-out set (still on the scale the model was trained on)
y_pred = model.predict(X_test)

# Error metrics of the predictions against y_test
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report, in the same order as before
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("R^2 Score:", r2)

# Report the results

# plotting the results shows a poor estimation.

In [None]:

import matplotlib.pyplot as plt
import numpy as np

# Make predictions on the test data
y_pred = model.predict(X_test)

# Inverse transform the scaled predicted and actual values to their original scale
y_pred_actual = scaler_y.inverse_transform(y_pred)
y_test_actual = scaler_y.inverse_transform(y_test)

# Get the total number of periods in the dataset
total_periods = len(y_test_actual)

# Randomly select up to 50 periods. A seeded generator makes the figure reproducible
# from run to run, and capping the sample size avoids a ValueError when the test set
# holds fewer than 50 rows.
sample_rng = np.random.default_rng(42)
random_indices = sample_rng.choice(total_periods, size=min(50, total_periods), replace=False)

# Extract the actual and predicted values for the randomly selected periods
random_actual = y_test_actual[random_indices]
random_predicted = y_pred_actual[random_indices]

# Plot actual vs predicted values for the randomly selected periods
plt.figure(figsize=(10, 6))
plt.plot(random_actual, label='Actual', marker='o', linestyle='-')
plt.plot(random_predicted, label='Predicted', marker='o', linestyle='-')
plt.title('Actual vs Predicted TOTALDEMAND for Randomly Selected Periods')
plt.xlabel('Time Step')
plt.ylabel('TOTALDEMAND')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()



#Choosing a random sample of 50 points to plot:

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Make predictions on the test data
y_pred = model.predict(X_test)

# Inverse transform the scaled predicted and actual values to their original scale
y_pred_actual = scaler_y.inverse_transform(y_pred)
y_test_actual = scaler_y.inverse_transform(y_test)

# Get the total number of periods in the dataset
total_periods = len(y_test_actual)

# Randomly select up to 50 periods. A seeded generator makes the figure reproducible
# from run to run, and capping the sample size avoids a ValueError when the test set
# holds fewer than 50 rows.
sample_rng = np.random.default_rng(42)
random_indices = sample_rng.choice(total_periods, size=min(50, total_periods), replace=False)

# Extract the actual and predicted values for the randomly selected periods
random_actual = y_test_actual[random_indices]
random_predicted = y_pred_actual[random_indices]

# Plot actual vs predicted values for the randomly selected periods
plt.figure(figsize=(10, 6))
plt.plot(random_actual, label='Actual', marker='o', linestyle='-')
plt.plot(random_predicted, label='Predicted', marker='o', linestyle='-')
plt.title('Actual vs Predicted TOTALDEMAND for Randomly Selected Periods')
plt.xlabel('Time Step')
plt.ylabel('TOTALDEMAND')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()



# Trying again with year, month, day

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# Feature matrix (calendar parts, temperature, holiday flag, degree amounts) and target
calendar_feature_columns = ['YEAR', 'MONTH', 'DAY', 'HOUR', 'TEMPERATURE', 'IS_PUBLIC_HOLIDAY', 'Cooling', 'Heating', 'WEEKDAY']
X = sa_df[calendar_feature_columns].values
y = sa_df['TOTALDEMAND'].values

# Scale every feature to [0, 1] (MinMaxScaler's default range)
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

# Scale the target to [0, 1]; the scaler expects a 2-D column
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Random 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Fully connected network built layer by layer: 64 -> 32 -> 1
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Adam optimiser, MSE loss
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop after 5 epochs without val_loss improvement and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train, holding out the last 10% of the training data for validation
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Loss (MSE on the scaled target) for the held-out test set
mse = model.evaluate(X_test, y_test)
print("Test MSE:", mse)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping

# Feature matrix (calendar parts, temperature, holiday flag, degree amounts) and target
calendar_feature_columns = ['YEAR', 'MONTH', 'DAY', 'HOUR', 'TEMPERATURE', 'IS_PUBLIC_HOLIDAY', 'Cooling', 'Heating', 'WEEKDAY']
X = sa_df[calendar_feature_columns].values
y = sa_df['TOTALDEMAND'].values

# Scale every feature to [0, 1] (MinMaxScaler's default range)
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

# Scale the target to [0, 1]; the scaler expects a 2-D column
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Random 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# LSTM layers expect (samples, time steps, features); each row becomes a
# sequence of length one by inserting a time-step axis.
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)

# Single LSTM layer followed by a one-unit output, built layer by layer
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=1))

# Adam optimiser, MSE loss
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop after 5 epochs without val_loss improvement and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train, holding out the last 10% of the training data for validation
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Loss (MSE on the scaled target) for the held-out test set
mse = model.evaluate(X_test, y_test)
print("Test MSE:", mse)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions for the held-out set (still on the scale the model was trained on)
y_pred = model.predict(X_test)

# Error metrics of the predictions against y_test
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report, in the same order as before
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("R^2 Score:", r2)

In [None]:
import matplotlib.pyplot as plt

# Predict on the test set and map predictions and targets back to TOTALDEMAND units
y_pred = model.predict(X_test)
y_pred_actual = scaler_y.inverse_transform(y_pred)
y_test_actual = scaler_y.inverse_transform(y_test)

# First 50 rows of the test set: actual values and the matching predictions
first_50_actual, first_50_predicted = y_test_actual[:50], y_pred_actual[:50]

# Overlay the two series on one figure
plt.figure(figsize=(10, 6))
for series, series_label in ((first_50_actual, 'Actual'), (first_50_predicted, 'Predicted')):
    plt.plot(series, label=series_label, marker='o', linestyle='-')
plt.title('Actual vs Predicted TOTALDEMAND for the First 50 Values')
plt.xlabel('Time Step')
plt.ylabel('TOTALDEMAND')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()




# Note above model is struggling to predict the outlier peaks and troughs



In [None]:
# X_test holds MinMax-scaled features, so reading column 1 directly prints fractions
# in [0, 1] rather than month numbers. Flatten the time-step axis, undo the scaling
# with scaler_X and round back to integers before listing the distinct months.
# NOTE(review): assumes X_test and scaler_X come from the single-step LSTM cell, where
# the array is (samples, 1, features) and MONTH is feature column 1 - confirm.
test_features_unscaled = scaler_X.inverse_transform(X_test.reshape(-1, X_test.shape[-1]))
unique_months = np.unique(np.rint(test_features_unscaled[:, 1]).astype(int))
print("Unique months in the test dataset:", unique_months)

# Randomly Select 50 Periods

In [None]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Inverse transform the scaled predicted and actual values to their original scale
y_pred_actual = scaler_y.inverse_transform(y_pred)
y_test_actual = scaler_y.inverse_transform(y_test)

# Get the total number of periods in the dataset
total_periods = len(y_test_actual)

# Randomly select up to 50 periods. A seeded generator makes the figure reproducible
# from run to run, and capping the sample size avoids a ValueError when the test set
# holds fewer than 50 rows.
sample_rng = np.random.default_rng(42)
random_indices = sample_rng.choice(total_periods, size=min(50, total_periods), replace=False)

# Extract the actual and predicted values for the randomly selected periods
random_actual = y_test_actual[random_indices]
random_predicted = y_pred_actual[random_indices]

# Plot actual vs predicted values for the randomly selected periods
plt.figure(figsize=(10, 6))
plt.plot(random_actual, label='Actual', marker='o', linestyle='-')
plt.plot(random_predicted, label='Predicted', marker='o', linestyle='-')
plt.title('Actual vs Predicted TOTALDEMAND for Randomly Selected Periods')
plt.xlabel('Time Step')
plt.ylabel('TOTALDEMAND')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
