## 7.2HD Task

### Question: 1


In [8]:
import pandas as pd
#extracting data
main_data = pd.read_csv('dataset_city.csv')

print(main_data.head())

        DateTime  Temperature  Humidity  Wind Speed  general diffuse flows  \
0  1/1/2017 0:00        6.559      73.8       0.083                  0.051   
1  1/1/2017 0:10        6.414      74.5       0.083                  0.070   
2  1/1/2017 0:20        6.313      74.5       0.080                  0.062   
3  1/1/2017 0:30        6.121      75.0       0.083                  0.091   
4  1/1/2017 0:40        5.921      75.7       0.081                  0.048   

   diffuse flows  Zone 1 Power Consumption  Zone 2  Power Consumption  \
0          0.119               34055.69620                16128.87538   
1          0.085               29814.68354                19375.07599   
2          0.100               29128.10127                19006.68693   
3          0.096               28228.86076                18361.09422   
4          0.085               27335.69620                17872.34043   

   Zone 3  Power Consumption  
0                20240.96386  
1                20131.08434  

In [26]:
# Analysing the dataframe
print(main_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52416 entries, 0 to 52415
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   DateTime                   52416 non-null  datetime64[ns]
 1   Temperature                52416 non-null  float64       
 2   Humidity                   52416 non-null  float64       
 3   Wind Speed                 52416 non-null  float64       
 4   general diffuse flows      52416 non-null  float64       
 5   diffuse flows              52416 non-null  float64       
 6   Zone 1 Power Consumption   52416 non-null  float64       
 7   Zone 2  Power Consumption  52416 non-null  float64       
 8   Zone 3  Power Consumption  52416 non-null  float64       
 9   UnixTimestamp              52416 non-null  float64       
 10  aggregated_consumption     52416 non-null  float64       
dtypes: datetime64[ns](1), float64(10)
memory usage: 4.4 MB
None


In [9]:
from datetime import datetime

#creating a unix timestamp value for feeding into models.
main_data['DateTime'] = pd.to_datetime(main_data['DateTime'], format='%m/%d/%Y %H:%M')
main_data['UnixTimestamp'] = main_data['DateTime'].apply(lambda x: x.timestamp())

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import tensorflow as tf



# Extract features and targets
features = ['UnixTimestamp', 'Temperature', 'Humidity', 'Wind Speed', 'diffuse flows', 'general diffuse flows']
targets = ['Zone 1 Power Consumption', 'Zone 2  Power Consumption', 'Zone 3  Power Consumption']
main_data['aggregated_consumption'] = (main_data['Zone 1 Power Consumption'] + main_data['Zone 2  Power Consumption'] + main_data['Zone 3  Power Consumption'])

X = main_data[features]
y_quads = main_data[targets[0]]
y_smir = main_data[targets[1]]
y_boussafou = main_data[targets[2]]
y_aggrtd = main_data['aggregated_consumption']

# Checking the dimensions of the features and targets
print("Input features shape:", X.shape)
print("Quads target shape:", y_quads.shape)
print("Smir target shape:", y_smir.shape)
print("Boussafou target shape:", y_boussafou.shape)
print("Aggregate target shape:", y_aggrtd.shape)

Input features shape: (52416, 6)
Quads target shape: (52416,)
Smir target shape: (52416,)
Boussafou target shape: (52416,)


In [13]:
#Checking for missing values in features and targets
print("Missing values in input features:", X.isnull().sum().sum())
print("Missing values in Quads target:", y_quads.isnull().sum())
print("Missing values in Smir target:", y_smir.isnull().sum())
print("Missing values in Boussafou target:", y_boussafou.isnull().sum())
print("Missing values in Aggregated target:", y_aggrtd.isnull().sum())

Missing values in input features: 0
Missing values in Quads target: 0
Missing values in Smir target: 0
Missing values in Boussafou target: 0
Missing values in Aggregated target: 0


### Table II implementation

In [14]:
# Split into train and test sets
X_train, X_test, y_quads_train, y_quads_test, y_smir_train, y_smir_test, y_boussafou_train, y_boussafou_test, y_aggrtd_train, y_aggrtd_test = train_test_split(X, y_quads, y_smir, y_boussafou,y_aggrtd, test_size=0.25, random_state=42)

# Normalize data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


def create_ffnn(neurons=10, activation='selu'):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(neurons, activation=activation, input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(1)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(max_depth=None, min_samples_split=10, min_samples_leaf=10, max_features=9),
    'Random Forest': RandomForestRegressor(n_estimators=30, max_features=7, min_samples_split=2, min_samples_leaf=1),
    'SVR': SVR(kernel='rbf', C=10, gamma=0.01),
    'FFNN': create_ffnn(neurons=10, activation='selu')
}

# Train and evaluate models for each zone
zones = ['Quads', 'Smir', 'Boussafou', 'Aggregated']
results = []

for zone in zones:
    if zone == 'Quads':
        y_train = y_quads_train
        y_test = y_quads_test
    elif zone == 'Smir':
        y_train = y_smir_train
        y_test = y_smir_test
    elif zone == 'Boussafou':
        y_train = y_boussafou_train
        y_test = y_boussafou_test
    else:  # Aggregated
        y_train = y_aggrtd_train
        y_test = y_aggrtd_test

    for name, model in models.items():
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)

        results.append([zone, name, train_rmse, test_rmse, train_mae, test_mae])



In [27]:
from tabulate import tabulate
headers = ["Zone", "Model", "RMSE (Train)", "RMSE (Test)", "MAE (Train)", "MAE (Test)"]
print(tabulate(results, headers, tablefmt="grid"))

+------------+-------------------+----------------+---------------+---------------+--------------+
| Zone       | Model             |   RMSE (Train) |   RMSE (Test) |   MAE (Train) |   MAE (Test) |
| Quads      | Linear Regression |       6281.93  |       6281.03 |      5154.09  |     5165.6   |
+------------+-------------------+----------------+---------------+---------------+--------------+
| Quads      | Decision Tree     |       2689.88  |       3694.92 |      1638.07  |     2233.77  |
+------------+-------------------+----------------+---------------+---------------+--------------+
| Quads      | Random Forest     |        952.923 |       2397.09 |       569.335 |     1481.15  |
+------------+-------------------+----------------+---------------+---------------+--------------+
| Quads      | SVR               |       7041.52  |       7058.75 |      5829.59  |     5854.35  |
+------------+-------------------+----------------+---------------+---------------+--------------+
| Quads   

### Table IV implementation

In [19]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam, SGD
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from tabulate import tabulate

# Load the dataset
data = pd.read_csv('dataset_city.csv')

# Convert 'DateTime' column to datetime format and set it as the index
data['DateTime'] = pd.to_datetime(data['DateTime'])
data = data.set_index('DateTime')

# Resample the data to 1-hour intervals
data = data.resample('H').mean()

# Split the dataset into features and target variables for each zone and aggregated distribution
X = data[['Temperature', 'Humidity', 'Wind Speed', 'general diffuse flows', 'diffuse flows']]
# 'Zone 1 Power Consumption', 'Zone 2  Power Consumption', 'Zone 3  Power Consumption'
y_zones = [data['Zone 1 Power Consumption'] ,data['Zone 2  Power Consumption'], data['Zone 3  Power Consumption']]
y_aggregated = data[['Zone 1 Power Consumption', 'Zone 2  Power Consumption', 'Zone 3  Power Consumption']].sum(axis=1)

# Apply Min-Max scaling to the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training (75%) and testing (25%) sets for each zone and aggregated distribution
train_test_data = [train_test_split(X_scaled, y, test_size=0.25, random_state=42) for y in y_zones + [y_aggregated]]

# Define the hyperparameter values for hourly predictions
param_values_hourly = {
    'Decision Tree': [
        {'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 5},
        {'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 7},
        {'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 9},
        {'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 9}
    ],
    'Random Forest': [
        {'n_estimators': 50, 'max_features': 3, 'min_samples_split': 2, 'min_samples_leaf': 1},
        {'n_estimators': 10, 'max_features': 7, 'min_samples_split': 3, 'min_samples_leaf': 1},
        {'n_estimators': 10, 'max_features': 7, 'min_samples_split': 3, 'min_samples_leaf': 10},
        {'n_estimators': 100, 'max_features': 5, 'min_samples_split': 2, 'min_samples_leaf': 1}
    ],
    'SVR': [
        {'kernel': 'rbf', 'C': 10, 'gamma': 0.01},
        {'kernel': 'rbf', 'C': 1, 'gamma': 0.01},
        {'kernel': 'rbf', 'C': 1000, 'gamma': 0.01},
        {'kernel': 'rbf', 'C': 1, 'gamma': 0.01}
    ],
    'FFNN': [
        {'batch_size': 100, 'epochs': 100, 'optimizer': SGD(learning_rate=0.001), 'neurons': 25, 'activation': 'relu'},
        {'batch_size': 350, 'epochs': 100, 'optimizer': Adam(learning_rate=0.001), 'neurons': 4, 'activation': 'selu'},
        {'batch_size': 250, 'epochs': 100, 'optimizer': Adam(learning_rate=0.001), 'neurons': 8, 'activation': 'selu'},
        {'batch_size': 250, 'epochs': 100, 'optimizer': Adam(learning_rate=0.001), 'neurons': 4, 'activation': 'selu'}
    ]
}

# Define the machine learning models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'SVR': SVR()
}

# Train and evaluate the models for each zone and aggregated distribution
table_data = []
headers = ["Model", "Zone", "RMSE (Train)", "RMSE (Test)", "MAE (Train)", "MAE (Test)"]
zones = ['Zone 1', 'Zone 2', 'Zone 3', 'Aggregated']

for model_name, model in models.items():
    for i, (X_train, X_test, y_train, y_test) in enumerate(train_test_data):
        if model_name in param_values_hourly:
            model.set_params(**param_values_hourly[model_name][i])

        model.fit(X_train, y_train)

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        train_mae = mean_absolute_error(y_train, y_pred_train)
        test_mae = mean_absolute_error(y_test, y_pred_test)

        table_data.append([model_name, zones[i], train_rmse, test_rmse, train_mae, test_mae])

# Train and evaluate the FFNN model for each zone and aggregated distribution
for i, (X_train, X_test, y_train, y_test) in enumerate(train_test_data):
    params = param_values_hourly['FFNN'][i]
    ffnn_model = Sequential([
        Input(shape=(X_scaled.shape[1],)),
        Dense(params['neurons'], activation=params['activation']),
        Dense(1, activation='linear')
    ])
    ffnn_model.compile(optimizer=params['optimizer'], loss='mean_squared_error')
    ffnn_model.fit(X_train, y_train, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0)

    y_pred_train = ffnn_model.predict(X_train)
    y_pred_test = ffnn_model.predict(X_test)

    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    train_mae = mean_absolute_error(y_train, y_pred_train)
    test_mae = mean_absolute_error(y_test, y_pred_test)

    table_data.append(['FFNN', zones[i], train_rmse, test_rmse, train_mae, test_mae])

print("1-Hour Prediction Results:")
print(tabulate(table_data, headers, tablefmt="grid"))

1-Hour Prediction Results:
+-------------------+------------+----------------+---------------+---------------+--------------+
| Model             | Zone       |   RMSE (Train) |   RMSE (Test) |   MAE (Train) |   MAE (Test) |
| Linear Regression | Zone 1     |        6272.76 |       6298.91 |       5165.35 |      5180.86 |
+-------------------+------------+----------------+---------------+---------------+--------------+
| Linear Regression | Zone 2     |        4698.17 |       4691.52 |       3796.74 |      3792.48 |
+-------------------+------------+----------------+---------------+---------------+--------------+
| Linear Regression | Zone 3     |        5553.23 |       5629.11 |       4425.35 |      4511.24 |
+-------------------+------------+----------------+---------------+---------------+--------------+
| Linear Regression | Aggregated |       14550.9  |      14736.5  |      11900.1  |     12055.7  |
+-------------------+------------+----------------+---------------+---------------

### Question : 2

In [25]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from tabulate import tabulate

# Load the dataset
new_data = pd.read_csv('dataset_city.csv')

# Convert 'DateTime' column to datetime format and set it as the index
new_data['DateTime'] = pd.to_datetime(new_data['DateTime'])
new_data = new_data.set_index('DateTime')

# Resample the new_data to 1-hour intervals
new_data = new_data.resample('H').mean()

# Feature engineering
new_data['hour'] = new_data.index.hour
new_data['day_of_week'] = new_data.index.dayofweek
new_data['month'] = new_data.index.month

#created new compuned data to form new features
new_data['Temperature_Humidity'] = new_data['Temperature'] * new_data['Humidity']
new_data['WindSpeed_GeneralDiffuseFlows'] = new_data['Wind Speed'] * new_data['general diffuse flows']

# Split the dataset into features and target variables for each zone and aggregated distribution
X = new_data[['Temperature', 'Humidity', 'Wind Speed', 'general diffuse flows', 'diffuse flows',
          'hour', 'day_of_week', 'month', 'Temperature_Humidity', 'WindSpeed_GeneralDiffuseFlows']]
y_zones = [new_data['Zone 1 Power Consumption'] ,new_data['Zone 2  Power Consumption'], new_data['Zone 3  Power Consumption']]
y_aggregated = new_data[['Zone 1 Power Consumption', 'Zone 2  Power Consumption', 'Zone 3  Power Consumption']].sum(axis=1)

# Apply Min-Max scaling to the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training (75%) and testing (25%) sets for each zone and aggregated distribution
train_test_data = [train_test_split(X_scaled, y, test_size=0.25, random_state=42) for y in y_zones + [y_aggregated]]

# Define the base models and their hyperparameters
base_models = [
    ('RF', RandomForestRegressor(n_estimators=100, max_depth=None, min_samples_split=5, min_samples_leaf=2)),
    ('XGB', XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, subsample=0.7)),
    ('SVR', SVR(C=1, gamma=0.1))
]

# Train and evaluate the ensemble model for each zone and aggregated distribution
table_data = []
headers = ["Zone", "RMSE (Train)", "RMSE (Test)", "MAE (Train)", "MAE (Test)"]
zones = ['Zone 1', 'Zone 2', 'Zone 3', 'Aggregated']

for i, (X_train, X_test, y_train, y_test) in enumerate(train_test_data):
    # Create the stacking ensemble
    ensemble = []

    # Train the base models
    for name, model in base_models:
        model.fit(X_train, y_train)
        ensemble.append((name, model))

    # Make predictions using the base models
    base_predictions = []
    for name, model in ensemble:
        base_predictions.append(model.predict(X_test))

    # Stack the base model predictions and train the meta-model
    stacked_predictions = np.column_stack(base_predictions)
    meta_model = LinearRegression()
    meta_model.fit(stacked_predictions, y_test)

    # Make predictions using the ensemble model
    ensemble_predictions_train = []
    ensemble_predictions_test = []
    for name, model in ensemble:
        ensemble_predictions_train.append(model.predict(X_train))
        ensemble_predictions_test.append(model.predict(X_test))

    ensemble_predictions_train = np.column_stack(ensemble_predictions_train)
    ensemble_predictions_test = np.column_stack(ensemble_predictions_test)

    ensemble_predictions_train = meta_model.predict(ensemble_predictions_train)
    ensemble_predictions_test = meta_model.predict(ensemble_predictions_test)

    # Evaluate the ensemble model
    train_rmse = np.sqrt(mean_squared_error(y_train, ensemble_predictions_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, ensemble_predictions_test))
    train_mae = mean_absolute_error(y_train, ensemble_predictions_train)
    test_mae = mean_absolute_error(y_test, ensemble_predictions_test)

    table_data.append([zones[i], train_rmse, test_rmse, train_mae, test_mae])

print("Ensemble Model Results:")
print(tabulate(table_data, headers, tablefmt="grid"))

Ensemble Model Results:
+------------+----------------+---------------+---------------+--------------+
| Zone       |   RMSE (Train) |   RMSE (Test) |   MAE (Train) |   MAE (Test) |
| Zone 1     |       1022.21  |       1484.98 |       725.768 |     1054.87  |
+------------+----------------+---------------+---------------+--------------+
| Zone 2     |        703.786 |       1105.98 |       499.876 |      801.215 |
+------------+----------------+---------------+---------------+--------------+
| Zone 3     |        746.257 |       1206.62 |       511.247 |      785.297 |
+------------+----------------+---------------+---------------+--------------+
| Aggregated |       2098.84  |       3191.03 |      1475.93  |     2193.77  |
+------------+----------------+---------------+---------------+--------------+
