# Near Term Energy Demand Forecasting

## Introduction

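In this project, we will forecast near-term energy demand (total electrical load) from historical energy and weather data. This introduction covers: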

- Project Overview

- Problem Statement

- Metrics


In [1]:
# Importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno 
import datetime as dt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Render figures directly in the notebook:
%matplotlib inline

# Render higher resolution images:
%config InlineBackend.figure_format = 'retina'

# Let pandas display up to 500 rows before truncating long outputs:
pd.set_option('display.max_rows', 500)

## 1. Data Exploration and Cleaning

In [2]:
# Load the raw energy and weather CSV files. In both files the first column
# is used as the index, and pandas is asked to parse that index as dates.
# NOTE(review): `df.info()` further down reports a plain `Index` rather than a
# `DatetimeIndex`, so the date parsing may not be taking effect - confirm.

data_energy = pd.read_csv(
    '../data/energy_dataset.csv',
    index_col=[0],
    parse_dates=True,
)
data_weather = pd.read_csv(
    '../data/weather_features.csv',
    index_col=[0],
    parse_dates=True,
)

In [3]:
# List the column labels available in the raw energy dataframe
data_energy.columns

Index(['generation biomass', 'generation fossil brown coal/lignite',
       'generation fossil coal-derived gas', 'generation fossil gas',
       'generation fossil hard coal', 'generation fossil oil',
       'generation fossil oil shale', 'generation fossil peat',
       'generation geothermal', 'generation hydro pumped storage aggregated',
       'generation hydro pumped storage consumption',
       'generation hydro run-of-river and poundage',
       'generation hydro water reservoir', 'generation marine',
       'generation nuclear', 'generation other', 'generation other renewable',
       'generation solar', 'generation waste', 'generation wind offshore',
       'generation wind onshore', 'forecast solar day ahead',
       'forecast wind offshore eday ahead', 'forecast wind onshore day ahead',
       'total load forecast', 'total load actual', 'price day ahead',
       'price actual'],
      dtype='object')

> We are interested only in forecasting the total demand. Hence, we will keep only the columns `total load forecast` and `total load actual` and remove the rest.

In [4]:
# Filtering target columns

# Keep only the forecast and actual total load. An explicit copy makes `df`
# independent of `data_energy`, so any later in-place edits to `df` neither
# raise pandas' SettingWithCopyWarning nor leave it ambiguous which frame
# is being modified.
df = data_energy[['total load forecast', 'total load actual']].copy()

### 1.1 Energy Dataset

In [5]:
# Preview the first 10 rows of the filtered load dataframe
df.head(10)

Unnamed: 0_level_0,total load forecast,total load actual
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01 00:00:00+01:00,26118.0,25385.0
2015-01-01 01:00:00+01:00,24934.0,24382.0
2015-01-01 02:00:00+01:00,23515.0,22734.0
2015-01-01 03:00:00+01:00,22642.0,21286.0
2015-01-01 04:00:00+01:00,21785.0,20264.0
2015-01-01 05:00:00+01:00,21441.0,19905.0
2015-01-01 06:00:00+01:00,21285.0,20010.0
2015-01-01 07:00:00+01:00,21545.0,20377.0
2015-01-01 08:00:00+01:00,21443.0,20094.0
2015-01-01 09:00:00+01:00,21560.0,20637.0


In [6]:
# Checking the dataframe: index type, column dtypes and non-null counts

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35064 entries, 2015-01-01 00:00:00+01:00 to 2018-12-31 23:00:00+01:00
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   total load forecast  35064 non-null  float64
 1   total load actual    35028 non-null  float64
dtypes: float64(2)
memory usage: 821.8+ KB


> Both columns, `total load forecast` and `total load actual`, are of type `float64`, so no datatype conversions are necessary for them. Note, however, that the index is reported as a plain `Index` rather than a `DatetimeIndex`.

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for both load columns
df.describe()

Unnamed: 0,total load forecast,total load actual
count,35064.0,35028.0
mean,28712.129962,28696.939905
std,4594.100854,4574.98795
min,18105.0,18041.0
25%,24793.75,24807.75
50%,28906.0,28901.0
75%,32263.25,32192.0
max,41390.0,41015.0


In [8]:
# Checking for missing values

def find_missing_cols_perc(df, perc):
    '''
    Print the columns of a dataframe whose share of missing values exceeds a threshold.

    Arguments:
    df - dataframe to inspect
    perc - threshold on the fraction of missing values per column; it is
           compared against the per-column mean of `isnull()`, i.e. on a
           0-1 scale (0.5 means 50%)

    Prints:
    Index of the column labels whose missing fraction is strictly greater than 'perc'

    Returns:
    None - the result is only printed
    '''
    # Mean of the boolean null mask = fraction of missing entries per column.
    missing_fraction = df.isnull().mean()
    exceeds_threshold = missing_fraction > perc
    print(df.columns[exceeds_threshold])

In [9]:
# Report the columns with any missing values, then the columns that are more
# than half missing. The calls are separate statements rather than a tuple
# expression: the function only prints and returns None, so a tuple would
# make the cell echo a meaningless `(None, None)` as its output.
find_missing_cols_perc(df, 0)
find_missing_cols_perc(df, 0.5)

Index(['total load actual'], dtype='object')
Index([], dtype='object')


(None, None)

> There are a few missing values (36 of 35,064 rows) in the column `total load actual`.

### 1.2 Weather Dataset

In [19]:
# Preview the first 5 rows of the raw weather dataframe
data_weather.head(5)

Unnamed: 0_level_0,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
dt_iso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015-01-01 00:00:00+01:00,Valencia,270.475,270.475,270.475,1001,77,1,62,0.0,0.0,0.0,0,800,clear,sky is clear,01n
2015-01-01 01:00:00+01:00,Valencia,270.475,270.475,270.475,1001,77,1,62,0.0,0.0,0.0,0,800,clear,sky is clear,01n
2015-01-01 02:00:00+01:00,Valencia,269.686,269.686,269.686,1002,78,0,23,0.0,0.0,0.0,0,800,clear,sky is clear,01n
2015-01-01 03:00:00+01:00,Valencia,269.686,269.686,269.686,1002,78,0,23,0.0,0.0,0.0,0,800,clear,sky is clear,01n
2015-01-01 04:00:00+01:00,Valencia,269.686,269.686,269.686,1002,78,0,23,0.0,0.0,0.0,0,800,clear,sky is clear,01n
