### Initial data processing 

In [1]:
import pandas as pd

#### Unzip files

In [2]:
# Joint forecastdemand_nsw files into a single file.
#data/NSW/forecastdemand_nsw.csv.zip.partaa
!cat ../data/NSW/forecastdemand_nsw.csv.zip.parta* > ../data/NSW/forecastdemand_nsw.csv.zip

In [3]:
# Unzip the data files into the data folder replacing the existing files if they exist.
!unzip ../data/NSW/forecastdemand_nsw.csv.zip -d ../data 
!unzip ../data/NSW/temperature_nsw.csv.zip -d ../data
!unzip ../data/NSW/totaldemand_nsw.csv.zip -d ../data

Archive:  ../data/NSW/forecastdemand_nsw.csv.zip
  inflating: ../data/forecastdemand_nsw.csv  
Archive:  ../data/NSW/temperature_nsw.csv.zip
  inflating: ../data/temperature_nsw.csv  
Archive:  ../data/NSW/totaldemand_nsw.csv.zip
  inflating: ../data/totaldemand_nsw.csv  


#### Forecast Demand Data

In [4]:
#read csv files into pandas dataframes
forecastdemand_nsw = pd.read_csv('../data/forecastdemand_nsw.csv')

In [8]:
forecastdemand_nsw.shape

(10906019, 6)

In [7]:
forecastdemand_nsw.head()

Unnamed: 0,PREDISPATCHSEQNO,REGIONID,PERIODID,FORECASTDEMAND,LASTCHANGED,DATETIME
0,2009123018,NSW1,71,7832.04,2009-12-30 12:31:49,2010-01-01 00:00:00
1,2009123019,NSW1,70,7832.04,2009-12-30 13:01:43,2010-01-01 00:00:00
2,2009123020,NSW1,69,7832.03,2009-12-30 13:31:36,2010-01-01 00:00:00
3,2009123021,NSW1,68,7832.03,2009-12-30 14:01:44,2010-01-01 00:00:00
4,2009123022,NSW1,67,7830.96,2009-12-30 14:31:35,2010-01-01 00:00:00


In [9]:
#count unique values in the REGIONID 
forecastdemand_nsw['REGIONID'].value_counts()

REGIONID
NSW1    10906019
Name: count, dtype: int64

In [10]:
forecastdemand_nsw['PERIODID'].value_counts()

PERIODID
32    196505
17    196505
30    196505
29    196505
28    196505
       ...  
75     20467
76     16373
77     12279
78      8187
79      4094
Name: count, Length: 79, dtype: int64

In [11]:
#min and max values in the forecastdemand_nsw dataframe DATETIME column 
forecastdemand_nsw['DATETIME'].min(), forecastdemand_nsw['DATETIME'].max()


('2010-01-01 00:00:00', '2021-03-18 00:00:00')

In [12]:
#show summary statistics
forecastdemand_nsw.describe()

Unnamed: 0,PREDISPATCHSEQNO,PERIODID,FORECASTDEMAND
count,10906020.0,10906020.0,10906020.0
mean,2015176000.0,29.97893,8066.719
std,3234616.0,18.69682,1315.959
min,2009123000.0,1.0,4422.46
25%,2012102000.0,14.0,7077.97
50%,2015081000.0,28.0,8007.55
75%,2018053000.0,43.0,8925.61
max,2021032000.0,79.0,14736.66


In [13]:
#data types of the forecastdemand_nsw dataframe
forecastdemand_nsw.dtypes

PREDISPATCHSEQNO      int64
REGIONID             object
PERIODID              int64
FORECASTDEMAND      float64
LASTCHANGED          object
DATETIME             object
dtype: object

In [14]:
#check for missing values
forecastdemand_nsw.isnull().sum()

PREDISPATCHSEQNO    0
REGIONID            0
PERIODID            0
FORECASTDEMAND      0
LASTCHANGED         0
DATETIME            0
dtype: int64

In [15]:
#change the data type of the DATETIME column to datetime
forecastdemand_nsw['DATETIME'] = pd.to_datetime(forecastdemand_nsw['DATETIME'])


In [16]:
forecastdemand_nsw.dtypes

PREDISPATCHSEQNO             int64
REGIONID                    object
PERIODID                     int64
FORECASTDEMAND             float64
LASTCHANGED                 object
DATETIME            datetime64[ns]
dtype: object

#### Total Demand NSW

In [18]:
totaldemand_nsw = pd.read_csv('../data/totaldemand_nsw.csv')

In [19]:
totaldemand_nsw.shape

(196513, 3)

In [20]:
#check for missing values
totaldemand_nsw.isnull().sum()

DATETIME       0
TOTALDEMAND    0
REGIONID       0
dtype: int64

In [21]:
#summary statistics of the totaldemand_nsw dataframe
totaldemand_nsw.describe()

Unnamed: 0,TOTALDEMAND
count,196513.0
mean,8113.145859
std,1299.532774
min,5074.63
25%,7150.07
50%,8053.23
75%,8958.55
max,14579.86


In [22]:
totaldemand_nsw.dtypes

DATETIME        object
TOTALDEMAND    float64
REGIONID        object
dtype: object

In [23]:
#min and max values in the totaldemand_nsw dataframe DATETIME column
totaldemand_nsw['DATETIME'].min(), totaldemand_nsw['DATETIME'].max()

('1/1/2010 0:00', '9/9/2020 9:30')

In [24]:
#min and max values in the totaldemand_nsw dataframe TOTALDEMAND column
totaldemand_nsw['TOTALDEMAND'].min(), totaldemand_nsw['TOTALDEMAND'].max()

(5074.63, 14579.86)

In [25]:
#totaldemand_nsw dataframe shape
totaldemand_nsw.shape

(196513, 3)

In [32]:
#10 max values in the totaldemand_nsw dataframe TOTALDEMAND column
totaldemand_nsw.nlargest(10, 'TOTALDEMAND')

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID
19041,1/2/2011 16:30,14579.86,NSW1
19040,1/2/2011 16:00,14525.73,NSW1
19136,3/2/2011 16:00,14449.88,NSW1
19042,1/2/2011 17:00,14440.04,NSW1
19088,2/2/2011 16:00,14417.03,NSW1
19135,3/2/2011 15:30,14408.45,NSW1
19087,2/2/2011 15:30,14403.24,NSW1
19039,1/2/2011 15:30,14337.52,NSW1
19137,3/2/2011 16:30,14333.12,NSW1
19138,3/2/2011 17:00,14315.54,NSW1


In [33]:
#10 min values in the TOTALDEMAND
totaldemand_nsw.nsmallest(10, 'TOTALDEMAND')

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID
52328,26/12/2012 4:00,5074.63,NSW1
52327,26/12/2012 3:30,5088.73,NSW1
69847,26/12/2013 3:30,5113.03,NSW1
69848,26/12/2013 4:00,5124.08,NSW1
87368,26/12/2014 4:00,5138.11,NSW1
52326,26/12/2012 3:00,5141.31,NSW1
87367,26/12/2014 3:30,5143.15,NSW1
52329,26/12/2012 4:30,5147.59,NSW1
69846,26/12/2013 3:00,5152.93,NSW1
69849,26/12/2013 4:30,5169.8,NSW1


### Temperature NSW Data

In [26]:
#read csv files into pandas dataframes for temperature and totaldemand
temperature_nsw = pd.read_csv('../data/temperature_nsw.csv')

In [27]:
#temperature_nsw dataframe shape
temperature_nsw.shape

(220326, 3)

In [28]:
#check for missing values
temperature_nsw.isnull().sum()

LOCATION       0
DATETIME       0
TEMPERATURE    0
dtype: int64

In [29]:
#summary statistics of the temperature_nsw dataframe
temperature_nsw.describe()

Unnamed: 0,TEMPERATURE
count,220326.0
mean,17.418827
std,5.849763
min,-1.3
25%,13.4
50%,17.7
75%,21.3
max,44.7


In [30]:
#highest 10 temperature in the temperature_nsw dataframe
temperature_nsw.nlargest(10, 'TEMPERATURE')

Unnamed: 0,LOCATION,DATETIME,TEMPERATURE
196404,Bankstown,4/1/2020 13:00,44.7
196405,Bankstown,4/1/2020 13:30,44.6
59193,Bankstown,18/1/2013 13:00,44.4
196403,Bankstown,4/1/2020 12:30,44.1
157349,Bankstown,7/1/2018 13:00,43.9
196402,Bankstown,4/1/2020 12:00,43.9
59191,Bankstown,18/1/2013 12:28,43.6
59190,Bankstown,18/1/2013 12:09,43.5
59192,Bankstown,18/1/2013 12:30,43.5
59194,Bankstown,18/1/2013 13:09,43.5


In [34]:
#lowest 10 temperature in the temperature_nsw dataframe
temperature_nsw.nsmallest(10, 'TEMPERATURE')

Unnamed: 0,LOCATION,DATETIME,TEMPERATURE
167568,Bankstown,16/7/2018 6:30,-1.3
167567,Bankstown,16/7/2018 6:00,-1.2
167564,Bankstown,16/7/2018 4:30,-0.9
167565,Bankstown,16/7/2018 5:00,-0.7
167566,Bankstown,16/7/2018 5:30,-0.7
167569,Bankstown,16/7/2018 7:00,-0.6
9401,Bankstown,30/6/2010 5:00,-0.5
167560,Bankstown,16/7/2018 2:30,-0.5
9402,Bankstown,30/6/2010 5:30,-0.4
167562,Bankstown,16/7/2018 3:30,-0.4


#### Data Wrangling

In [35]:
#copy totaldemand_nsw to a new dataframe
totaldemand_nsw_copy = totaldemand_nsw.copy()
#remove REGIONID column from totaldemand_nsw_copy dataframe
totaldemand_nsw_copy.drop('REGIONID', axis=1, inplace=True)

In [36]:
totaldemand_nsw_copy.head()

Unnamed: 0,DATETIME,TOTALDEMAND
0,1/1/2010 0:00,8038.0
1,1/1/2010 0:30,7809.31
2,1/1/2010 1:00,7483.69
3,1/1/2010 1:30,7117.23
4,1/1/2010 2:00,6812.03


In [37]:
#create a new DATE column with only date from DATETIME table datetime format is YYYY-MM-DD HH:MM:SS
totaldemand_nsw_copy['DATE'] = totaldemand_nsw_copy['DATETIME'].str.split(' ').str[0]

In [38]:
#convert datetime format to %m/%d/%Y %H:%M 
totaldemand_nsw_copy['DATETIME'] = pd.to_datetime(totaldemand_nsw_copy['DATETIME'], format='%d/%m/%Y %H:%M')

In [39]:
#convert DATE column to datetime data type
totaldemand_nsw_copy['DATE'] = pd.to_datetime(totaldemand_nsw_copy['DATE'], format='%d/%m/%Y')

In [40]:
#copy year from DATE column to a new column
totaldemand_nsw_copy['YEAR'] = totaldemand_nsw_copy['DATE'].dt.year

In [41]:
totaldemand_nsw_copy.dtypes

DATETIME       datetime64[ns]
TOTALDEMAND           float64
DATE           datetime64[ns]
YEAR                    int32
dtype: object

#### Combine dataset

In [42]:
#read totaldemand_nsw csv file
totaldemand = pd.read_csv('../data/totaldemand_nsw.csv')
totaldemand.head()

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID
0,1/1/2010 0:00,8038.0,NSW1
1,1/1/2010 0:30,7809.31,NSW1
2,1/1/2010 1:00,7483.69,NSW1
3,1/1/2010 1:30,7117.23,NSW1
4,1/1/2010 2:00,6812.03,NSW1


In [43]:
#drop REGIONID column
totaldemand = totaldemand.drop(columns=['REGIONID']) 
totaldemand.head()

Unnamed: 0,DATETIME,TOTALDEMAND
0,1/1/2010 0:00,8038.0
1,1/1/2010 0:30,7809.31
2,1/1/2010 1:00,7483.69
3,1/1/2010 1:30,7117.23
4,1/1/2010 2:00,6812.03


In [44]:
#dimension of the data
totaldemand.shape

(196513, 2)

In [45]:
#check for missing values
totaldemand.isnull().sum()

DATETIME       0
TOTALDEMAND    0
dtype: int64

In [46]:
#check for duplicates
totaldemand_duplicate = totaldemand[totaldemand.duplicated(subset='DATETIME')]
totaldemand_duplicate.shape

(0, 2)

### Temperature Data 

In [47]:
#read temperature_nsw csv file
temperature = pd.read_csv('../data/temperature_nsw.csv')
temperature.head()

Unnamed: 0,LOCATION,DATETIME,TEMPERATURE
0,Bankstown,1/1/2010 0:00,23.1
1,Bankstown,1/1/2010 0:01,23.1
2,Bankstown,1/1/2010 0:30,22.9
3,Bankstown,1/1/2010 0:50,22.7
4,Bankstown,1/1/2010 1:00,22.6


In [48]:
#drop LOCATION column
temperature = temperature.drop(columns=['LOCATION']) 
temperature.head()


Unnamed: 0,DATETIME,TEMPERATURE
0,1/1/2010 0:00,23.1
1,1/1/2010 0:01,23.1
2,1/1/2010 0:30,22.9
3,1/1/2010 0:50,22.7
4,1/1/2010 1:00,22.6


In [49]:
#dimension of the data
temperature.shape

(220326, 2)

In [50]:
#check for missing values
print(totaldemand.isnull().sum())
print(temperature.isnull().sum())

DATETIME       0
TOTALDEMAND    0
dtype: int64
DATETIME       0
TEMPERATURE    0
dtype: int64


In [51]:
#check for duplicates
temp_duplicate = temperature[temperature.duplicated(subset='DATETIME')]
temp_duplicate.shape

(13, 2)

In [52]:
temp_duplicate

Unnamed: 0,DATETIME,TEMPERATURE
19006,1/1/2011 0:00,21.0
34282,10/10/2011 10:30,18.9
34299,10/10/2011 18:30,16.1
34302,10/10/2011 19:30,15.5
38655,1/1/2012 0:00,15.4
58293,1/1/2013 0:00,21.0
78276,1/1/2014 0:00,20.4
97917,1/1/2015 0:00,20.9
117699,1/1/2016 0:00,16.9
137200,1/1/2017 0:00,22.6


In [53]:
#show all the duplicates rows
temperature[temperature.duplicated(subset='DATETIME', keep=False)]


Unnamed: 0,DATETIME,TEMPERATURE
19005,1/1/2011 0:00,21.0
19006,1/1/2011 0:00,21.0
34281,10/10/2011 10:30,18.9
34282,10/10/2011 10:30,18.9
34298,10/10/2011 18:30,16.1
34299,10/10/2011 18:30,16.1
34301,10/10/2011 19:30,15.5
34302,10/10/2011 19:30,15.5
38654,1/1/2012 0:00,15.4
38655,1/1/2012 0:00,15.4


In [54]:
#count the number of duplicates
temperature.duplicated(subset='DATETIME').sum()

13

In [55]:
temperature.shape

(220326, 2)

In [56]:
#drop duplicates
temperature = temperature.drop_duplicates(subset='DATETIME')
temperature.shape

(220313, 2)

### Merge temperature and demand 

In [57]:
#join the two dataframes on the DATETIME column
data = pd.merge(totaldemand, temperature, on='DATETIME')
data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE
0,1/1/2010 0:00,8038.0,23.1
1,1/1/2010 0:30,7809.31,22.9
2,1/1/2010 1:00,7483.69,22.6
3,1/1/2010 1:30,7117.23,22.5
4,1/1/2010 2:00,6812.03,22.5


In [58]:
data.shape

(195934, 3)

In [59]:
data.isna().sum()

DATETIME       0
TOTALDEMAND    0
TEMPERATURE    0
dtype: int64

In [60]:
#check for duplicates
data_duplicate = data[data.duplicated(subset='DATETIME')]
data_duplicate.shape


(0, 3)

In [61]:
#create a new column 'SEASON' based on the month
data['DATETIME'] = pd.to_datetime(data['DATETIME'], format='%d/%m/%Y %H:%M')
data['SEASON'] = data['DATETIME'].dt.month
data['SEASON'] = data['SEASON'].apply(lambda x: 'Summer' if x in [12, 1, 2] else ('Autumn' if x in [3, 4, 5] else ('Winter' if x in [6, 7, 8] else 'Spring')))
data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON
0,2010-01-01 00:00:00,8038.0,23.1,Summer
1,2010-01-01 00:30:00,7809.31,22.9,Summer
2,2010-01-01 01:00:00,7483.69,22.6,Summer
3,2010-01-01 01:30:00,7117.23,22.5,Summer
4,2010-01-01 02:00:00,6812.03,22.5,Summer


In [62]:
#create a new column 'Weekday' based on the day of the week 
data['WEEKDAY'] = data['DATETIME'].dt.day_name()
data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday


In [63]:
#export the data to a new csv file
data.to_csv('../data/totaldemand_temperature_nsw.csv', index=False)


In [64]:
#export the data to a new zip file
data.to_csv('../data/totaldemand_temperature_nsw.zip', index=False, compression='zip')


Combining temperature and demand data with holiday data

In [65]:
#read the data from the csv file
datedata = pd.read_csv('../data/date_dim.csv')
datedata.head()

Unnamed: 0.1,Unnamed: 0,Calendar_Date,DayOfWeek,Week,Quarter,Month,Year,Year_half,Summer,Autumn,Winter,Spring,is_weekday,Description,is_holiday
0,0,2014-01-01,3,1,1,1,2014,1,1,0,0,0,1,New Year's Day,1
1,1,2014-01-02,4,1,1,1,2014,1,1,0,0,0,1,,0
2,2,2014-01-03,5,1,1,1,2014,1,1,0,0,0,1,,0
3,3,2014-01-04,6,1,1,1,2014,1,1,0,0,0,0,,0
4,4,2014-01-05,7,1,1,1,2014,1,1,0,0,0,0,,0


In [66]:
#copy the data to a new dataframe with calender_date, DayOfWeek, Summer, Autumn, Winter, Spring, is_weekday, is_holiday columns
new_date_data = datedata.copy()
#drop the columns not needed
new_date_data = new_date_data.drop(columns=['Unnamed: 0','Week', 'Quarter', 'Month', 'Year', 'Year_half', 'Description'])
new_date_data.head()

Unnamed: 0,Calendar_Date,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01,3,1,0,0,0,1,1
1,2014-01-02,4,1,0,0,0,1,0
2,2014-01-03,5,1,0,0,0,1,0
3,2014-01-04,6,1,0,0,0,0,0
4,2014-01-05,7,1,0,0,0,0,0


In [67]:
new_date_data[new_date_data['Calendar_Date'] == '2014-12-25']


Unnamed: 0,Calendar_Date,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
358,2014-12-25,4,1,0,0,0,1,1


In [68]:
#remane one column
new_date_data = new_date_data.rename(columns={'Calendar_Date': 'DATE'})
new_date_data.head()


Unnamed: 0,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01,3,1,0,0,0,1,1
1,2014-01-02,4,1,0,0,0,1,0
2,2014-01-03,5,1,0,0,0,1,0
3,2014-01-04,6,1,0,0,0,0,0
4,2014-01-05,7,1,0,0,0,0,0


In [69]:
new_date_data.isna().sum() 

DATE          0
DayOfWeek     0
Summer        0
Autumn        0
Winter        0
Spring        0
is_weekday    0
is_holiday    0
dtype: int64

In [70]:
new_date_data.dtypes

DATE          object
DayOfWeek      int64
Summer         int64
Autumn         int64
Winter         int64
Spring         int64
is_weekday     int64
is_holiday     int64
dtype: object

In [71]:
#max and min value of the DATE column
new_date_data['DATE'].max(), new_date_data['DATE'].min()


('2022-12-31', '2014-01-01')

In [72]:
#remove the rows after 2020-12-31
new_date_data = new_date_data[new_date_data['DATE'] <= '2021-03-18']
new_date_data['DATE'].max(), new_date_data['DATE'].min()

('2021-03-18', '2014-01-01')

In [73]:
#read the data from the csv file
demand_temperature_data = pd.read_csv('../data/totaldemand_temperature_nsw.csv')
demand_temperature_data.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday


In [74]:
demand_temperature_data.shape

(195934, 5)

In [75]:
#create a new column 'DATE' 
demand_temperature_data['DATETIME'] = pd.to_datetime(demand_temperature_data['DATETIME'], format='%Y-%m-%d %H:%M:%S')
demand_temperature_data.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday


In [76]:
#create a new column 'DATE' 
demand_temperature_data['DATE'] = demand_temperature_data['DATETIME'].dt.date
demand_temperature_data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday,2010-01-01
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday,2010-01-01
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday,2010-01-01
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday,2010-01-01
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday,2010-01-01


In [77]:
#remove the rows before 2014-01-01
demand_temperature_data = demand_temperature_data[demand_temperature_data['DATETIME'] >= '2014-01-01']
demand_temperature_data['DATETIME'].max(), demand_temperature_data['DATETIME'].min()

(Timestamp('2021-03-18 00:00:00'), Timestamp('2014-01-01 00:00:00'))

In [78]:
demand_temperature_data.shape

(126129, 6)

In [79]:
demand_temperature_data.dtypes, new_date_data.dtypes

(DATETIME       datetime64[ns]
 TOTALDEMAND           float64
 TEMPERATURE           float64
 SEASON                 object
 WEEKDAY                object
 DATE                   object
 dtype: object,
 DATE          object
 DayOfWeek      int64
 Summer         int64
 Autumn         int64
 Winter         int64
 Spring         int64
 is_weekday     int64
 is_holiday     int64
 dtype: object)

In [80]:
demand_temperature_data.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE
69805,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01
69806,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01
69807,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01
69808,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01
69809,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01


In [81]:
# find duplicates
demand_temperature_data.duplicated().sum()

0

In [82]:
#show all the duplicates rows
demand_temperature_data[demand_temperature_data.duplicated(keep=False)]

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE


In [83]:
#show all the duplicates rows
demand_temperature_data[demand_temperature_data.duplicated(subset=['DATETIME'], keep=False)]

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE


In [84]:
# dimension of the data
demand_temperature_data.shape, new_date_data.shape

((126129, 6), (2636, 8))

In [85]:
new_date_data['DATE'] = pd.to_datetime(new_date_data['DATE'])
new_date_data.dtypes

DATE          datetime64[ns]
DayOfWeek              int64
Summer                 int64
Autumn                 int64
Winter                 int64
Spring                 int64
is_weekday             int64
is_holiday             int64
dtype: object

In [86]:
new_date_data.head()

Unnamed: 0,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01,3,1,0,0,0,1,1
1,2014-01-02,4,1,0,0,0,1,0
2,2014-01-03,5,1,0,0,0,1,0
3,2014-01-04,6,1,0,0,0,0,0
4,2014-01-05,7,1,0,0,0,0,0


In [87]:
demand_temperature_data['DATE'] = pd.to_datetime(demand_temperature_data['DATE'])
demand_temperature_data.dtypes

DATETIME       datetime64[ns]
TOTALDEMAND           float64
TEMPERATURE           float64
SEASON                 object
WEEKDAY                object
DATE           datetime64[ns]
dtype: object

In [88]:
holiday_merged = pd.merge(demand_temperature_data, new_date_data, how='left', on='DATE')
holiday_merged.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1
1,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1
2,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1
3,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1
4,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1


In [89]:
holiday_merged.shape

(126225, 13)

In [90]:
holiday_merged.isna().sum()

DATETIME       0
TOTALDEMAND    0
TEMPERATURE    0
SEASON         0
WEEKDAY        0
DATE           0
DayOfWeek      0
Summer         0
Autumn         0
Winter         0
Spring         0
is_weekday     0
is_holiday     0
dtype: int64

In [91]:
holiday_merged['DATE'].max(), holiday_merged['DATE'].min()

(Timestamp('2021-03-18 00:00:00'), Timestamp('2014-01-01 00:00:00'))

In [92]:
final_data = holiday_merged.copy()
final_data.head() 

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1
1,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1
2,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1
3,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1
4,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,3,1,0,0,0,1,1


In [93]:
#drop columns  SEASON, WEEKDAY, DATE
final_data = final_data.drop(columns=['SEASON', 'WEEKDAY', 'DATE'])
final_data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01 00:00:00,7009.91,20.4,3,1,0,0,0,1,1
1,2014-01-01 00:30:00,6840.01,19.8,3,1,0,0,0,1,1
2,2014-01-01 01:00:00,6580.75,19.5,3,1,0,0,0,1,1
3,2014-01-01 01:30:00,6212.79,19.2,3,1,0,0,0,1,1
4,2014-01-01 02:00:00,5988.92,18.8,3,1,0,0,0,1,1


In [94]:
#rename the columns
final_data = final_data.rename(columns={'DATETIME': 'DATETIME', 'TOTALDEMAND': 'TOTALDEMAND', 'TEMPERATURE': 'TEMPERATURE', 'DayOfWeek': 'DAYOFWEEK', 'Summer': 'SUMMER', 'Autumn': 'AUTUMN', 'Winter': 'WINTER', 'Spring': 'SPRING', 'is_weekday': 'WEEKDAY', 'is_holiday': 'HOLIDAY'})
final_data.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,DAYOFWEEK,SUMMER,AUTUMN,WINTER,SPRING,WEEKDAY,HOLIDAY
0,2014-01-01 00:00:00,7009.91,20.4,3,1,0,0,0,1,1
1,2014-01-01 00:30:00,6840.01,19.8,3,1,0,0,0,1,1
2,2014-01-01 01:00:00,6580.75,19.5,3,1,0,0,0,1,1
3,2014-01-01 01:30:00,6212.79,19.2,3,1,0,0,0,1,1
4,2014-01-01 02:00:00,5988.92,18.8,3,1,0,0,0,1,1


In [95]:
final_data.dtypes


DATETIME       datetime64[ns]
TOTALDEMAND           float64
TEMPERATURE           float64
DAYOFWEEK               int64
SUMMER                  int64
AUTUMN                  int64
WINTER                  int64
SPRING                  int64
WEEKDAY                 int64
HOLIDAY                 int64
dtype: object

In [96]:
duplicate_data = final_data[final_data.duplicated(subset='DATETIME')]
duplicate_data.DATETIME.sort_values()

52431   2017-01-02 00:00:00
52433   2017-01-02 00:30:00
52435   2017-01-02 01:00:00
52437   2017-01-02 01:30:00
52439   2017-01-02 02:00:00
                ...        
57988   2017-04-25 21:30:00
57990   2017-04-25 22:00:00
57992   2017-04-25 22:30:00
57994   2017-04-25 23:00:00
57996   2017-04-25 23:30:00
Name: DATETIME, Length: 96, dtype: datetime64[ns]

In [97]:
#show all the duplicates rows
final_data[final_data.duplicated(subset='DATETIME', keep=False)]

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,DAYOFWEEK,SUMMER,AUTUMN,WINTER,SPRING,WEEKDAY,HOLIDAY
52430,2017-01-02 00:00:00,6847.27,22.6,1,1,0,0,0,1,1
52431,2017-01-02 00:00:00,6847.27,22.6,1,1,0,0,0,1,1
52432,2017-01-02 00:30:00,6678.33,22.9,1,1,0,0,0,1,1
52433,2017-01-02 00:30:00,6678.33,22.9,1,1,0,0,0,1,1
52434,2017-01-02 01:00:00,6487.85,22.9,1,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
57992,2017-04-25 22:30:00,7041.88,19.8,2,0,1,0,0,1,1
57993,2017-04-25 23:00:00,6880.41,19.5,2,0,1,0,0,1,1
57994,2017-04-25 23:00:00,6880.41,19.5,2,0,1,0,0,1,1
57995,2017-04-25 23:30:00,6877.11,18.9,2,0,1,0,0,1,1


In [98]:
#remove duplicates
final_data = final_data.drop_duplicates(subset='DATETIME')


In [99]:
final_data.shape


(126129, 10)

In [100]:
#export the data to a new csv file
final_data.to_csv('../data/final_data_nsw.csv', index=False)

In [101]:
#export the data to a new zip file
final_data.to_csv('../data/final_data_nsw.zip', index=False, compression='zip')
