In [18]:
import pandas as pd

In [19]:
# Load the NSW total-demand CSV and preview its first five rows.
totaldemand = pd.read_csv(filepath_or_buffer='../data/totaldemand_nsw.csv')
totaldemand.head(n=5)

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID
0,1/1/2010 0:00,8038.0,NSW1
1,1/1/2010 0:30,7809.31,NSW1
2,1/1/2010 1:00,7483.69,NSW1
3,1/1/2010 1:30,7117.23,NSW1
4,1/1/2010 2:00,6812.03,NSW1


In [20]:
# Remove the REGIONID column from the demand frame, then preview the result.
totaldemand = totaldemand.drop('REGIONID', axis='columns')
totaldemand.head(n=5)

Unnamed: 0,DATETIME,TOTALDEMAND
0,1/1/2010 0:00,8038.0
1,1/1/2010 0:30,7809.31
2,1/1/2010 1:00,7483.69
3,1/1/2010 1:30,7117.23
4,1/1/2010 2:00,6812.03


In [21]:
# Dimensions of the demand frame as a (rows, columns) tuple.
totaldemand.shape

(196513, 2)

In [22]:
# Load the NSW temperature CSV and preview its first five rows.
temperature = pd.read_csv(filepath_or_buffer='../data/temperature_nsw.csv')
temperature.head(n=5)

Unnamed: 0,LOCATION,DATETIME,TEMPERATURE
0,Bankstown,1/1/2010 0:00,23.1
1,Bankstown,1/1/2010 0:01,23.1
2,Bankstown,1/1/2010 0:30,22.9
3,Bankstown,1/1/2010 0:50,22.7
4,Bankstown,1/1/2010 1:00,22.6


In [23]:
# Remove the LOCATION column from the temperature frame, then preview the result.
temperature = temperature.drop('LOCATION', axis='columns')
temperature.head(n=5)


Unnamed: 0,DATETIME,TEMPERATURE
0,1/1/2010 0:00,23.1
1,1/1/2010 0:01,23.1
2,1/1/2010 0:30,22.9
3,1/1/2010 0:50,22.7
4,1/1/2010 1:00,22.6


In [24]:
# Dimensions of the temperature frame as a (rows, columns) tuple.
temperature.shape

(220326, 2)

In [6]:
# Count missing values per column: demand frame first, temperature frame second.
print(totaldemand.isnull().sum(), temperature.isnull().sum(), sep='\n')

DATETIME       0
TOTALDEMAND    0
REGIONID       0
dtype: int64
DATETIME       0
TEMPERATURE    0
dtype: int64


In [25]:
# Inner-join demand and temperature on their shared DATETIME column;
# only timestamps present in both frames are kept.
data = totaldemand.merge(temperature, how='inner', on='DATETIME')
data.head(n=5)


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE
0,1/1/2010 0:00,8038.0,23.1
1,1/1/2010 0:30,7809.31,22.9
2,1/1/2010 1:00,7483.69,22.6
3,1/1/2010 1:30,7117.23,22.5
4,1/1/2010 2:00,6812.03,22.5


In [26]:
# Dimensions of `data` as a (rows, columns) tuple.
data.shape

(195947, 3)

In [27]:
# Count missing values in each column of `data`.
data.isnull().sum()

DATETIME       0
TOTALDEMAND    0
TEMPERATURE    0
dtype: int64

In [28]:
# Parse DATETIME (day/month/year hour:minute strings) into real timestamps,
# then derive a SEASON label from the calendar month.
data['DATETIME'] = pd.to_datetime(data['DATETIME'], format='%d/%m/%Y %H:%M')

# Explicit month -> season lookup (December-February is summer).
# Listing all twelve months avoids a catch-all branch: a value that is not a
# valid month (e.g. a missing timestamp) stays missing instead of being
# silently labelled 'Spring'.
season_by_month = {
    12: 'Summer', 1: 'Summer', 2: 'Summer',
    3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
    6: 'Winter', 7: 'Winter', 8: 'Winter',
    9: 'Spring', 10: 'Spring', 11: 'Spring',
}
data['SEASON'] = data['DATETIME'].dt.month.map(season_by_month)
data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON
0,2010-01-01 00:00:00,8038.0,23.1,Summer
1,2010-01-01 00:30:00,7809.31,22.9,Summer
2,2010-01-01 01:00:00,7483.69,22.6,Summer
3,2010-01-01 01:30:00,7117.23,22.5,Summer
4,2010-01-01 02:00:00,6812.03,22.5,Summer


In [30]:
# Add a WEEKDAY column holding the day-of-week name of each timestamp.
# locale=None is the default and is spelled out here only for clarity.
data['WEEKDAY'] = data['DATETIME'].dt.day_name(locale=None)
data.head(n=5)


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday


In [31]:
# Write the data to a plain CSV file, omitting the row index.
data.to_csv(path_or_buf='../data/totaldemand_temperature_nsw.csv', index=False)


In [32]:
# Write the data as a zip-compressed CSV, omitting the row index.
# The archive member is named explicitly so it extracts with a .csv extension;
# without archive_name pandas derives the member name from the .zip path.
data.to_csv(
    '../data/totaldemand_temperature_nsw.zip',
    index=False,
    compression={'method': 'zip', 'archive_name': 'totaldemand_temperature_nsw.csv'},
)
