In [150]:
import pandas as pd

In [151]:
#read totaldemand_nsw csv file
totaldemand = pd.read_csv('../data/totaldemand_nsw.csv')
totaldemand.head()

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID
0,1/1/2010 0:00,8038.0,NSW1
1,1/1/2010 0:30,7809.31,NSW1
2,1/1/2010 1:00,7483.69,NSW1
3,1/1/2010 1:30,7117.23,NSW1
4,1/1/2010 2:00,6812.03,NSW1


In [152]:
#drop REGIONID column
totaldemand = totaldemand.drop(columns=['REGIONID'])
totaldemand.head()

Unnamed: 0,DATETIME,TOTALDEMAND
0,1/1/2010 0:00,8038.0
1,1/1/2010 0:30,7809.31
2,1/1/2010 1:00,7483.69
3,1/1/2010 1:30,7117.23
4,1/1/2010 2:00,6812.03


In [153]:
#dimension of the data
totaldemand.shape

(196513, 2)

In [154]:
#read temperature_nsw csv file
temperature = pd.read_csv('../data/temperature_nsw.csv')
temperature.head()

Unnamed: 0,LOCATION,DATETIME,TEMPERATURE
0,Bankstown,1/1/2010 0:00,23.1
1,Bankstown,1/1/2010 0:01,23.1
2,Bankstown,1/1/2010 0:30,22.9
3,Bankstown,1/1/2010 0:50,22.7
4,Bankstown,1/1/2010 1:00,22.6


In [155]:
#drop LOCATION column
temperature = temperature.drop(columns=['LOCATION'])
temperature.head()


Unnamed: 0,DATETIME,TEMPERATURE
0,1/1/2010 0:00,23.1
1,1/1/2010 0:01,23.1
2,1/1/2010 0:30,22.9
3,1/1/2010 0:50,22.7
4,1/1/2010 1:00,22.6


In [156]:
#dimension of the data
temperature.shape

(220326, 2)

In [157]:
#check for missing values
print(totaldemand.isnull().sum())
print(temperature.isnull().sum())

DATETIME       0
TOTALDEMAND    0
dtype: int64
DATETIME       0
TEMPERATURE    0
dtype: int64


In [158]:
#join the two dataframes on the DATETIME column
data = pd.merge(totaldemand, temperature, on='DATETIME')
data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE
0,1/1/2010 0:00,8038.0,23.1
1,1/1/2010 0:30,7809.31,22.9
2,1/1/2010 1:00,7483.69,22.6
3,1/1/2010 1:30,7117.23,22.5
4,1/1/2010 2:00,6812.03,22.5


In [159]:
data.shape

(195947, 3)

In [160]:
data.isna().sum()

DATETIME       0
TOTALDEMAND    0
TEMPERATURE    0
dtype: int64

In [161]:
#create a new column 'SEASON' based on the month
data['DATETIME'] = pd.to_datetime(data['DATETIME'], format='%d/%m/%Y %H:%M')
data['SEASON'] = data['DATETIME'].dt.month
data['SEASON'] = data['SEASON'].apply(lambda x: 'Summer' if x in [12, 1, 2] else ('Autumn' if x in [3, 4, 5] else ('Winter' if x in [6, 7, 8] else 'Spring')))
data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON
0,2010-01-01 00:00:00,8038.0,23.1,Summer
1,2010-01-01 00:30:00,7809.31,22.9,Summer
2,2010-01-01 01:00:00,7483.69,22.6,Summer
3,2010-01-01 01:30:00,7117.23,22.5,Summer
4,2010-01-01 02:00:00,6812.03,22.5,Summer


In [162]:
#create a new column 'Weekday' based on the day of the week 
data['WEEKDAY'] = data['DATETIME'].dt.day_name()
data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday


In [163]:
#export the data to a new csv file
data.to_csv('../data/totaldemand_temperature_nsw.csv', index=False)


In [164]:
#export the data to a new zip file
data.to_csv('../data/totaldemand_temperature_nsw.zip', index=False, compression='zip')


Combining temperature and demand data with holiday data

In [250]:
#read the data from the csv file
datedata = pd.read_csv('../data/date_dim.csv')
datedata.head()

Unnamed: 0.1,Unnamed: 0,Calendar_Date,DayOfWeek,Week,Quarter,Month,Year,Year_half,Summer,Autumn,Winter,Spring,is_weekday,Description,is_holiday
0,0,2014-01-01,3,1,1,1,2014,1,1,0,0,0,1,,0
1,1,2014-01-02,4,1,1,1,2014,1,1,0,0,0,1,,0
2,2,2014-01-03,5,1,1,1,2014,1,1,0,0,0,1,,0
3,3,2014-01-04,6,1,1,1,2014,1,1,0,0,0,0,,0
4,4,2014-01-05,7,1,1,1,2014,1,1,0,0,0,0,,0


In [251]:
#copy the data to a new dataframe with calender_date, DayOfWeek, Summer, Autumn, Winter, Spring, is_weekday, is_holiday columns
new_date_data = datedata.copy()
#drop the columns not needed
new_date_data = new_date_data.drop(columns=['Unnamed: 0','Week', 'Quarter', 'Month', 'Year', 'Year_half', 'Description'])
new_date_data.head()

Unnamed: 0,Calendar_Date,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01,3,1,0,0,0,1,0
1,2014-01-02,4,1,0,0,0,1,0
2,2014-01-03,5,1,0,0,0,1,0
3,2014-01-04,6,1,0,0,0,0,0
4,2014-01-05,7,1,0,0,0,0,0


In [252]:
#remane one column
new_date_data = new_date_data.rename(columns={'Calendar_Date': 'DATE'})
new_date_data.head()


Unnamed: 0,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01,3,1,0,0,0,1,0
1,2014-01-02,4,1,0,0,0,1,0
2,2014-01-03,5,1,0,0,0,1,0
3,2014-01-04,6,1,0,0,0,0,0
4,2014-01-05,7,1,0,0,0,0,0


In [254]:
new_data.isna().sum() 

DATETIME            0
TOTALDEMAND         0
TEMPERATURE         0
SEASON              0
WEEKDAY             0
DATE                0
DayOfWeek      126129
Summer         126129
Autumn         126129
Winter         126129
Spring         126129
is_weekday     126129
is_holiday     126129
dtype: int64

In [255]:
new_date_data.dtypes

DATE          object
DayOfWeek      int64
Summer         int64
Autumn         int64
Winter         int64
Spring         int64
is_weekday     int64
is_holiday     int64
dtype: object

In [256]:
#max and min value of the DATE column
new_date_data['DATE'].max(), new_date_data['DATE'].min()


('2023-12-31', '2014-01-01')

In [257]:
#remove the rows after 2020-12-31
new_date_data = new_date_data[new_date_data['DATE'] <= '2021-03-18']
new_date_data['DATE'].max(), new_date_data['DATE'].min()

('2021-03-18', '2014-01-01')

In [258]:
#read the data from the csv file
demand_temperature_data = pd.read_csv('../data/totaldemand_temperature_nsw.csv')
demand_temperature_data.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday


In [173]:
demand_temperature_data.shape

(195947, 5)

In [259]:
#create a new column 'DATE' based on the DATETIME column
demand_temperature_data['DATETIME'] = pd.to_datetime(demand_temperature_data['DATETIME'], format='%Y-%m-%d %H:%M:%S')
demand_temperature_data.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday


In [260]:
#create a new column 'DATE' and get the date from the DATETIME column
demand_temperature_data['DATE'] = demand_temperature_data['DATETIME'].dt.date
demand_temperature_data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday,2010-01-01
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday,2010-01-01
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday,2010-01-01
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday,2010-01-01
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday,2010-01-01


In [261]:
#remove the rows before 2014-01-01
demand_temperature_data = demand_temperature_data[demand_temperature_data['DATETIME'] >= '2014-01-01']
demand_temperature_data['DATETIME'].max(), demand_temperature_data['DATETIME'].min()

(Timestamp('2021-03-18 00:00:00'), Timestamp('2014-01-01 00:00:00'))

In [262]:
demand_temperature_data.shape

(126136, 6)

In [263]:
demand_temperature_data.dtypes, new_date_data.dtypes

(DATETIME       datetime64[ns]
 TOTALDEMAND           float64
 TEMPERATURE           float64
 SEASON                 object
 WEEKDAY                object
 DATE                   object
 dtype: object,
 DATE          object
 DayOfWeek      int64
 Summer         int64
 Autumn         int64
 Winter         int64
 Spring         int64
 is_weekday     int64
 is_holiday     int64
 dtype: object)

In [264]:
demand_temperature_data.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE
69811,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01
69812,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01
69813,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01
69814,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01
69815,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01


In [265]:
# find duplicates
demand_temperature_data.duplicated().sum()

7

In [266]:
# drop duplicates
demand_temperature_data = demand_temperature_data.drop_duplicates()

In [267]:
# check for duplicates
demand_temperature_data.duplicated().sum()

0

In [268]:
# dimension of the data
demand_temperature_data.shape, new_date_data.shape

((126129, 6), (2649, 8))

In [269]:
# column names of the data
demand_temperature_data.columns, new_date_data.columns

(Index(['DATETIME', 'TOTALDEMAND', 'TEMPERATURE', 'SEASON', 'WEEKDAY', 'DATE'], dtype='object'),
 Index(['DATE', 'DayOfWeek', 'Summer', 'Autumn', 'Winter', 'Spring',
        'is_weekday', 'is_holiday'],
       dtype='object'))

In [447]:
new_date_data.tail()

Unnamed: 0,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
2644,2021-03-14,7,0,1,0,0,0,0
2645,2021-03-15,1,0,1,0,0,1,0
2646,2021-03-16,2,0,1,0,0,1,0
2647,2021-03-17,3,0,1,0,0,1,0
2648,2021-03-18,4,0,1,0,0,1,0


In [270]:
new_date_data.head()

Unnamed: 0,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01,3,1,0,0,0,1,0
1,2014-01-02,4,1,0,0,0,1,0
2,2014-01-03,5,1,0,0,0,1,0
3,2014-01-04,6,1,0,0,0,0,0
4,2014-01-05,7,1,0,0,0,0,0


In [230]:
# set the DATE column as the index
new_date_data.set_index('DATE', inplace=True)
new_date_data.head()



Unnamed: 0_level_0,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-01,3,1,0,0,0,1,0
2014-01-02,4,1,0,0,0,1,0
2014-01-03,5,1,0,0,0,1,0
2014-01-04,6,1,0,0,0,0,0
2014-01-05,7,1,0,0,0,0,0


In [271]:
#copy the i th indeex of the new_date_data to a new dataframe
new_date_data.iloc[0]

DATE          2014-01-01
DayOfWeek              3
Summer                 1
Autumn                 0
Winter                 0
Spring                 0
is_weekday             1
is_holiday             0
Name: 0, dtype: object

In [272]:
#index value of the i th row
new_date_data.index[0]

#new_date_data[2014-01-01]

0

In [273]:


#fill the missing values in the new_date_data with the i the index value
new_data.fillna(new_date_data.iloc[0], inplace=True)
new_data.head(50)



Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
4,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
5,2014-01-01 02:30:00,5752.86,18.0,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
6,2014-01-01 03:00:00,5630.41,17.9,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
7,2014-01-01 03:30:00,5553.19,17.9,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
8,2014-01-01 04:00:00,5529.09,18.3,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0
9,2014-01-01 04:30:00,5565.83,17.3,Summer,Wednesday,2014-01-01,3.0,1.0,0.0,0.0,0.0,1.0,0.0


In [274]:
new_date_data.iloc[0].values


array(['2014-01-01', 3, 1, 0, 0, 0, 1, 0], dtype=object)

In [275]:
#merge the two dataframes on the DATE column while filling NaN values with values from new_date_data for the corresponding date using fillna() method
new_data = pd.merge(demand_temperature_data, new_date_data, on='DATE', how='left')
#new_data = pd.concat([demand_temperature_data, new_date_data], axis=1)
new_data.head()


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,,,,,,,
1,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,,,,,,,
2,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,,,,,,,
3,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,,,,,,,
4,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,,,,,,,


In [276]:
new_data.head(5)

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,,,,,,,
1,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,,,,,,,
2,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,,,,,,,
3,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,,,,,,,
4,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,,,,,,,


In [277]:
nulls = new_data.loc[new_data.is_weekday.isnull(), 'DATE']
nulls

0         2014-01-01
1         2014-01-01
2         2014-01-01
3         2014-01-01
4         2014-01-01
             ...    
126124    2021-03-17
126125    2021-03-17
126126    2021-03-17
126127    2021-03-17
126128    2021-03-18
Name: DATE, Length: 126129, dtype: object

In [278]:
new_date_data

Unnamed: 0,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01,3,1,0,0,0,1,0
1,2014-01-02,4,1,0,0,0,1,0
2,2014-01-03,5,1,0,0,0,1,0
3,2014-01-04,6,1,0,0,0,0,0
4,2014-01-05,7,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
2644,2021-03-14,7,0,1,0,0,0,0
2645,2021-03-15,1,0,1,0,0,1,0
2646,2021-03-16,2,0,1,0,0,1,0
2647,2021-03-17,3,0,1,0,0,1,0


In [279]:
#create a pandas series
s = pd.Series(new_date_data['is_weekday'].values, index=new_date_data['DATE'])  
s

DATE
2014-01-01    1
2014-01-02    1
2014-01-03    1
2014-01-04    0
2014-01-05    0
             ..
2021-03-14    0
2021-03-15    1
2021-03-16    1
2021-03-17    1
2021-03-18    1
Length: 2649, dtype: int64

In [280]:
#fill NaN wil numpy NaN
new_data['is_weekday'].replace('Nan', np.NaN, inplace=True)
#new_data['is_weekday'] = new_data['DATE'].map(s)
new_data.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_data['is_weekday'].replace('Nan', np.NaN, inplace=True)


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,,,,,,,
1,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,,,,,,,
2,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,,,,,,,
3,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,,,,,,,
4,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,,,,,,,


In [282]:
s[new_date_data['DATE']]

DATE
2014-01-01    1
2014-01-02    1
2014-01-03    1
2014-01-04    0
2014-01-05    0
             ..
2021-03-14    0
2021-03-15    1
2021-03-16    1
2021-03-17    1
2021-03-18    1
Length: 2679, dtype: int64

In [281]:
#new_data.loc[new_data.is_weekday.isnull(), 'is_weekday'] = s.loc[nulls].values 

def remove_na(new_data):
    if pd.isnull(['is_weekday']):
        return s[new_date_data['DATE']]
    else:
        return new_date_data['DATE']
    
new_data['is_weekday'] = new_data.apply(remove_na, axis=1)


ValueError: Columns must be same length as key

In [221]:
# show the first few rows
new_data.head(100)

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,,,,,,2014-01-01,
1,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,,,,,,2014-01-01,
2,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,,,,,,2014-01-01,
3,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,,,,,,2014-01-01,
4,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,,,,,,2014-01-01,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2014-01-02 23:30:00,7416.42,21.8,Summer,Thursday,2014-01-02,,,,,,2014-01-02,
96,2014-01-03 00:00:00,7254.94,21.3,Summer,Friday,2014-01-03,,,,,,2014-01-03,
97,2014-01-03 00:30:00,7072.75,21.2,Summer,Friday,2014-01-03,,,,,,2014-01-03,
98,2014-01-03 01:00:00,6805.58,21.2,Summer,Friday,2014-01-03,,,,,,2014-01-03,


In [60]:
new_data.shape

(126136, 13)

In [46]:
new_data.isna().sum()

DATETIME            0
TOTALDEMAND         0
TEMPERATURE         0
SEASON              0
WEEKDAY             0
DATE                0
DayOfWeek      126136
Summer         126136
Autumn         126136
Winter         126136
Spring         126136
is_weekday     126136
is_holiday     126136
dtype: int64

In [63]:
new_data.dtypes


DATETIME       datetime64[ns]
TOTALDEMAND           float64
TEMPERATURE           float64
SEASON                 object
WEEKDAY                object
DATE                   object
DayOfWeek             float64
Summer                float64
Autumn                float64
Winter                float64
Spring                float64
is_weekday            float64
is_holiday            float64
dtype: object

In [286]:
#find 0 and 1 values in the is_weekday column
new_data.describe()



Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
count,126129,126129.0,126129.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,2017-08-10 04:15:07.742073600,7929.739463,17.71798,,,,,,,
min,2014-01-01 00:00:00,5138.11,-1.3,,,,,,,
25%,2015-10-19 23:30:00,7031.6,13.6,,,,,,,
50%,2017-08-11 13:30:00,7864.9,18.1,,,,,,,
75%,2019-05-31 00:30:00,8668.16,21.7,,,,,,,
max,2021-03-18 00:00:00,13985.87,44.7,,,,,,,
std,,1240.220281,5.947852,,,,,,,


In [288]:
#write a function to update the value of the is_weekday column based on the value of the is_weekday column from the new_date_data
def update_is_weekday(row):
    if pd.isnull(row['is_weekday']):
        return new_date_data.loc[row['DATE'], 'is_weekday']
    else:
        return row['is_weekday']

In [307]:
new_data[:1]

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,DayOfWeek,Summer,Autumn,Winter,Spring,is_weekday,is_holiday
0,2014-01-01,7009.91,20.4,Summer,Wednesday,2014-01-01,,,,,,,


In [309]:
update_is_weekday(new_data[:1])


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

# new approach 

In [367]:
#create a new series from new_date_data with is_holiday and use DATE as the index
holiday_list = pd.Series(new_date_data['is_holiday'].values, index=new_date_data['DATE'])
holiday_list

DATE
2014-01-01    0
2014-01-02    0
2014-01-03    0
2014-01-04    0
2014-01-05    0
             ..
2021-03-14    0
2021-03-15    0
2021-03-16    0
2021-03-17    0
2021-03-18    0
Length: 2649, dtype: int64

In [368]:
df = demand_temperature_data.copy()
df.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE
69811,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01
69813,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01
69814,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01
69815,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01
69816,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01


In [397]:
df1 = df.copy()
df2 = new_date_data.copy()

In [398]:
df1.columns, df2.columns

(Index(['DATETIME', 'TOTALDEMAND', 'TEMPERATURE', 'SEASON', 'WEEKDAY', 'DATE',
        'is_holiday'],
       dtype='object'),
 Index(['DATE', 'DayOfWeek', 'Summer', 'Autumn', 'Winter', 'Spring',
        'is_weekday', 'is_holiday'],
       dtype='object'))

In [401]:
df_merged = pd.merge(df1, df2[['DATE', 'is_weekday', 'is_holiday']], on='DATE', how='left')

In [403]:
df_merged.head(72)

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,is_holiday_x,is_weekday,is_holiday_y
0,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,0,,
1,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,0,,
2,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,0,,
3,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,0,,
4,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,0,,
...,...,...,...,...,...,...,...,...,...
67,2014-01-02 09:30:00,8625.91,32.5,Summer,Thursday,2014-01-02,0,,
68,2014-01-02 10:00:00,8859.01,33.1,Summer,Thursday,2014-01-02,0,,
69,2014-01-02 10:30:00,9062.05,33.1,Summer,Thursday,2014-01-02,0,,
70,2014-01-02 11:00:00,9196.69,33.9,Summer,Thursday,2014-01-02,0,,


In [369]:
#create a new column 'is_holiday' based in df dataframe with all 0
#df['is_holiday'] = np.nan 
df['is_holiday'] = 0
df.head()


#df['is_holiday'] = df['DATE'].map(holiday_list)

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,is_holiday
69811,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,0
69813,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,0
69814,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,0
69815,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,0
69816,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,0


In [372]:
#update is_holiday column based on the value of the holiday_list using lambda function
# df['is_holiday'] = df.apply(lambda x: holiday_list[x['DATE']] if pd.isna(x['is_holiday']) else x['is_holiday'], axis=1)
df['is_holiday'] = df.apply(lambda x: holiday_list[x['DATE']] if (x['is_holiday']==1) else x['is_holiday'], axis=1)
df.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,is_holiday
69811,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,0
69813,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,0
69814,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,0
69815,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,0
69816,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,0


In [396]:
#write a lambda function to update the value of is_holiday column in df using the value of the holiday_list series using lambda function
# Update the 'value' column using the Series 's'
df['is_holiday'] = df['DATE'].apply(lambda x: holiday_list[x] if x in holiday_list.index else df.loc[df['DATE'] == x, 'is_holiday'].values[0])

print(df)
    

SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' (601652666.py, line 3)

In [373]:
df.head(100)

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,is_holiday
69811,2014-01-01 00:00:00,7009.91,20.4,Summer,Wednesday,2014-01-01,0
69813,2014-01-01 00:30:00,6840.01,19.8,Summer,Wednesday,2014-01-01,0
69814,2014-01-01 01:00:00,6580.75,19.5,Summer,Wednesday,2014-01-01,0
69815,2014-01-01 01:30:00,6212.79,19.2,Summer,Wednesday,2014-01-01,0
69816,2014-01-01 02:00:00,5988.92,18.8,Summer,Wednesday,2014-01-01,0
...,...,...,...,...,...,...,...
69907,2014-01-02 23:30:00,7416.42,21.8,Summer,Thursday,2014-01-02,0
69908,2014-01-03 00:00:00,7254.94,21.3,Summer,Friday,2014-01-03,0
69909,2014-01-03 00:30:00,7072.75,21.2,Summer,Friday,2014-01-03,0
69910,2014-01-03 01:00:00,6805.58,21.2,Summer,Friday,2014-01-03,0


In [374]:
df.dtypes


DATETIME       datetime64[ns]
TOTALDEMAND           float64
TEMPERATURE           float64
SEASON                 object
WEEKDAY                object
DATE                   object
is_holiday              int64
dtype: object

In [388]:
#select all the rows with the DATETIME containing '2015-12-25'
#df[df['DATE'].str.contains('2015-12-25')]



df[df['DATETIME'] == '2015-12-25']


#df[df['DATE'] == '2015-12-25']



Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,is_holiday
104513,2015-12-25,6795.29,18.3,Summer,Friday,2015-12-25,0


In [389]:
#select all rows where the is_holiday column is 1
df[df['is_holiday'] == 1]


Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY,DATE,is_holiday


Refrence code:

In [392]:
import pandas as pd

# Sample DataFrame
data = {'id': ['01-01-2000', '02-01-2000', '03-01-2000', '04-01-2000'], 'value': [0, 0, 0, 0]}
df1 = pd.DataFrame(data)

# Sample Series
s = pd.Series([100, 200, 300], index=['01-01-2000', '02-01-2000', '03-01-2000'])  # Index 1, 2, 3 corresponds to 'id' in df

df1['value'] = df1['id'].apply(lambda x: s[x] if x in s.index else df1.loc[df1['id'] == x, 'value'].values[0])

print(df1)

           id  value
0  01-01-2000    100
1  02-01-2000    200
2  03-01-2000    300
3  04-01-2000      0


In [393]:
data = {'id': [1, 2, 3, 4], 'value': [10, 20, 30, 40]}
df1 = pd.DataFrame(data)
df1

Unnamed: 0,id,value
0,1,10
1,2,20
2,3,30
3,4,40


In [357]:
s

1    100
2    200
3    300
dtype: int64

In [435]:
df_holiday = new_date_data.copy()

#drop all columns except DATE and is_holiday
df_holiday = df_holiday.drop(columns=['DayOfWeek', 'Summer', 'Autumn', 'Winter', 'Spring', 'is_weekday'])
df_holiday.head()


Unnamed: 0,DATE,is_holiday
0,2014-01-01,0
1,2014-01-02,0
2,2014-01-03,0
3,2014-01-04,0
4,2014-01-05,0


In [436]:
#update the DATE column to datetime format
df_holiday['DATE'] = pd.to_datetime(df_holiday['DATE'], format='%Y-%m-%d')
df_holiday.head()


Unnamed: 0,DATE,is_holiday
0,2014-01-01,0
1,2014-01-02,0
2,2014-01-03,0
3,2014-01-04,0
4,2014-01-05,0


In [437]:
#reindex the dataframe
df_holiday.set_index('DATE', inplace=True)
df_holiday.head()

Unnamed: 0_level_0,is_holiday
DATE,Unnamed: 1_level_1
2014-01-01,0
2014-01-02,0
2014-01-03,0
2014-01-04,0
2014-01-05,0


In [440]:
#reset the index
df_holiday.reset_index(inplace=True)
df_holiday.head()

Unnamed: 0,DATE,is_holiday
0,2014-01-01,0
1,2014-01-02,0
2,2014-01-03,0
3,2014-01-04,0
4,2014-01-05,0


In [446]:
#add new rows with 30mins interval to the DATE column
df_holiday = df_holiday.set_index('DATE').resample('30T').asfreq().reset_index()
df_holiday.head()


  df_holiday = df_holiday.set_index('DATE').resample('30T').asfreq().reset_index()


ValueError: cannot reindex on an axis with duplicate labels

In [412]:
#add 30 mins in the DATE column
df_holiday['DATE'] = df_holiday['DATE'] + pd.Timedelta('30 min')
df_holiday.head()

Unnamed: 0,DATE,is_holiday
0,2014-01-01 00:30:00,0
1,2014-01-02 00:30:00,0
2,2014-01-03 00:30:00,0
3,2014-01-04 00:30:00,0
4,2014-01-05 00:30:00,0


In [413]:
df_holiday.tail()

Unnamed: 0,DATE,is_holiday
2644,2021-03-14 00:30:00,0
2645,2021-03-15 00:30:00,0
2646,2021-03-16 00:30:00,0
2647,2021-03-17 00:30:00,0
2648,2021-03-18 00:30:00,0


In [416]:
#find rows with the DATE column containing '2014-12-25'
df_holiday[df_holiday['DATE'].dt.date == pd.to_datetime('2014-12-25').date()]


Unnamed: 0,DATE,is_holiday
358,2014-12-25 00:30:00,0


In [417]:
#read the data from the csv file
df11 = pd.read_csv('../data/date_dim.csv')
df11.head()


Unnamed: 0.1,Unnamed: 0,Calendar_Date,DayOfWeek,Week,Quarter,Month,Year,Year_half,Summer,Autumn,Winter,Spring,is_weekday,Description,is_holiday
0,0,2014-01-01,3,1,1,1,2014,1,1,0,0,0,1,,0
1,1,2014-01-02,4,1,1,1,2014,1,1,0,0,0,1,,0
2,2,2014-01-03,5,1,1,1,2014,1,1,0,0,0,1,,0
3,3,2014-01-04,6,1,1,1,2014,1,1,0,0,0,0,,0
4,4,2014-01-05,7,1,1,1,2014,1,1,0,0,0,0,,0


In [419]:
#show rows where is_holiday is 1
df11[df11['Year'] == 2014]


Unnamed: 0.1,Unnamed: 0,Calendar_Date,DayOfWeek,Week,Quarter,Month,Year,Year_half,Summer,Autumn,Winter,Spring,is_weekday,Description,is_holiday
0,0,2014-01-01,3,1,1,1,2014,1,1,0,0,0,1,,0
1,1,2014-01-02,4,1,1,1,2014,1,1,0,0,0,1,,0
2,2,2014-01-03,5,1,1,1,2014,1,1,0,0,0,1,,0
3,3,2014-01-04,6,1,1,1,2014,1,1,0,0,0,0,,0
4,4,2014-01-05,7,1,1,1,2014,1,1,0,0,0,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,357,2014-12-24,3,52,4,12,2014,2,1,0,0,0,1,,0
358,358,2014-12-25,4,52,4,12,2014,2,1,0,0,0,1,,0
359,359,2014-12-26,5,52,4,12,2014,2,1,0,0,0,1,,0
360,360,2014-12-27,6,52,4,12,2014,2,1,0,0,0,0,,0


In [420]:
df_holiday.dtypes

DATE          datetime64[ns]
is_holiday             int64
dtype: object

In [424]:
#rename the DATE column to DATETIME
df_holiday = df_holiday.rename(columns={'DATE': 'DATETIME'})
df_holiday.head()

Unnamed: 0,DATETIME,is_holiday
0,2014-01-01 00:30:00,0
1,2014-01-02 00:30:00,0
2,2014-01-03 00:30:00,0
3,2014-01-04 00:30:00,0
4,2014-01-05 00:30:00,0


In [427]:
#remove the rows with the DATETIME column after '2021-18-03'
df_holiday = df_holiday[df_holiday['DATETIME'] <= '2021-03-18']
df_holiday['DATETIME'].max(), df_holiday['DATETIME'].min()


(Timestamp('2021-03-17 00:30:00'), Timestamp('2014-01-01 00:30:00'))

In [422]:
#read the data from the csv file
df_total = pd.read_csv('../data/totaldemand_temperature_nsw.csv')
df_total.head()

Unnamed: 0,DATETIME,TOTALDEMAND,TEMPERATURE,SEASON,WEEKDAY
0,2010-01-01 00:00:00,8038.0,23.1,Summer,Friday
1,2010-01-01 00:30:00,7809.31,22.9,Summer,Friday
2,2010-01-01 01:00:00,7483.69,22.6,Summer,Friday
3,2010-01-01 01:30:00,7117.23,22.5,Summer,Friday
4,2010-01-01 02:00:00,6812.03,22.5,Summer,Friday


In [426]:
#join the two dataframes on the DATETIME column
df_total = pd.merge(df_total, df_holiday, on='DATETIME', how='left')
df_total.head()


ValueError: You are trying to merge on object and datetime64[ns] columns for key 'DATETIME'. If you wish to proceed you should use pd.concat