### Import Packages

In [55]:
# Basics
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000)
from datetime import datetime as dt

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Warnings
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

### Load the data

In [56]:
# Print the working directory; the relative data path used when loading the CSV is resolved against it.
# NOTE(review): the rendered output exposes a local absolute path — consider clearing it before sharing.
!pwd

/Users/Alice/Documents/Omdena/Local Chapters - Chennai/chennai-india-power-outage/src/tasks/task-06-Model Building


In [57]:
# Read the monthly demand/supply CSV into `df`, then preview its first five rows
df = pd.read_csv(filepath_or_buffer='../../data/cleaned data/demand_supply_monthly.csv')
df.head(n=5)

Unnamed: 0,Requirement(MU/DAY),Energy_met(MU/DAY),Surplus(+)/Deficit(-)(MU/DAY),Requirement(MU),Energy_met(MU),Surplus(+)/Deficit(-)(MU),%Shortage(MU),Requirement(MW),Peak_Demand_Met5(MW),Surplus(+)/Deficit(-)(MW),%Shortage(MW),name_report,year,month,date_published,month_published,year_published
0,354.0,354.0,-0.9,10632.0,10605.0,-26.0,-0.2,17646.0,17563.0,-83.0,-0.5,Monthly_Report_Apr_2022,2022,4,2022-05-23,5,2022
1,337.0,337.0,0.0,10459.0,10458.0,-1.0,0.0,16906.0,16906.0,0.0,0.0,Monthly_Report_May_2022,2022,5,2022-06-23,6,2022
2,344.0,343.0,-0.3,10657.0,10648.0,-9.0,-0.1,17196.0,17196.0,0.0,0.0,Monthly_Report_Mar_2022,2022,3,2022-04-25,4,2022
3,286.0,286.0,0.0,8873.0,8873.0,1.0,0.0,15290.0,15290.0,0.0,0.0,Monthly_Report_Jan_2022,2022,1,2022-02-23,2,2022
4,245.0,245.0,0.0,7362.0,7360.0,-1.0,0.0,13501.0,13480.0,-21.0,-0.2,Monthly_Report_Nov_2021,2021,11,2021-12-23,12,2021


In [58]:
# Summarise column dtypes and non-null counts for every column.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated in 1.2
# and removed in 2.0 (where passing it raises a TypeError).
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Requirement(MU/DAY)            122 non-null    float64
 1   Energy_met(MU/DAY)             122 non-null    float64
 2   Surplus(+)/Deficit(-)(MU/DAY)  122 non-null    float64
 3   Requirement(MU)                122 non-null    float64
 4   Energy_met(MU)                 122 non-null    float64
 5   Surplus(+)/Deficit(-)(MU)      122 non-null    float64
 6   %Shortage(MU)                  122 non-null    float64
 7   Requirement(MW)                122 non-null    float64
 8   Peak_Demand_Met5(MW)           122 non-null    float64
 9   Surplus(+)/Deficit(-)(MW)      122 non-null    float64
 10  %Shortage(MW)                  122 non-null    float64
 11  name_report                    122 non-null    object 
 12  year                           122 non-null    int

In [59]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Requirement(MU/DAY),Energy_met(MU/DAY),Surplus(+)/Deficit(-)(MU/DAY),Requirement(MU),Energy_met(MU),Surplus(+)/Deficit(-)(MU),%Shortage(MU),Requirement(MW),Peak_Demand_Met5(MW),Surplus(+)/Deficit(-)(MW),%Shortage(MW),year,month,month_published,year_published
count,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0
mean,283.758197,276.782787,-6.989344,8631.884426,8419.755738,-212.110656,-2.718033,14066.011475,13849.101639,-216.893443,-1.734426,2016.836066,6.467213,6.598361,2017.131148
std,27.948258,36.6943,14.425527,864.073361,1127.604563,438.266568,5.696266,1470.967073,1771.175005,583.523308,4.757991,2.969155,3.447803,3.271868,2.721056
min,217.0,177.0,-60.0,6511.1,5302.0,-1860.0,-24.0,10161.0,8518.0,-3745.0,-28.0,2012.0,1.0,1.0,2013.0
25%,264.5,254.0,-7.0,7994.75,7759.75,-216.525,-2.825,12969.0,12732.0,-161.25,-1.25,2014.0,4.0,4.0,2014.0
50%,280.5,279.05,-0.2,8534.1,8431.0,-5.45,-0.1,14290.5,14225.5,-23.0,-0.2,2017.0,6.0,7.0,2017.0
75%,302.875,302.8,0.0,9201.15,9201.65,-0.75,0.0,15024.5,15007.25,0.0,0.0,2019.0,9.0,9.0,2019.0
max,354.0,354.0,0.1,10657.0,10648.0,3.0,0.0,17646.0,17563.0,2360.0,20.7,2022.0,12.0,12.0,2022.0


## Time Series Data

### Prepare time series data

In [60]:
# Preview the raw rows before building the time-series index
df.head()

Unnamed: 0,Requirement(MU/DAY),Energy_met(MU/DAY),Surplus(+)/Deficit(-)(MU/DAY),Requirement(MU),Energy_met(MU),Surplus(+)/Deficit(-)(MU),%Shortage(MU),Requirement(MW),Peak_Demand_Met5(MW),Surplus(+)/Deficit(-)(MW),%Shortage(MW),name_report,year,month,date_published,month_published,year_published
0,354.0,354.0,-0.9,10632.0,10605.0,-26.0,-0.2,17646.0,17563.0,-83.0,-0.5,Monthly_Report_Apr_2022,2022,4,2022-05-23,5,2022
1,337.0,337.0,0.0,10459.0,10458.0,-1.0,0.0,16906.0,16906.0,0.0,0.0,Monthly_Report_May_2022,2022,5,2022-06-23,6,2022
2,344.0,343.0,-0.3,10657.0,10648.0,-9.0,-0.1,17196.0,17196.0,0.0,0.0,Monthly_Report_Mar_2022,2022,3,2022-04-25,4,2022
3,286.0,286.0,0.0,8873.0,8873.0,1.0,0.0,15290.0,15290.0,0.0,0.0,Monthly_Report_Jan_2022,2022,1,2022-02-23,2,2022
4,245.0,245.0,0.0,7362.0,7360.0,-1.0,0.0,13501.0,13480.0,-21.0,-0.2,Monthly_Report_Nov_2021,2021,11,2021-12-23,12,2021


In [61]:
# Check dtypes and non-null counts; the listed columns show no missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Requirement(MU/DAY)            122 non-null    float64
 1   Energy_met(MU/DAY)             122 non-null    float64
 2   Surplus(+)/Deficit(-)(MU/DAY)  122 non-null    float64
 3   Requirement(MU)                122 non-null    float64
 4   Energy_met(MU)                 122 non-null    float64
 5   Surplus(+)/Deficit(-)(MU)      122 non-null    float64
 6   %Shortage(MU)                  122 non-null    float64
 7   Requirement(MW)                122 non-null    float64
 8   Peak_Demand_Met5(MW)           122 non-null    float64
 9   Surplus(+)/Deficit(-)(MW)      122 non-null    float64
 10  %Shortage(MW)                  122 non-null    float64
 11  name_report                    122 non-null    object 
 12  year                           122 non-null    int

In [62]:
# Inspect the current index: still the default RangeIndex at this point
df.index

RangeIndex(start=0, stop=122, step=1)

In [63]:
# Derive one timestamp per row from its year and month values.
# Both columns are cast to text so they can be joined as "<year>-<month>";
# parsing with '%Y-%m' then yields the first day of that month.
df['year'] = df['year'].astype(str)
df['month'] = df['month'].astype(str)
df['ts_date'] = pd.to_datetime(
    df['year'].str.cat(df['month'], sep='-'),
    format='%Y-%m',
)

In [64]:
# Verify that the new ts_date column was added
df.head()

Unnamed: 0,Requirement(MU/DAY),Energy_met(MU/DAY),Surplus(+)/Deficit(-)(MU/DAY),Requirement(MU),Energy_met(MU),Surplus(+)/Deficit(-)(MU),%Shortage(MU),Requirement(MW),Peak_Demand_Met5(MW),Surplus(+)/Deficit(-)(MW),%Shortage(MW),name_report,year,month,date_published,month_published,year_published,ts_date
0,354.0,354.0,-0.9,10632.0,10605.0,-26.0,-0.2,17646.0,17563.0,-83.0,-0.5,Monthly_Report_Apr_2022,2022,4,2022-05-23,5,2022,2022-04-01
1,337.0,337.0,0.0,10459.0,10458.0,-1.0,0.0,16906.0,16906.0,0.0,0.0,Monthly_Report_May_2022,2022,5,2022-06-23,6,2022,2022-05-01
2,344.0,343.0,-0.3,10657.0,10648.0,-9.0,-0.1,17196.0,17196.0,0.0,0.0,Monthly_Report_Mar_2022,2022,3,2022-04-25,4,2022,2022-03-01
3,286.0,286.0,0.0,8873.0,8873.0,1.0,0.0,15290.0,15290.0,0.0,0.0,Monthly_Report_Jan_2022,2022,1,2022-02-23,2,2022,2022-01-01
4,245.0,245.0,0.0,7362.0,7360.0,-1.0,0.0,13501.0,13480.0,-21.0,-0.2,Monthly_Report_Nov_2021,2021,11,2021-12-23,12,2021,2021-11-01


In [65]:
# Promote ts_date from a regular column to the row index
df = df.set_index(keys='ts_date')

In [66]:
# Confirm that ts_date is now the index
df.head()

Unnamed: 0_level_0,Requirement(MU/DAY),Energy_met(MU/DAY),Surplus(+)/Deficit(-)(MU/DAY),Requirement(MU),Energy_met(MU),Surplus(+)/Deficit(-)(MU),%Shortage(MU),Requirement(MW),Peak_Demand_Met5(MW),Surplus(+)/Deficit(-)(MW),%Shortage(MW),name_report,year,month,date_published,month_published,year_published
ts_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-04-01,354.0,354.0,-0.9,10632.0,10605.0,-26.0,-0.2,17646.0,17563.0,-83.0,-0.5,Monthly_Report_Apr_2022,2022,4,2022-05-23,5,2022
2022-05-01,337.0,337.0,0.0,10459.0,10458.0,-1.0,0.0,16906.0,16906.0,0.0,0.0,Monthly_Report_May_2022,2022,5,2022-06-23,6,2022
2022-03-01,344.0,343.0,-0.3,10657.0,10648.0,-9.0,-0.1,17196.0,17196.0,0.0,0.0,Monthly_Report_Mar_2022,2022,3,2022-04-25,4,2022
2022-01-01,286.0,286.0,0.0,8873.0,8873.0,1.0,0.0,15290.0,15290.0,0.0,0.0,Monthly_Report_Jan_2022,2022,1,2022-02-23,2,2022
2021-11-01,245.0,245.0,0.0,7362.0,7360.0,-1.0,0.0,13501.0,13480.0,-21.0,-0.2,Monthly_Report_Nov_2021,2021,11,2021-12-23,12,2021


In [67]:
# List the remaining column labels (ts_date is the index now, so it is not among them)
df.columns

Index(['Requirement(MU/DAY)', 'Energy_met(MU/DAY)',
       'Surplus(+)/Deficit(-)(MU/DAY)', 'Requirement(MU)', 'Energy_met(MU)',
       'Surplus(+)/Deficit(-)(MU)', '%Shortage(MU)', 'Requirement(MW)',
       'Peak_Demand_Met5(MW)', 'Surplus(+)/Deficit(-)(MW)', '%Shortage(MW)',
       'name_report', 'year', 'month', 'date_published', 'month_published',
       'year_published'],
      dtype='object')

In [68]:
# Build the time-series frame `ts` by discarding every column except the
# monthly (MU) requirement, energy met, surplus/deficit and %shortage figures
ts = df.drop(
    [
        'Requirement(MU/DAY)',
        'Energy_met(MU/DAY)',
        'Surplus(+)/Deficit(-)(MU/DAY)',
        'Requirement(MW)',
        'Peak_Demand_Met5(MW)',
        'Surplus(+)/Deficit(-)(MW)',
        '%Shortage(MW)',
        'name_report',
        'year',
        'month',
        'date_published',
        'month_published',
        'year_published',
    ],
    axis='columns',
)

In [69]:
# Preview the reduced frame: only the four MU-based columns remain
ts.head()

Unnamed: 0_level_0,Requirement(MU),Energy_met(MU),Surplus(+)/Deficit(-)(MU),%Shortage(MU)
ts_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-04-01,10632.0,10605.0,-26.0,-0.2
2022-05-01,10459.0,10458.0,-1.0,0.0
2022-03-01,10657.0,10648.0,-9.0,-0.1
2022-01-01,8873.0,8873.0,1.0,0.0
2021-11-01,7362.0,7360.0,-1.0,0.0


## Investigating Time Series with Datetime Objects

Each entry of the datetime index is a pandas `Timestamp`, which exposes parts of the date as attributes, such as `day`, `month`, and `year`:

In [70]:
# Full timestamp of the first index entry
ts.index[0]

Timestamp('2022-04-01 00:00:00')

In [71]:
# Day of the month (1 here, since ts_date was parsed from year and month only)
ts.index[0].day

1

In [72]:
# Month component of the first index entry
ts.index[0].month

4

In [73]:
# Year component of the first index entry
ts.index[0].year

2022

In [74]:
# Sort rows chronologically by the datetime index (the index is kept, not reset)
ts = ts.sort_index()

In [75]:
# First rows after sorting: the series now starts at the earliest month
ts.head()

Unnamed: 0_level_0,Requirement(MU),Energy_met(MU),Surplus(+)/Deficit(-)(MU),%Shortage(MU)
ts_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-04-01,7707.0,5940.0,-1767.0,-23.0
2012-05-01,7605.0,6649.0,-956.0,-13.0
2012-06-01,7928.0,6893.0,-1035.0,-13.0
2012-07-01,7981.0,7272.0,-709.0,-9.0
2012-08-01,7851.0,6773.0,-1078.0,-14.0
