## Initialization

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import pandas_gbq
import datetime as dt
from collections import defaultdict
import time
import datetime as dt
from pytz import timezone
# NOTE(review): pytz's 'EST' is a fixed UTC-5 zone with no daylight-saving shift.
# If DST-aware New York local time is what is wanted, 'America/New_York' would be
# the zone to use - confirm before relying on `tz` to localize timestamps.
tz = timezone('EST')
from tqdm import tqdm

In [None]:
# data visualization
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
# Apply seaborn's default theme to matplotlib figures created after this point.
sns.set()

In [None]:
# PySpark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

### Default constants

In [None]:
# Bucket component of the gs:// URIs this notebook reads from and writes to.
BUCKET_NAME = "homework_rl3154_data"
# First path component under the bucket in those URIs.
PROJECT_BUCKET = "project"
# Folder under PROJECT_BUCKET holding the forecast pickles and the exported CSV.
FOLDER_NAME = "forecast_data"
# NOTE(review): looks like a BigQuery `dataset.table` identifier; it is not
# referenced by the cells visible in this part of the notebook - confirm usage.
BIG_QUERY_TABLE_NAME = "project_dataset.agg_electricity_load_data_hourly"


### Spark initialization

In [None]:
# Build the Spark configuration in a single chained expression
# (each SparkConf setter returns the conf itself).
conf = (
    SparkConf()
    .setMaster('yarn')
    .setAppName("spark-bigquery-prophet")
    # BigQuery Connector
    .set("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.22.0")
)

# Reuse the running SparkContext if one exists, otherwise start one from `conf`.
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point bound to that context.
sqlContext = SQLContext(sc)

In [None]:
# Spark version of the running context, read through the public SparkContext
# attribute instead of reaching into SQLContext's private `_sc` handle.
sc.version

'3.1.2'

# Get Data

In [None]:
# Locations of the pickled forecast results and evaluation metrics in GCS.
forecast_pickle_uri = f"gs://{BUCKET_NAME}/{PROJECT_BUCKET}/{FOLDER_NAME}/forecast_res-2021-12-09.pkl"
eval_pickle_uri = f"gs://{BUCKET_NAME}/{PROJECT_BUCKET}/{FOLDER_NAME}/forecast_eval-2021-12-09.pkl"

# NOTE(review): unpickling runs code embedded in the file, so these objects
# must only ever come from a bucket whose contents are trusted.
forecast_df = pd.read_pickle(forecast_pickle_uri)
eval_df = pd.read_pickle(eval_pickle_uri)


In [None]:
# Display the forecast frame loaded from GCS (pandas truncates long frames in the rendered output).
forecast_df

Unnamed: 0,ds,zone_name,y,yhat,yhat_upper,yhat_lower,trend,holidays,yearly,daily,weekly
0,2010-01-01 00:00:00,CAPITL,12518.099609,12613.607422,16561.296875,9120.713867,14812.665039,-0.095238,0.056130,-0.130641,0.021291
1,2010-01-01 01:00:00,CAPITL,13092.299805,11989.854492,15666.069336,8186.646973,14813.125000,-0.095238,0.056180,-0.172363,0.020830
2,2010-01-01 02:00:00,CAPITL,12562.200195,11588.465820,15455.477539,7901.891602,14813.585938,-0.095238,0.056229,-0.199018,0.020314
3,2010-01-01 03:00:00,CAPITL,12101.299805,11466.584961,15152.679688,7877.807129,14814.046875,-0.095238,0.056278,-0.206743,0.019738
4,2010-01-01 04:00:00,CAPITL,11989.799805,11715.201172,15239.740234,8260.172852,14814.506836,-0.095238,0.056327,-0.189391,0.019095
...,...,...,...,...,...,...,...,...,...,...,...
1148802,2021-11-30 20:00:00,MILLWD,,4885.288086,6073.982422,3612.519531,4080.098145,0.000000,0.001962,0.172233,0.023151
1148803,2021-11-30 21:00:00,MILLWD,,4621.094238,5895.692383,3381.287354,4080.104492,0.000000,0.002109,0.107065,0.023418
1148804,2021-11-30 22:00:00,MILLWD,,4264.531250,5530.609863,2964.978516,4080.110840,0.000000,0.002257,0.019235,0.023708
1148805,2021-11-30 23:00:00,MILLWD,,3896.691162,5133.661621,2598.658447,4080.117188,0.000000,0.002405,-0.071380,0.024019


In [None]:
# Display the evaluation-metrics frame loaded from GCS.
eval_df

Unnamed: 0,training_date,zone_name,mae,mse,mape,rmse
0,2021-12-09,CAPITL,1338.88147,3543972.0,0.079469,1882.543945
1,2021-12-09,CENTRL,1629.996704,5235488.0,0.07201,2288.118896
2,2021-12-09,DUNWOD,801.376038,1314109.0,0.092441,1146.345947
3,2021-12-09,GENESE,1127.091919,2572215.0,0.0801,1603.812744
4,2021-12-09,HUD VL,1307.080322,3498391.0,0.093496,1870.398682
5,2021-12-09,LONGIL,3057.967041,19472320.0,0.100375,4412.745605
6,2021-12-09,MHK VL,874.259399,1425834.0,0.07986,1194.082764
7,2021-12-09,MILLWD,464.994049,407818.3,0.117714,638.606567
8,2021-12-09,N.Y.C.,5754.780762,68619350.0,0.077383,8283.679688
9,2021-12-09,NORTH,356.26062,290110.3,0.049971,538.618896


In [None]:
# Convert the pandas forecast frame to a Spark DataFrame
# (createDataFrame returns a DataFrame, not an RDD).
spark_df = sqlContext.createDataFrame(forecast_df)

# Identify Critical Dates

In [None]:
# Actual (`y`) and forecast (`yhat`) values for timestamps before 2021-11-01.
history_df = forecast_df.loc[
    forecast_df['ds'] < '2021-11-01',
    ['ds', 'zone_name', 'y', 'yhat'],
]
# Per-zone error metrics, attached to every row of the matching zone.
zone_metrics = eval_df[['zone_name', 'mae', 'rmse', 'mape']]

dates_df = history_df.merge(zone_metrics, on='zone_name', how='left')

# Absolute forecast error, and that error as a fraction of the actual value.
dates_df['y_abs_diff'] = (dates_df['y'] - dates_df['yhat']).abs()
dates_df['y_pct_diff'] = dates_df['y_abs_diff'] / dates_df['y']

dates_df

Unnamed: 0,ds,zone_name,y,yhat,mae,rmse,mape,y_abs_diff,y_pct_diff
0,2010-01-01 00:00:00,CAPITL,12518.099609,12613.607422,1338.881470,1882.543945,0.079469,95.507812,0.007630
1,2010-01-01 01:00:00,CAPITL,13092.299805,11989.854492,1338.881470,1882.543945,0.079469,1102.445312,0.084206
2,2010-01-01 02:00:00,CAPITL,12562.200195,11588.465820,1338.881470,1882.543945,0.079469,973.734375,0.077513
3,2010-01-01 03:00:00,CAPITL,12101.299805,11466.584961,1338.881470,1882.543945,0.079469,634.714844,0.052450
4,2010-01-01 04:00:00,CAPITL,11989.799805,11715.201172,1338.881470,1882.543945,0.079469,274.598633,0.022903
...,...,...,...,...,...,...,...,...,...
1140871,2021-10-31 19:00:00,MILLWD,3901.063965,4167.789062,464.994049,638.606567,0.117714,266.725098,0.068372
1140872,2021-10-31 20:00:00,MILLWD,3805.951416,4060.871826,464.994049,638.606567,0.117714,254.920410,0.066979
1140873,2021-10-31 21:00:00,MILLWD,3647.767578,3807.325439,464.994049,638.606567,0.117714,159.557861,0.043741
1140874,2021-10-31 22:00:00,MILLWD,3449.541748,3461.595947,464.994049,638.606567,0.117714,12.054199,0.003494


## Based on RMSE

In [None]:
# A row is anomalous when its absolute error exceeds 5x the zone's RMSE.
is_anomalous = dates_df['y_abs_diff'] > 5 * dates_df['rmse']
anomalous_rows = dates_df[is_anomalous]

# Count the distinct anomalous timestamps per zone and calendar date.
daily_anomaly_counts = (
    anomalous_rows
    .groupby(['zone_name', anomalous_rows['ds'].dt.date])
    .agg(anomaly_occurence=('ds', 'nunique'))
    .reset_index()
)

# Keep only the zone/date pairs with more than 3 anomalous timestamps.
filtered_dates = daily_anomaly_counts[daily_anomaly_counts['anomaly_occurence'] > 3]
filtered_dates

Unnamed: 0,zone_name,ds,anomaly_occurence
7,CAPITL,2011-07-21,4
8,CAPITL,2011-07-22,5
14,CAPITL,2012-06-20,5
15,CAPITL,2012-06-21,6
46,CAPITL,2013-07-18,5
...,...,...,...
1954,WEST,2013-09-11,6
2028,WEST,2016-08-12,4
2071,WEST,2019-06-28,4
2073,WEST,2019-07-02,4


In [None]:
# Write the flagged zone/date pairs to GCS as CSV, without the index column.
twitter_dates_uri = f"gs://{BUCKET_NAME}/{PROJECT_BUCKET}/{FOLDER_NAME}/dates_for_twitter.csv"
filtered_dates.to_csv(twitter_dates_uri, index=False)

In [None]:
# Summary: for each zone, the number of distinct flagged dates and the list of
# those dates, computed in one grouped aggregation.
(
    filtered_dates
    .groupby('zone_name')
    .agg(unique_date=('ds', 'nunique'), ds=('ds', list))
    .reset_index()
)

Unnamed: 0,zone_name,unique_date,ds
0,CAPITL,11,"[2011-07-21, 2011-07-22, 2012-06-20, 2012-06-2..."
1,CENTRL,8,"[2011-07-21, 2011-07-22, 2012-06-20, 2012-06-2..."
2,DUNWOD,16,"[2011-07-22, 2012-06-21, 2012-06-29, 2013-05-3..."
3,GENESE,13,"[2011-07-21, 2011-07-22, 2012-06-20, 2012-06-2..."
4,HUD VL,15,"[2011-06-09, 2011-07-21, 2011-07-22, 2012-06-2..."
5,LONGIL,13,"[2011-07-22, 2012-06-21, 2012-06-29, 2012-07-0..."
6,MHK VL,5,"[2011-07-21, 2011-07-22, 2012-06-21, 2012-06-2..."
7,MILLWD,9,"[2011-06-08, 2011-06-09, 2011-07-22, 2012-06-2..."
8,N.Y.C.,14,"[2011-07-22, 2012-06-21, 2012-06-29, 2013-05-3..."
9,NORTH,5,"[2012-06-21, 2012-10-31, 2014-01-03, 2014-02-2..."
