# Cheat sheet notebook for loading and preprocessing synthetic dataset

In [1]:
# In the root of the repo run "pip install --editable ."

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

# Replace sample_func_dir by the name of directory in src/ and replace sample_func_file the
# file name in src/sample_func_dir
from src.data.import_data import *
from src.data.make_dataset import *

## Load synthetic dataset

The function `import_df_from_zip_pkl` can be used to import a dataframe from a zip archive

In [2]:
# Select the "basic" synthetic archive and the dataframe index to load.
# NOTE(review): `index` presumably picks one dataframe inside the archive — confirm.
synth_name = "basic"
index = 1
path_to_zip = "../data/raw/synthetic_{}.zip".format(synth_name)

# With verbose=True the call reports the import time (see the output below).
df = import_df_from_zip_pkl(path_to_zip, index, verbose=True)

time for importing dataframe: 0.71 seconds


In [3]:
# Summary of the raw minute-resolution frame: index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2629440 entries, 2015-01-01 00:00:00-05:00 to 2019-12-31 23:59:00-05:00
Freq: T
Data columns (total 8 columns):
Power            float64
POA              float64
Tamb             float64
Wind             float64
Degradation      float64
Soiling          int64
Faults           int64
minute_of_day    int64
dtypes: float64(5), int64(3)
memory usage: 180.5 MB


In [4]:
# Peek at the first rows of the raw frame.
df.head()

Unnamed: 0_level_0,Power,POA,Tamb,Wind,Degradation,Soiling,Faults,minute_of_day
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-01 00:00:00-05:00,-1.0,0.0,5.1,0.0,1.0,1,1,0
2015-01-01 00:01:00-05:00,-1.0,0.0,5.1,0.0,1.0,1,1,1
2015-01-01 00:02:00-05:00,-1.0,0.0,5.1,0.0,1.0,1,1,2
2015-01-01 00:03:00-05:00,-1.0,0.0,5.1,0.0,1.0,1,1,3
2015-01-01 00:04:00-05:00,-1.0,0.0,5.1,0.0,1.0,1,1,4


## Preprocess data

There are three steps:
- Remove night time periods
- Remove clipping time periods
- Downgrade from minute to daily frequency

There are two ways of applying these three preprocessing steps:
1. Call a function for each of the three steps
2. Call the downgrade function `downsample_dataframe` with the optional parameters `night_method` & `clip_method`

### 1. Call a function for each of the three steps

In [5]:
# Step 1: remove night time periods.
# NOTE: `df` is rebound to the result, so re-running this cell operates on the
# already-filtered frame rather than on the freshly loaded one.
df = remove_night_time_data(df)

In [6]:
# Step 2: remove clipping time periods (two possibilities: universal time window or flexible time window).
# This cell uses the universal-window variant.
df = remove_clipping_with_universal_window(df)

In [7]:
# Step 3: downsample (downgrade) from minute to daily frequency, using the
# function's default arguments.
df = downsample_dataframe(df)

In [8]:
# Check the result of the three steps: index frequency, remaining columns and null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1826 entries, 2015-01-01 00:00:00-05:00 to 2019-12-31 00:00:00-05:00
Freq: D
Data columns (total 4 columns):
Power          1826 non-null float64
Degradation    1826 non-null float64
Soiling        1826 non-null int64
Faults         1826 non-null int64
dtypes: float64(2), int64(2)
memory usage: 151.3 KB


In [9]:
# First rows of the preprocessed daily frame.
df.head()

Unnamed: 0_level_0,Power,Degradation,Soiling,Faults
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01 00:00:00-05:00,376.825397,0.99997,1,1
2015-01-02 00:00:00-05:00,381.884462,0.99994,1,1
2015-01-03 00:00:00-05:00,384.142857,0.9999,1,1
2015-01-04 00:00:00-05:00,386.715415,0.99986,1,1
2015-01-05 00:00:00-05:00,389.570866,0.99982,1,1


### 2. Call the downgrade function with the optional parameters

In [10]:
# Reload a fresh minute-level frame (index 2 of the "basic" archive) so the
# one-call variant below starts from unprocessed data.
synth_name = "basic"
index = 2
path_to_zip = "../data/raw/synthetic_{}.zip".format(synth_name)

df = import_df_from_zip_pkl(path_to_zip, index, verbose=True)

time for importing dataframe: 0.71 seconds


In [11]:
# Clipping removal and downsampling to hourly frequency in a single call;
# np.max is passed as the power sampling function.
# Set <clip_method> to 'flexible' to remove clipping data with flexible window.
# NOTE(review): night_method=None presumably skips night-time removal — confirm.
df = downsample_dataframe(
    df,
    offset='H',
    night_method=None,
    clip_method='universal',
    power_sampling_function=np.max,
)

In [12]:
# Inspect the hourly rows between 06:00 and 12:00 on the first day (label-based,
# both endpoints included).
df.loc['2015-01-01 06:00':'2015-01-01 12:00']

Unnamed: 0_level_0,Power,Degradation,Soiling,Faults
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01 06:00:00-05:00,-1.0,0.99999,1.0,1.0
2015-01-01 07:00:00-05:00,19.0,0.99999,1.0,1.0
2015-01-01 08:00:00-05:00,384.0,0.99999,1.0,1.0
2015-01-01 09:00:00-05:00,752.0,0.99999,1.0,1.0
2015-01-01 10:00:00-05:00,,,,
2015-01-01 11:00:00-05:00,,,,
2015-01-01 12:00:00-05:00,,,,


In [13]:
# Summary of the hourly frame; compare the non-null counts with the number of entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43824 entries, 2015-01-01 00:00:00-05:00 to 2019-12-31 23:00:00-05:00
Freq: H
Data columns (total 4 columns):
Power          36520 non-null float64
Degradation    36520 non-null float64
Soiling        36520 non-null float64
Faults         36520 non-null float64
dtypes: float64(4)
memory usage: 2.9 MB


In [14]:
# Reload another fresh minute-level frame (index 3 of the "basic" archive)
# for the daily one-call example.
synth_name = "basic"
index = 3
path_to_zip = "../data/raw/synthetic_{}.zip".format(synth_name)

df = import_df_from_zip_pkl(path_to_zip, index, verbose=True)

time for importing dataframe: 0.74 seconds


In [15]:
# Same single-call preprocessing as the hourly example, but downsampled to
# daily frequency; np.max is passed as the power sampling function.
df = downsample_dataframe(
    df,
    offset='D',
    night_method=None,
    clip_method='universal',
    power_sampling_function=np.max,
)

In [16]:
# First rows of the daily frame produced by the single-call variant.
df.head()

Unnamed: 0_level_0,Power,Degradation,Soiling,Faults
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01 00:00:00-05:00,900.0,0.99996,1,1
2015-01-02 00:00:00-05:00,908.0,0.99993,1,1
2015-01-03 00:00:00-05:00,916.0,0.99989,1,1
2015-01-04 00:00:00-05:00,924.0,0.99986,1,1
2015-01-05 00:00:00-05:00,933.0,0.99982,1,1
