# Cheat sheet notebook for loading and preprocessing synthetic dataset

In [1]:
# In the root of the repo run "pip install --editable ."

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

# Replace sample_func_dir by the name of directory in src/ and replace sample_func_file the
# file name in src/sample_func_dir
from src.data.import_data import import_df_from_zip
from src.data.make_dataset import *

## Load synthetic dataset

The function `import_df_from_zip` can be used to import a DataFrame from a zip archive.

In [2]:
# Pick which synthetic dataset to load and which entry of the archive to read.
# NOTE(review): csv_index presumably selects one CSV inside the zip - confirm in import_df_from_zip.
csv_index = 1
synth_name = "basic"

# Archive location, relative to the notebook's working directory.
path_to_zip = f"../data/raw/synthetic_{synth_name}.zip"

# verbose=True makes the loader report how long the import took.
df = import_df_from_zip(path_to_zip, csv_index, verbose=True)

time for importing dataframe: 67.29 seconds


In [3]:
# Summarize the loaded frame: index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2629440 entries, 2015-01-01 00:00:00-05:00 to 2019-12-31 23:59:00-05:00
Data columns (total 12 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Power                    float64
 1   POA                      float64
 2   Tamb                     float64
 3   Wind                     float64
 4   Degradation_rate_per_yr  float64
 5   soiling                  int64  
 6   year                     int64  
 7   month                    int64  
 8   day                      int64  
 9   hour                     int64  
 10  minute_of_hour           int64  
 11  minute_of_day            int64  
dtypes: float64(5), int64(7)
memory usage: 260.8 MB


In [4]:
# Preview the first rows of the minute-level data.
df.head()

Unnamed: 0,Power,POA,Tamb,Wind,Degradation_rate_per_yr,soiling,year,month,day,hour,minute_of_hour,minute_of_day
2015-01-01 00:00:00-05:00,-1.0,0.0,5.1,0.0,-0.01436,1,2015,1,1,0,0,0
2015-01-01 00:01:00-05:00,-1.0,0.0,5.1,0.0,-0.01436,1,2015,1,1,0,1,1
2015-01-01 00:02:00-05:00,-1.0,0.0,5.1,0.0,-0.01436,1,2015,1,1,0,2,2
2015-01-01 00:03:00-05:00,-1.0,0.0,5.1,0.0,-0.01436,1,2015,1,1,0,3,3
2015-01-01 00:04:00-05:00,-1.0,0.0,5.1,0.0,-0.01436,1,2015,1,1,0,4,4


## Preprocess data

There are three steps:
- Remove night time periods
- Remove clipping time periods
- Downsample from minute to daily frequency

In [5]:
# Remove night time periods.
# Note: `df` is rebound to the function's result, so re-running this cell passes the
# already-processed frame back in rather than the frame loaded from the zip.
df = remove_night_time_data(df)

In [6]:
# Remove clipping time periods (two possibilities: a universal time window or a daily
# time window); this cell uses the universal-window variant.
# NOTE(review): the call prints a (rows, columns) tuple - presumably the shape of the
# frame after removal; confirm in remove_clipping_with_universal_window.
df = remove_clipping_with_universal_window(df)

(213267, 12)


In [7]:
# Downsample from minute to daily frequency.
df = downsample_dataframe(df)

In [8]:
# Inspect the downsampled frame: it now has one row per day.
# Note: every column is float64 after downsampling, including the calendar columns
# (soiling, year, month, day, hour, ...) that were int64 in the minute-level frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1826 entries, 2015-01-01 00:00:00-05:00 to 2019-12-31 00:00:00-05:00
Freq: D
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Power                    1826 non-null   float64
 1   POA                      1826 non-null   float64
 2   Tamb                     1826 non-null   float64
 3   Wind                     1826 non-null   float64
 4   Degradation_rate_per_yr  1826 non-null   float64
 5   soiling                  1826 non-null   float64
 6   year                     1826 non-null   float64
 7   month                    1826 non-null   float64
 8   day                      1826 non-null   float64
 9   hour                     1826 non-null   float64
 10  minute_of_hour           1826 non-null   float64
 11  minute_of_day            1826 non-null   float64
dtypes: float64(12)
memory usage: 185.5 KB


In [9]:
# Preview the daily values.
# NOTE(review): hour, minute_of_hour and minute_of_day now hold fractional values; they
# look like daily averages over the retained minutes (confirm in downsample_dataframe),
# so they no longer describe a single timestamp.
df.head()

Unnamed: 0,Power,POA,Tamb,Wind,Degradation_rate_per_yr,soiling,year,month,day,hour,minute_of_hour,minute_of_day
2015-01-01 00:00:00-05:00,376.821429,169.214286,7.361111,0.0,-0.01436,1.0,2015.0,1.0,1.0,12.150794,30.373016,759.420635
2015-01-02 00:00:00-05:00,381.860558,171.207171,7.328287,0.0,-0.01436,1.0,2015.0,1.0,2.0,12.171315,30.282869,760.561753
2015-01-03 00:00:00-05:00,384.126984,172.051587,7.291667,0.0,-0.01436,1.0,2015.0,1.0,3.0,12.190476,30.166667,761.595238
2015-01-04 00:00:00-05:00,386.70751,172.956522,7.251383,0.0,-0.01436,1.0,2015.0,1.0,4.0,12.209486,30.055336,762.624506
2015-01-05 00:00:00-05:00,389.547244,173.980315,7.217323,0.0,-0.01436,1.0,2015.0,1.0,5.0,12.228346,29.948819,763.649606
