In [65]:
# Imports requirements
import glob
import os

import pandas as pd

In [66]:
# Get CSV files list from a folder
csv_path = 'raw_data'

# glob returns matches in arbitrary, OS-dependent order; sort them so the
# concatenated frame has the same row order on every run and every machine.
csv_files = sorted(glob.glob(csv_path + "/*.csv"))

# Fail early with a readable message instead of pd.concat's generic
# "No objects to concatenate" when the folder is empty or missing.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in folder '{csv_path}'")

# Read each CSV file into a DataFrame (lazily, one file at a time)
csv_df_list = (pd.read_csv(csv_file) for csv_file in csv_files)

# Concatenate all DataFrames into one frame with a fresh 0..n-1 index
wind_turbine_raw_data = pd.concat(csv_df_list, ignore_index=True, axis=0)

# Convert the timestamp column to datetime (needed for time-based resampling)
wind_turbine_raw_data['timestamp'] = pd.to_datetime(wind_turbine_raw_data['timestamp'])

In [67]:
# Preview the first rows of the concatenated raw data
wind_turbine_raw_data.head()

Unnamed: 0,timestamp,turbine_id,wind_speed,wind_direction,power_output
0,2022-03-01,1,11.8,169,2.7
1,2022-03-01,2,11.6,24,2.2
2,2022-03-01,3,13.8,335,2.3
3,2022-03-01,4,12.8,238,1.9
4,2022-03-01,5,11.4,103,3.5


In [68]:
# Check column dtypes and non-null counts of the raw data
wind_turbine_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   timestamp       11160 non-null  datetime64[ns]
 1   turbine_id      11160 non-null  int64         
 2   wind_speed      11160 non-null  float64       
 3   wind_direction  11160 non-null  int64         
 4   power_output    11160 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 436.1 KB


In [69]:
# View the distinct turbine IDs present in the raw data
wind_turbine_raw_data['turbine_id'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
      dtype=int64)

In [70]:
# Resample a frame to a fixed time frequency
def resample_data(
    df: pd.DataFrame,
    freq: str = '1h',
    on: str = 'timestamp'
) -> pd.DataFrame:
    """Resample `df` to a regular time grid and average each bin.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a datetime column named by `on`; every other
        column is averaged, so it must support `mean()`.
    freq : str, default '1h'
        Resampling frequency passed to `DataFrame.resample`.
    on : str, default 'timestamp'
        Name of the datetime column to resample on.

    Returns
    -------
    pd.DataFrame
        One row per time bin, indexed by the `on` column, holding the mean
        of the remaining columns within that bin.
    """
    return df.resample(freq, on=on).mean()

In [71]:
# Resample every turbine's readings with `resample_data`.
# The grouping key is deliberately left out of the frame handed to the
# function and recovered from the group index instead. Letting
# `groupby.apply` pass the key column through (and averaging it as a float)
# is deprecated in recent pandas and breaks once grouping columns are
# excluded from `apply`.
wind_turbine_interim_data = (
    wind_turbine_raw_data
    .groupby('turbine_id', group_keys = True)[wind_turbine_raw_data.columns.drop('turbine_id').tolist()]
    .apply(resample_data)
    # index is (turbine_id, timestamp) -> turn both levels back into columns
    .reset_index()
    # restore the raw column order
    .reindex(columns = wind_turbine_raw_data.columns)
)
# The key now comes from the group index and is already integral; the cast is
# kept so the resulting dtype matches what this cell produced before.
wind_turbine_interim_data['turbine_id'] = wind_turbine_interim_data['turbine_id'].astype(int)

In [72]:
# Check column dtypes and non-null counts of the resampled data
wind_turbine_interim_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   timestamp       11160 non-null  datetime64[ns]
 1   turbine_id      11160 non-null  int32         
 2   wind_speed      11160 non-null  float64       
 3   wind_direction  11160 non-null  float64       
 4   power_output    11160 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int32(1)
memory usage: 392.5 KB


In [73]:
# Preview the first rows of the resampled data
wind_turbine_interim_data.head()

Unnamed: 0,timestamp,turbine_id,wind_speed,wind_direction,power_output
0,2022-03-01 00:00:00,1,11.8,169.0,2.7
1,2022-03-01 01:00:00,1,11.6,152.0,4.4
2,2022-03-01 02:00:00,1,13.8,73.0,2.9
3,2022-03-01 03:00:00,1,10.5,61.0,1.8
4,2022-03-01 04:00:00,1,9.1,209.0,2.3


In [105]:
# Calculate summary statistics
def calculate_summary_statistics(
    df: pd.DataFrame,
    summary_col = 'power_output'

) -> "pd.Series | pd.DataFrame":

    """Return the minimum, maximum and mean of `summary_col` in `df`.

    The function summarises whatever rows it is given; restricting the
    data to one turbine or one time period is the caller's job (e.g. by
    applying it per group of a `groupby`).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the column(s) named by `summary_col`.
    summary_col : str or list of str, default 'power_output'
        Numeric column (or columns) to summarise.

    Returns
    -------
    pd.Series or pd.DataFrame
        For a single column name: a Series indexed by 'min', 'max', 'mean'.
        For a list of columns: a DataFrame with one row per column and the
        columns 'min', 'max', 'mean'.
    """

    # `.T` is a no-op for a single column (Series); for a list of columns it
    # puts the statistics on the column axis so the selection below works.
    turbine_summary = df[summary_col].describe().T

    return turbine_summary[['min', 'max', 'mean']]

In [102]:
# Compute min / max / mean power output per turbine.
# Only the summarised column is handed to the function, so the grouping key
# is never passed into `apply` (deprecated in recent pandas); `turbine_id`
# ends up as the index of the result.
wind_turbine_summary_statistics_data = (
    wind_turbine_interim_data
    .groupby(['turbine_id'], group_keys = True)[['power_output']]
    .apply(calculate_summary_statistics)
    .rename(columns = {"min": "power_min", "max": "power_max", "mean": "power_mean"})
    .round(2)
)

In [103]:
# Display the per-turbine summary statistics table
wind_turbine_summary_statistics_data

power_output,power_min,power_max,power_mean
turbine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.5,4.5,3.02
2,1.5,4.5,2.98
3,1.5,4.5,2.98
4,1.5,4.5,2.95
5,1.5,4.5,3.02
...,...,...,...
11,1.5,4.5,2.96
12,1.5,4.5,3.05
13,1.5,4.5,3.03
14,1.5,4.5,3.02


In [104]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs("./processed_data", exist_ok=True)
wind_turbine_summary_statistics_data.to_csv(r"./processed_data/wind_turbine_summary_statistics_data.csv", index=True)

In [77]:
# Anomaly detection
def identifies_anomalies(
    df: pd.DataFrame,
    std_num = 2,
    replace_outlier = True
    ) -> pd.DataFrame:

    """
    Cap anomalous power readings in `df`.

    A reading counts as an outlier when it is at or above
    ``mean + std_num * std`` of the `power_output` values in `df`. The
    result carries a new column `power_max_limit` holding `power_output`
    with outliers replaced by that limit and negative readings replaced
    by 0, rounded to two decimals.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a numeric `power_output` column. It is not modified.
    std_num : int or float, default 2
        Number of standard deviations above the mean that defines the limit.
    replace_outlier : bool, default True
        When false, no limit is computed and an unmodified copy of `df` is
        returned (so the function always returns a DataFrame).

    Returns
    -------
    pd.DataFrame
        A new frame; with `replace_outlier` true it has the extra
        `power_max_limit` column.
    """

    if not replace_outlier:
        return df.copy()

    power = df['power_output']

    # Outlier limit power
    outlier_limit_power = power.mean() + (power.std() * std_num)

    # Vectorised equivalent of the per-row rule
    #   limit if x >= limit, else 0 if x < 0, else x
    # The upper cap is applied last (on the original values) so it keeps
    # priority over the zero floor; NaN readings match neither test and stay NaN.
    capped_power = (
        power
        .mask(power < 0, 0)
        .mask(power >= outlier_limit_power, outlier_limit_power)
        .round(2)
    )

    # assign() returns a new frame, leaving the caller's data untouched
    return df.assign(power_max_limit = capped_power)

In [78]:
# Find anomalies, turbine by turbine.
# The groups are iterated explicitly instead of going through
# `groupby(...).apply(...)`: iteration always yields the full sub-frame,
# including `turbine_id`, whereas `apply` passing the grouping column through
# is deprecated in recent pandas and would drop that column from the result.
# Rows come out grouped by turbine_id with their original index labels.
wind_turbine_procesed_data = pd.concat(
    identifies_anomalies(turbine_rows)
    for _, turbine_rows in wind_turbine_interim_data.groupby('turbine_id')
)

wind_turbine_procesed_data

Unnamed: 0,timestamp,turbine_id,wind_speed,wind_direction,power_output,power_max_limit
0,2022-03-01 00:00:00,1,11.8,169.0,2.7,2.7
1,2022-03-01 01:00:00,1,11.6,152.0,4.4,4.4
2,2022-03-01 02:00:00,1,13.8,73.0,2.9,2.9
3,2022-03-01 03:00:00,1,10.5,61.0,1.8,1.8
4,2022-03-01 04:00:00,1,9.1,209.0,2.3,2.3
...,...,...,...,...,...,...
11155,2022-03-31 19:00:00,15,11.1,241.0,2.1,2.1
11156,2022-03-31 20:00:00,15,12.0,220.0,1.7,1.7
11157,2022-03-31 21:00:00,15,13.5,137.0,2.2,2.2
11158,2022-03-31 22:00:00,15,14.5,350.0,4.4,4.4


In [82]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs("./processed_data", exist_ok=True)
wind_turbine_procesed_data.to_csv(r"./processed_data/wind_turbine_procesed_data.csv", index=False)