**Data Wrangling & Feature Engineering**

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from windrose import WindroseAxes
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
%matplotlib inline 
#os.getcwd()
#os.chdir('C:\\Users\\amit_\\Desktop\\SPRINGBOARD PROJECTS\\Project-Cap3')

In [None]:
# Load the turbine readings from T1.csv and preview the first rows.
# NOTE(review): the relative '../input/...' path is presumably the Kaggle input layout — adjust when running elsewhere.
raw_data = pd.read_csv('../input/wind-turbine-scada-dataset/T1.csv')
raw_data.head()

**Information about features**

-Date/Time (for 10 minutes intervals)

-LV ActivePower (kW): The power generated by the turbine for that moment

-Wind Speed (m/s): The wind speed at the hub height of the turbine (the wind speed that the turbine uses for electricity generation)

-TheoreticalPowerCurve (KWh): The theoretical power values that the turbine generates with that wind speed which is given by the turbine manufacturer

-Wind Direction (°): The wind direction at the hub height of the turbine (wind turbines turn to this direction automatically)

In [None]:
raw_data.info()

In [None]:
 #convert the date column to datetime object
# NOTE(review): T1.csv is understood to store timestamps day-first as "DD MM YYYY HH:MM"
# (e.g. "31 12 2018 23:50") — confirm against the file. Passing the format explicitly stops
# pandas from guessing month-first for ambiguous rows such as "01 02 2018 00:00".
raw_data['Date/Time'] = pd.to_datetime(raw_data['Date/Time'], format='%d %m %Y %H:%M')

In [None]:
# Promote the timestamp column to the row index
raw_data = raw_data.set_index('Date/Time')

In [None]:
raw_data.columns

In [None]:
 #change column names
# Positional rename: assumes the four remaining columns arrive in the order
# active power, wind speed, theoretical power, wind direction (Date/Time is already the index).
raw_data.columns = ['Power','Wind_speed','Theoretical_power','Wind_direction']

In [None]:
raw_data.describe()

In [None]:
# Show the earliest and latest timestamps present in the index
first_stamp, last_stamp = raw_data.index.min(), raw_data.index.max()
print(first_stamp)
print(last_stamp)

The turbine has recorded data for the year of 2018

In [None]:
# Loss = theoretical power minus the power actually measured, row by row
raw_data['Loss'] = raw_data['Theoretical_power'].sub(raw_data['Power'])
raw_data.head(3)

In [None]:
 # due to the random noise, the data will be resampled by every hour

# Plot the un-resampled Power series over the whole index on a wide (20x5) figure
raw_data['Power'].plot(figsize=(20,5))
plt.show()

In [None]:
# Build hourly, daily, weekly and monthly frames, each holding the mean of every column
# over the resampling window. Resampling the whole frame in one call replaces the four
# column-by-column loops that filled initially empty DataFrames.
hourly = raw_data.resample('H').mean()
daily = raw_data.resample('D').mean()
weekly = raw_data.resample('W').mean()
monthly = raw_data.resample('M').mean()



In [None]:
# Hourly mean Power over the whole index
hourly['Power'].plot(figsize=(20,5))
plt.show()

In [None]:
# Daily mean Power over the whole index.
# NOTE(review): the 'daily' label is only visible if a legend is drawn, which this cell does not do.
daily['Power'].plot(figsize=(20,5),label='daily')
plt.show()

In [None]:
# create a function for a categorical column
def direction(x):
    """
    Map a wind direction in degrees to one of 16 compass-point labels.

    Every sector is 22.5 degrees wide; 'N' covers both ends of the scale
    (values above 348.75 and values below 11.25).

    Parameters
    ----------
    x : float
        Wind direction in degrees.

    Returns
    -------
    str or float
        The compass label ('N', 'NNE', ..., 'NNW'), or NaN when `x` is missing.
    """
    # A missing value fails every comparison below and used to fall through to
    # 'NNW'; propagate it as missing instead of inventing a direction.
    if pd.isna(x):
        return np.nan

    if x > 348.75 or x < 11.25:
        return 'N'

    # (exclusive upper bound, label) for the remaining sectors, in ascending order
    sectors = (
        (33.75, 'NNE'), (56.25, 'NE'), (78.75, 'ENE'), (101.25, 'E'),
        (123.75, 'ESE'), (146.25, 'SE'), (168.75, 'SSE'), (191.25, 'S'),
        (213.75, 'SSW'), (236.25, 'SW'), (258.75, 'WSW'), (281.25, 'W'),
        (303.75, 'WNW'), (326.25, 'NW'),
    )
    for upper_bound, label in sectors:
        if x < upper_bound:
            return label

    # 326.25 <= x <= 348.75
    return 'NNW'

In [None]:
daily.info()

In [None]:
# Fill every NaN in the daily frame by interpolating within each column
daily = daily.interpolate()

In [None]:
daily.info()

In [None]:
# Label each day with the compass sector of its Wind_direction value.
# NOTE(review): Wind_direction in `daily` is an arithmetic mean of degree readings, which can
# mislead when readings straddle north (0/360) — confirm this is acceptable here.
daily['Direction'] = daily['Wind_direction'].apply(direction)

In [None]:
daily.head(3)

In [None]:
daily['Direction'].unique()

In [None]:
# Daily mean loss over the whole index
ax = daily['Loss'].plot()
ax.set_title('Daily Loss')
plt.show()

There are times throughout the year where the turbine has experienced a large amount of Power loss. This is worth investigating further. I suspect this is due to maintenance. 

Between 12-14km/h (**3.3-3.8m/s**) a wind turbine starts to generate electricity.
If the wind speed is higher than 3.3m/s and the power output is zero, we can assume the turbine is under maintenance

In [None]:
# Highest wind speed in the data. Series.max() is vectorised and skips NaN, whereas the
# builtin max() iterates element by element and gives order-dependent results with NaN.
print(raw_data['Wind_speed'].max())

In [None]:
# Rows with no power output (Power <= 0) while the wind is above 3.3 m/s are treated as
# maintenance periods; build a frame that excludes them.
# .copy() makes the result an independent frame, so columns added to it later do not
# trigger SettingWithCopyWarning.
under_maintenance = (raw_data['Power'] <= 0) & (raw_data['Wind_speed'] > 3.3)
raw_data_nm = raw_data[~under_maintenance].copy()

In [None]:
raw_data_nm.info()

In [None]:
raw_data_nm.head(3)

In [None]:
# Loss series of the frame with maintenance rows removed; `_ =` keeps the Axes repr out of the cell output
_ = raw_data_nm['Loss'].plot()


In [None]:
raw_data_nm['Wind_direction'].describe()

In [None]:
#function to create x,y component of wind direction
def x_y_component(wind_direction, wind_speed):
    """
    Split wind readings into two components using the direction angle.

    `wind_direction` is taken in degrees and converted to radians; the first
    returned value is wind_speed * cos(angle), the second wind_speed * sin(angle).
    """
    theta = wind_direction * np.pi / 180
    return wind_speed * np.cos(theta), wind_speed * np.sin(theta)

In [None]:
raw_data_nm.columns

In [None]:
# Add the two wind components of every reading as the columns x_com and y_com.
# assign() returns a new frame, so nothing is written into a filtered slice of raw_data
# (in-place column assignment on such a slice can raise SettingWithCopyWarning).
x_com, y_com = x_y_component(raw_data_nm['Wind_direction'], raw_data_nm['Wind_speed'])
raw_data_nm = raw_data_nm.assign(x_com=x_com, y_com=y_com)

In [None]:
raw_data_nm.head(3)

In [None]:
raw_data_nm.describe()

In [None]:
#resample from no maintenance dataset for every hour, day, week and month
# Each frame holds the per-window mean of the listed columns. Selecting the columns once and
# resampling the whole frame replaces four column-by-column loops over empty DataFrames.
columns = ['Power','Wind_speed','Theoretical_power','Wind_direction','Loss','x_com','y_com']
nm_selected = raw_data_nm[columns]

hourly_nm = nm_selected.resample('H').mean()
daily_nm = nm_selected.resample('D').mean()
weekly_nm = nm_selected.resample('W').mean()
monthly_nm = nm_selected.resample('M').mean()
    

In [None]:
daily_nm.info()

In [None]:
hourly_nm.info()

In [None]:
# December only. Row selection by partial date string must go through .loc:
# DataFrame['2018-12'] is treated as a column lookup by pandas >= 2.0 and raises KeyError.
hourly_nm.loc['2018-12', 'Power'].plot(figsize=(15,7))
plt.show()

In [None]:
#interpolate missing values in every column of the hourly no maintenance dataset
hourly_nm = hourly_nm.interpolate()

In [None]:
#last two weeks of december
hourly_nm.loc['2018-12-17':'2018-12-31', 'Power'].plot(figsize=(15,7))

In [None]:
# Hourly mean loss before maintenance periods are filtered out
# (compare with the same plot of the no-maintenance frame)
hourly['Loss'].plot(figsize=(10,5))
plt.title('Average Hourly Power Loss')
plt.xlabel('Date')
# Loss is an average of power readings, so it is labelled in kW like the notebook's other power plots
plt.ylabel('Power Loss (kW)')
plt.tight_layout()
#plt.savefig('figures/hourly_loss.png')
plt.show()

In [None]:
# Hourly mean loss after the maintenance rows have been removed
hourly_nm['Loss'].plot(figsize=(10,5))
plt.title('Average Hourly Power Loss after Filtering Maintenance')
# Loss is an average of power readings, so it is labelled in kW like the notebook's other power plots
plt.ylabel('Power Loss (kW)')
plt.xlabel('Date')
plt.tight_layout()
#plt.savefig('figures/hourly_loss_after_main.png')
plt.show()

After filtering out the data recorded while the wind turbine is under maintenance, the power loss is minimized on many dates throughout the year; however, during January, February and December there is still extreme power loss. I will investigate this further.

In [None]:
hourly_nm[hourly_nm['Loss']> 500]

In [None]:
hourly_nm[((hourly_nm['Wind_speed']<6) & (hourly_nm['Power']<100))].head()


In [None]:
hourly_nm[((hourly_nm['Power']==0) & (hourly_nm['Theoretical_power']>0))].describe()

It seems the turbine was functioning normally whilst the power output was low. I will keep these data points in my analysis so that the model can account for the intermittency of the wind and for periods when the turbine is not as efficient as it should be.

In [None]:
# Measured hourly power and theoretical power drawn on the same axes
ax = hourly_nm['Power'].plot(figsize=(20,5), label='hourly')
hourly_nm['Theoretical_power'].plot(ax=ax, label='theoretical')
ax.legend()
plt.show()

In [None]:
# Tag every row of raw_data with the compass sector of its Wind_direction (adds a 'Direction' column)
raw_data['Direction'] = raw_data['Wind_direction'].apply(direction)

In [None]:
raw_data.head(3)

In [None]:
# Add a compass-sector column to the hourly, daily, weekly and monthly no-maintenance frames.
# Wind_direction in these frames is an arithmetic mean of degrees, which is wrong for a
# circular quantity (readings of 350 and 10 average to 180, i.e. south instead of north).
# The sector is therefore taken from the angle of the mean (x_com, y_com) vector, where
# x_com = speed*cos(direction) and y_com = speed*sin(direction); this mean is speed-weighted.
dfs = [hourly_nm, daily_nm, weekly_nm, monthly_nm]

for df in dfs:
    mean_angle = np.degrees(np.arctan2(df['y_com'], df['x_com'])) % 360
    # periods without data stay missing instead of receiving an arbitrary label
    df['Direction'] = mean_angle.apply(direction).where(mean_angle.notna())

monthly_nm.head(3)

In [None]:
#create a function for the mean wind speed
def mean_wind(x, bin_width=0.5):
    """
    Snap a wind speed to the centre of the bin it falls in.

    Bins are `bin_width` wide and centred on multiples of `bin_width`; with the
    default of 0.5 the bins are [0.25, 0.75) -> 0.5, [0.75, 1.25) -> 1.0, and
    so on. Anything below the first edge maps to 0.0.

    The previous implementation searched a fixed list of edges that ended at
    25.75 and returned None for larger speeds and for NaN; it also rebuilt that
    list on every call. The closed form below has no upper limit.

    Parameters
    ----------
    x : float
        Wind speed.
    bin_width : float, optional
        Width of each bin (default 0.5, the original spacing).

    Returns
    -------
    float
        The bin centre, or NaN when `x` is missing.
    """
    if pd.isna(x):
        return np.nan
    if x < bin_width / 2:
        return 0.0
    # adding 0.5 before flooring rounds to the nearest bin centre, with ties going up
    return np.floor(x / bin_width + 0.5) * bin_width

In [None]:
# Store the binned wind speed of every row (result of mean_wind) in a new 'Mean_speed' column
raw_data_nm['Mean_speed'] = raw_data_nm['Wind_speed'].apply(mean_wind)

In [None]:
#add mean wind speed for raw data no maintenance 
raw_data_nm.sample(5)


As the daily power generated will be used for analysis, the autocorrelation and partial autocorrelation functions will be used to see whether observations are correlated with their previous time steps.

In [None]:
# ACF: checks whether previous time steps have an impact on the next time step
# PACF: shows which individual lag has an impact on the next time step
# Autocorrelation of hourly Power for lags up to 30
plot_acf(hourly_nm['Power'], lags=30)
#plt.savefig('figures/acf.png')
plt.show()

In [None]:
# `plt.figsize = (20,5)` only attached an unused attribute to the pyplot module and never
# resized anything; create the figure explicitly and hand its axes to plot_pacf instead.
fig, ax = plt.subplots(figsize=(20,5))
plot_pacf(hourly_nm['Power'], lags=30, ax=ax)
#plt.savefig('figures/pacf.png')
plt.show()

In [None]:
# T_1 holds the Power value of the previous row (lag-1 feature); the first row has no predecessor and becomes NaN
hourly_nm['T_1'] = hourly_nm['Power'].shift(1)

In [None]:
# Drop every row that contains a NaN in any column
hourly_nm = hourly_nm.dropna()

In [None]:
hourly_nm.head(3)

**Exploratory Data Analysis (EDA)**


In [None]:
#Rough stationarity check: split each series in half and compare the mean and variance of the two halves.
columns = ['Power', 'Wind_speed', 'Theoretical_power', 'Loss','x_com','y_com']
for name in columns:
    series = hourly_nm[name]
    midpoint = round(len(series)/2)
    first_half = series.iloc[:midpoint]
    second_half = series.iloc[midpoint:]
    print('For {}'.format(name))
    print('mean1=%f, mean2=%f' % (first_half.mean(), second_half.mean()))
    print('variance1=%f, variance2=%f' % (first_half.var(), second_half.var()))
    print('')

**Null Hypothesis (H0)**: If failed to be rejected, it suggests the time series has a unit root, meaning it is non-stationary. It has some time dependent structure. 
\
\
**Alternate Hypothesis (H1)**: The null hypothesis is rejected; it suggests the time series does not have a unit root, meaning it is stationary. It does not have time-dependent structure.
\
\
**p-value > 0.05**: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
\
**p-value <= 0.05**: Reject the null hypothesis (H0), the data does not have a unit root and is stationary.

In [None]:
# Second stationarity check: augmented Dickey-Fuller test on every column, keeping the p-values
from statsmodels.tsa.stattools import adfuller

adfull = {}

for name in columns:
    # element 1 of the adfuller result tuple is the p-value
    p_value = adfuller(hourly_nm[name].values)[1]
    adfull[name] = [p_value]

    print('P-value for {}: {}'.format(name, p_value))

In [None]:
adfull_df = pd.DataFrame.from_dict(adfull)

In [None]:
adfull_df

We can conclude that all the features do not follow a random walk and are stationary. 

In [None]:
# Weekly and monthly mean power on one set of axes.
# The title used to say "Daily and Monthly" although the curves drawn are weekly and monthly.
#daily['Power'].plot(figsize=(20,5),label='daily')
monthly_nm['Power'].plot(label='monthly')
weekly_nm['Power'].plot(label='weekly')
plt.title('Weekly and Monthly Average Power Generated')
plt.ylabel('LV Power (kW)')
plt.xlabel('Date')
plt.legend()
plt.show()

In [None]:
#plot kde of power
# `fill` is the current name of the deprecated `shade` option (seaborn >= 0.11)
sns.kdeplot(raw_data_nm['Power'], fill=True)
plt.title('KDE Plot of Power')
plt.tight_layout()
#plt.savefig('figures/KDE.png')
plt.show()

In [None]:
#plot kde of wind speed
# `fill` is the current name of the deprecated `shade` option (seaborn >= 0.11)
sns.kdeplot(raw_data_nm['Wind_speed'], fill=True)
plt.title('KDE Plot of Wind Speed')
plt.xlabel('Wind speed (m/s)')
plt.tight_layout()
#plt.savefig('figures/KDE wind_speed.png')
plt.show()

In [None]:
# Mean theoretical and mean measured power per wind-speed bin (Mean_speed), drawn as two curves
m_s_t_se = raw_data_nm.groupby('Mean_speed')['Theoretical_power'].mean()
m_s_p_se = raw_data_nm.groupby('Mean_speed')['Power'].mean()
plt.figure(figsize=(10,5))
plt.plot(m_s_t_se,label='Theoretical')
plt.plot(m_s_p_se,label='Actual Power')
plt.xlabel('Wind Speed (m/s)')
plt.ylabel('Power (kW)')
plt.title('Theoretical Power vs Actual Power Curve')
plt.grid(which='major')
plt.legend()
# tight_layout must run after the labels and title exist; calling it on the empty figure had no effect
plt.tight_layout()
#plt.savefig('figures/theo-actual-power-curve.png')
plt.show()

In [None]:
raw_data.columns

In [None]:
# ax = WindroseAxes.from_ax()
# ax.bar(raw_data_nm['Wind_direction'], raw_data_nm['Wind_speed'], normed=True, opening=0.8, edgecolor='black')
# ax.set_theta_direction('clockwise')
# ax.set_theta_zero_location('N')
# ax.set_xticklabels(['N', 'N-E', 'E', 'S-E', 'S', 'S-W', 'W', 'N-W'])
# ax.set_legend()
# plt.title('Wind Direction v Wind Speed (m/s) from 10 min Intervals.')
# plt.tight_layout()
# plt.savefig('figures/windrose.png')
# plt.show()

In [None]:
# Tag every row of the no-maintenance frame with the compass sector of its Wind_direction
raw_data_nm['Direction'] = raw_data_nm['Wind_direction'].apply(direction)

In [None]:
# Total measured and theoretical power per compass sector, largest measured total first
direction_df = (
    raw_data_nm
    .groupby('Direction')[['Power','Theoretical_power']]
    .sum()
    .sort_values(by=['Power'], ascending=False)
)
ax = direction_df.plot(kind='bar', figsize=(10,5))
ax.set_title('Total Theoretical Power & LV Power Comparison')
ax.set_ylabel('Power (kW)')
plt.tight_layout()
#plt.savefig('figures/theo_vs_lv_power.png')
plt.show()


In [None]:
# Total Loss of the no-maintenance rows grouped by compass sector, largest total first, as a bar chart
dir_loss = raw_data_nm.groupby('Direction')[['Loss']].sum()
dir_loss = dir_loss.sort_values(by=['Loss'],ascending=False)
dir_loss.plot(kind='bar', figsize=(10,5))
plt.title('Total Power loss')
plt.ylabel('Power Loss (kW)')
plt.tight_layout()
#plt.savefig('figures/power_loss.png')
plt.show()

The greatest power losses were in the ENE, NE, NNE and SSW directions. I will investigate this further.




In [None]:
raw_data_nm.head(3)

In [None]:
# Mean measured power per wind-speed bin for rows in the NNE sector, against the theoretical curve
NNE = raw_data_nm.loc[raw_data_nm['Direction'] == 'NNE']
m_s_p_nne = NNE.groupby('Mean_speed')['Power'].mean()
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(m_s_t_se, label='Theoretical')
ax.plot(m_s_p_nne, label='NNE Power')
ax.set_xlabel('Wind Speed (m/s)')
ax.set_ylabel('Power (kW)')
ax.set_title('Theoretical Power vs NNE Power Curve')
ax.grid(which='major')
ax.legend()
fig.tight_layout()
#plt.savefig('figures/NNE_p.png')
plt.show()


In [None]:
# Mean measured power per wind-speed bin for rows in the ENE sector, against the theoretical curve
ENE = raw_data_nm[raw_data_nm['Direction']=='ENE']
m_s_p_ene = ENE.groupby('Mean_speed')['Power'].mean()
plt.plot(m_s_t_se,label='Theoretical')
plt.plot(m_s_p_ene,label='ENE Power')
plt.xlabel('Wind Speed (m/s)')
plt.ylabel('Power (kW)')
plt.title('Theoretical Power vs ENE Power Curve')
plt.grid(which='major')
plt.legend()
plt.show()

In [None]:
# Mean measured power per wind-speed bin for rows in the NE sector, against the theoretical curve
NE = raw_data_nm[raw_data_nm['Direction']=='NE']
m_s_p_ne = NE.groupby('Mean_speed')['Power'].mean()
plt.plot(m_s_t_se,label='Theoretical')
plt.plot(m_s_p_ne,label='NE Power')
plt.xlabel('Wind Speed (m/s)')
plt.ylabel('Power (kW)')
plt.title('Theoretical Power vs NE Power Curve')
plt.grid(which='major')
plt.legend()
plt.show()

In [None]:
# Mean measured power per wind-speed bin for rows in the SSW sector, against the theoretical curve
SSW = raw_data_nm[raw_data_nm['Direction']=='SSW']
m_s_p_ssw = SSW.groupby('Mean_speed')['Power'].mean()
plt.plot(m_s_t_se,label='Theoretical')
plt.plot(m_s_p_ssw,label='SSW Power')
plt.xlabel('Wind Speed (m/s)')
plt.ylabel('Power (kW)')
plt.title('Theoretical Power vs SSW Power Curve')
plt.grid(which='major')
plt.legend()
plt.show()

In [None]:
# Overlay the theoretical curve with the measured curves of the SSW, ENE, NE and NNE sectors
SSW = raw_data_nm.loc[raw_data_nm['Direction'] == 'SSW']
m_s_p_ssw = SSW.groupby('Mean_speed')['Power'].mean()

plt.figure(figsize=(12,7))
labelled_curves = [
    (m_s_t_se, 'Theoretical'),
    (m_s_p_ssw, 'SSW Power'),
    (m_s_p_ene, 'ENE Power'),
    (m_s_p_ne, 'NE Power'),
    (m_s_p_nne, 'NNE Power'),
]
for curve, curve_label in labelled_curves:
    plt.plot(curve, label=curve_label)

plt.xlabel('Wind Speed (m/s)')
plt.ylabel('Power (kW)')
plt.title('Theoretical Power vs Power Curve')
plt.grid(which='major')
plt.legend()
plt.tight_layout()
#plt.savefig('figures/power_direction.png')
plt.show()

In [None]:
# Pairwise plots of the daily no-maintenance frame; kind='reg' requests regression-style panels.
# `_ =` keeps the returned grid object out of the cell output.
_ = sns.pairplot(daily_nm,kind='reg')

In [None]:
#plot correlation heatmap, if gausian then pearson
# Only numeric columns can be correlated: hourly_nm also carries the string 'Direction'
# column, which makes DataFrame.corr() raise on pandas >= 2.0.
fig, ax = plt.subplots(1, 1,figsize=(9,7))
numeric_corr = hourly_nm.select_dtypes(include='number').corr()
# `linewidths` is seaborn's documented name for the cell-border width
sns.heatmap(numeric_corr, annot=True, linewidths=0.5, ax=ax)
plt.title('Heatmap Showing Correlations Between different features in the dataset.')
ax.set_ylabel('')
ax.set_xlabel('')
plt.tight_layout()
#plt.savefig('figures/correlation.png')
plt.show()

Next, I test the co-integration between different features of the dataset.

In [None]:
#test co-integration between features: regress Power on each feature, then ADF-test the residual
import statsmodels.api as sm

num_columns = ['Wind_speed', 'Theoretical_power', 'Loss','x_com','y_com']
P = hourly_nm[['Power']]
co_int = {}

for column in num_columns:
    # Power = a + b*feature + residual
    C = sm.add_constant(hourly_nm[[column]])
    result = sm.OLS(P,C).fit()

    # The residual of this regression is Power - a - b*feature. The previous code tested
    # feature - b*Power, which applies the fitted slope to the wrong series, and read the
    # slope with positional indexing (params[1]) on a label-indexed Series.
    # NOTE(review): plain ADF p-values on estimated residuals are only approximate;
    # consider statsmodels.tsa.stattools.coint for Engle-Granger critical values.
    adf_g = adfuller(result.resid)

    co_int[column] = [adf_g[1]]

    print('The p-value for the ADF test between {} and Power is: {} '.format(column,adf_g[1]))

In [None]:
co_int_df = pd.DataFrame.from_dict(co_int)
co_int_df

The above tests show that all numerical columns are co-integrated with the Power feature of the dataset.

As Wind Direction in degrees is not a suitable feature for RNN's, this will be removed from the dataset.

In [None]:
# Remove the direction columns. Rebinding with errors='ignore' keeps the cell re-runnable:
# an in-place drop raises KeyError once the columns are already gone.
hourly_nm = hourly_nm.drop(columns=['Wind_direction','Direction'], errors='ignore')