# CALLING OUR LIBRARIES

In [1]:
import pandas as pd
import numpy as np

# LOAD DATA

In [2]:
mydata = pd.read_csv("turbain_data.csv")
mydata.head()

Unnamed: 0,id,timestamp,active_power_calculated_by_converter,active_power_raw,ambient_temperature,generator_speed,generator_winding_temp_max,grid_power10min_average,nc1_inside_temp,nacelle_temp,reactice_power_calculated_by_converter,reactive_power,wind_direction_raw,wind_speed_raw,wind_speed_turbulence,turbine_id,Target
0,0,6/22/2021 13:15,59.071225,59.117203,35.765579,,64.812227,167.234988,31.835583,35.661191,18.22806,24.117071,316.953893,4.016191,0.524196,Turbine_01,48.670906
1,1,11/6/2021 17:40,258.278976,253.123637,33.392501,789.391492,57.22721,223.845406,32.870656,32.91634,84.444679,102.651681,70.390775,5.198709,0.31184,Turbine_97,47.779032
2,2,12/23/2021 21:08,234.88426,236.78526,33.320189,,47.570285,182.448718,33.043858,29.726347,19.863061,38.311028,302.287781,5.4436,0.367809,Turbine_158,45.542946
3,3,1/2/2021 16:01,50.535151,37.784692,,,33.838225,14.912772,33.752039,30.534683,94.085461,96.431728,93.148946,3.971027,0.631113,Turbine_20,47.515691
4,4,12/6/2021 17:46,247.842835,241.342895,32.897556,,64.972858,226.304581,32.55862,32.711412,93.393299,103.011902,140.825768,4.74652,0.341984,Turbine_20,44.625755


In [4]:
print(mydata.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591239 entries, 0 to 591238
Data columns (total 17 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   id                                      591239 non-null  int64  
 1   timestamp                               591239 non-null  object 
 2   active_power_calculated_by_converter    591229 non-null  float64
 3   active_power_raw                        591222 non-null  float64
 4   ambient_temperature                     581442 non-null  float64
 5   generator_speed                         274403 non-null  float64
 6   generator_winding_temp_max              591238 non-null  float64
 7   grid_power10min_average                 591204 non-null  float64
 8   nc1_inside_temp                         591007 non-null  float64
 9   nacelle_temp                            591021 non-null  float64
 10  reactice_power_calculated_by_converter  5912

# Business questions:


What is the most efficient turbine engine ?
What causes turbine engine failure ?
How can our model help ?

# DATA CLEANING & PREPEOCESSING

In [5]:
missing_values_count = mydata.isnull().sum()

In [8]:
missing_values_count[0:17] #mydata.isna().sum()

id                                             0
timestamp                                      0
active_power_calculated_by_converter          10
active_power_raw                              17
ambient_temperature                         9797
generator_speed                           316836
generator_winding_temp_max                     1
grid_power10min_average                       35
nc1_inside_temp                              232
nacelle_temp                                 218
reactice_power_calculated_by_converter         4
reactive_power                                 2
wind_direction_raw                             7
wind_speed_raw                                 1
wind_speed_turbulence                         15
turbine_id                                     4
Target                                         0
dtype: int64

# check percentage of missing value 

In [9]:
total_missing =missing_values_count.sum()

In [10]:
# how many total missing values do we have?
total_cells = np.product(mydata.shape)

In [12]:
(total_missing/total_cells) * 100

3.255168134952492

In [13]:
mydata.drop(columns='generator_speed',inplace=True)

In [14]:
mydata.isna().sum()

id                                           0
timestamp                                    0
active_power_calculated_by_converter        10
active_power_raw                            17
ambient_temperature                       9797
generator_winding_temp_max                   1
grid_power10min_average                     35
nc1_inside_temp                            232
nacelle_temp                               218
reactice_power_calculated_by_converter       4
reactive_power                               2
wind_direction_raw                           7
wind_speed_raw                               1
wind_speed_turbulence                       15
turbine_id                                   4
Target                                       0
dtype: int64

--> we will fill null values with mean of columns

In [15]:
mean1 = mydata.ambient_temperature.mean()
mydata.loc[mydata['ambient_temperature'].isnull(),'ambient_temperature' ] = mean1

In [16]:
mean2 = mydata.active_power_calculated_by_converter.mean()
mydata.loc[mydata['active_power_calculated_by_converter'].isnull(),'active_power_calculated_by_converter' ] = mean2

In [17]:
mean3 = mydata.active_power_raw .mean()
mydata.loc[mydata['active_power_raw'].isnull(),'active_power_raw' ] = mean3

In [18]:
mean4 = mydata.generator_winding_temp_max.mean()
mydata.loc[mydata['generator_winding_temp_max'].isnull(),'generator_winding_temp_max' ] = mean4

In [19]:
mean5 = mydata.grid_power10min_average.mean()
mydata.loc[mydata['grid_power10min_average'].isnull(),'grid_power10min_average' ] = mean5

In [20]:
mean6 = mydata.nc1_inside_temp.mean()
mydata.loc[mydata['nc1_inside_temp'].isnull(),'nc1_inside_temp' ] = mean6

In [21]:
mean7 = mydata.nacelle_temp.mean()
mydata.loc[mydata['nacelle_temp'].isnull(),'nacelle_temp' ] = mean7

In [22]:
mean8 = mydata.reactice_power_calculated_by_converter.mean()
mydata.loc[mydata['reactice_power_calculated_by_converter'].isnull(),'reactice_power_calculated_by_converter' ] = mean8

In [23]:
mean9 = mydata.reactive_power.mean()
mydata.loc[mydata['reactive_power'].isnull(),'reactive_power' ] = mean9

In [24]:
mean10 = mydata.wind_direction_raw .mean()
mydata.loc[mydata['wind_direction_raw'].isnull(),'wind_direction_raw' ] = mean10

In [25]:
mean11 = mydata.wind_speed_raw .mean()
mydata.loc[mydata['wind_speed_raw'].isnull(),'wind_speed_raw' ] = mean11

In [26]:
mean12 = mydata.wind_speed_turbulence.mean()
mydata.loc[mydata['wind_speed_turbulence'].isnull(),'wind_speed_turbulence' ] = mean12

In [27]:
mydata.isna().sum()

id                                        0
timestamp                                 0
active_power_calculated_by_converter      0
active_power_raw                          0
ambient_temperature                       0
generator_winding_temp_max                0
grid_power10min_average                   0
nc1_inside_temp                           0
nacelle_temp                              0
reactice_power_calculated_by_converter    0
reactive_power                            0
wind_direction_raw                        0
wind_speed_raw                            0
wind_speed_turbulence                     0
turbine_id                                4
Target                                    0
dtype: int64

# changing column names to be more descriable

In [28]:
mydata.rename(columns = {'wind_direction_raw':'wind_direction'
                        ,'wind_speed_raw':'wind_speed'
                        ,'turbine_id':'Turbien_ID'
                        ,'generator_winding_temp_max':'wind_temp_max'
                        ,'reactice_power_calculated_by_converter':'reactive_power'
                        ,'timestamp':'Date'}, inplace = True)

In [29]:
mydata

Unnamed: 0,id,Date,active_power_calculated_by_converter,active_power_raw,ambient_temperature,wind_temp_max,grid_power10min_average,nc1_inside_temp,nacelle_temp,reactive_power,reactive_power.1,wind_direction,wind_speed,wind_speed_turbulence,Turbien_ID,Target
0,0,6/22/2021 13:15,59.071225,59.117203,35.765579,64.812227,167.234988,31.835583,35.661191,18.228060,24.117071,316.953893,4.016191,0.524196,Turbine_01,48.670906
1,1,11/6/2021 17:40,258.278976,253.123637,33.392501,57.227210,223.845406,32.870656,32.916340,84.444679,102.651681,70.390775,5.198709,0.311840,Turbine_97,47.779032
2,2,12/23/2021 21:08,234.884260,236.785260,33.320189,47.570285,182.448718,33.043858,29.726347,19.863061,38.311028,302.287781,5.443600,0.367809,Turbine_158,45.542946
3,3,1/2/2021 16:01,50.535151,37.784692,25.575695,33.838225,14.912772,33.752039,30.534683,94.085461,96.431728,93.148946,3.971027,0.631113,Turbine_20,47.515691
4,4,12/6/2021 17:46,247.842835,241.342895,32.897556,64.972858,226.304581,32.558620,32.711412,93.393299,103.011902,140.825768,4.746520,0.341984,Turbine_20,44.625755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591234,591234,3/18/2021 19:37,121.264847,108.633915,28.822063,60.426447,78.542900,28.404571,32.300816,87.176760,97.308688,276.542084,3.921454,0.352061,Turbine_97,49.511423
591235,591235,7/24/2021 4:54,679.381755,699.714915,26.066999,59.386416,609.587229,34.645046,32.662838,49.636947,73.951992,262.125946,6.539214,0.620994,Turbine_15,46.682127
591236,591236,11/9/2021 13:51,289.928238,285.483582,31.997511,61.138416,517.386113,41.325528,37.321171,85.270166,106.643428,35.269936,4.627115,1.153216,Turbine_14,49.314674
591237,591237,2/2/2021 2:58,28.318385,12.871501,33.040375,47.513898,8.731869,32.886009,30.212943,92.453150,95.705736,184.148130,3.581920,0.277640,Turbine_20,51.652792


In [30]:
mydata.isna().sum()

id                                      0
Date                                    0
active_power_calculated_by_converter    0
active_power_raw                        0
ambient_temperature                     0
wind_temp_max                           0
grid_power10min_average                 0
nc1_inside_temp                         0
nacelle_temp                            0
reactive_power                          0
reactive_power                          0
wind_direction                          0
wind_speed                              0
wind_speed_turbulence                   0
Turbien_ID                              4
Target                                  0
dtype: int64