In [52]:
# Load libraries
import pandas as pd
from sklearn import preprocessing
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics  

In [40]:
# Read the training set from disk and show how many rows it contains
train_data = pd.read_csv(filepath_or_buffer='train_windy.csv')
train_data.shape[0]

28200

In [41]:
# Preview the first five training rows
train_data.head(n=5)

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.0,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.66556,...,,239.836388,2730.310605,42.084666,BA,Medium,2.217542,0.314065,24.281689,6.766521
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.0,-99.0,44.104919,46.25887,2372.384119,78.129803,...,,337.944723,1780.2072,107.888643,A2,Medium,4.210346,0.448494,27.262139,5.966275
2,WM_39146,2019-09-14 14:03:20,95.484724,,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,45.033197,227.850294,1666.0499,-42.931459,ABC,Medium,2.719475,0.302321,27.366127,2.874342
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.0,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,44.827154,492.08152,1964.502895,42.744596,ABC,,4.857385,0.36714,24.287767,14.851089
4,WM_21521,2019-05-04 03:13:20,10.72289,,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,-99.0,259.274601,1177.516152,13.387289,AAA,Medium,,0.453374,27.97165,3.519074


In [42]:
# Read the test set from disk and show how many rows it contains
test_data = pd.read_csv(filepath_or_buffer='test_windy.csv')
test_data.shape[0]

12086

In [43]:
# Preview the first five test rows
test_data.head(n=5)

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,area_temperature(°C),windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m)
0,WM_19817,2019-04-17 08:53:20,94.324266,17.641186,89.714193,51.146788,40.46056,39.594734,1073.202715,66.830037,...,24.004812,43.756693,445.976992,1664.222023,21.912243,BA,Medium,3.185837,0.403965,25.572431
1,WM_18723,2019-03-30 07:43:20,10.08887,13.978119,43.272846,46.516394,40.027788,41.17686,517.43643,37.284163,...,29.431813,42.728174,499.595287,1165.111992,-35.050093,A,Medium,3.016603,0.444755,24.371823
2,WM_34552,2019-08-10 11:33:20,347.15209,31.423035,41.07664,26.931602,43.109122,43.439556,1480.716492,70.010762,...,29.924235,43.256122,245.432231,1667.720491,27.195302,B2,Medium,2.611941,0.387368,27.654677
3,WM_28570,2019-06-26 03:53:20,24.471997,-99.0,14.375078,66.513953,13.741253,15.577472,887.979475,41.445258,...,23.886434,13.501595,,1329.74474,15.245757,BBB,Low,2.866805,0.450478,24.189426
4,WM_36934,2019-08-27 16:43:20,96.997026,33.281836,41.405192,1.843112,121.572907,43.934587,2053.916354,68.007787,...,35.906889,-99.0,442.425744,691.408996,34.257024,A,Low,3.549672,0.368355,4.88544


In [44]:
# Count the NaN entries in every column of the training frame
missing_values_count_train = train_data.isna().sum(axis=0)
print(missing_values_count_train)

tracking_id                          0
datetime                             0
wind_speed(m/s)                    273
atmospheric_temperature(°C)       3450
shaft_temperature(°C)                2
blades_angle(°)                    216
gearbox_temperature(°C)              1
engine_temperature(°C)              12
motor_torque(N-m)                   24
generator_temperature(°C)           12
atmospheric_pressure(Pascal)      2707
area_temperature(°C)                 0
windmill_body_temperature(°C)     2363
wind_direction(°)                 5103
resistance(ohm)                      1
rotor_torque(N-m)                  572
turbine_status                    1759
cloud_level                        276
blade_length(m)                   5093
blade_breadth(m)                     0
windmill_height(m)                 543
windmill_generated_power(kW/h)     207
dtype: int64


In [45]:
# Count the NaN entries in every column of the test frame
missing_values_count_test = test_data.isna().sum(axis=0)
print(missing_values_count_test)

tracking_id                         0
datetime                            0
wind_speed(m/s)                   126
atmospheric_temperature(°C)      1427
shaft_temperature(°C)               1
blades_angle(°)                   106
gearbox_temperature(°C)             1
engine_temperature(°C)              5
motor_torque(N-m)                  11
generator_temperature(°C)           5
atmospheric_pressure(Pascal)     1151
area_temperature(°C)                1
windmill_body_temperature(°C)     926
wind_direction(°)                2160
resistance(ohm)                     0
rotor_torque(N-m)                 281
turbine_status                    797
cloud_level                       125
blade_length(m)                  2114
blade_breadth(m)                    0
windmill_height(m)                255
dtype: int64


In [46]:
# Impute missing values in the training set: back-fill every column from the
# next row that has a value, then set anything still missing (NaNs with no
# later valid row) to 0. DataFrame.bfill() is used instead of the deprecated
# fillna(method='bfill') spelling; the result is the same.
# NOTE(review): this also back-fills the target column
# 'windmill_generated_power(kW/h)', which has missing entries — consider
# dropping those rows instead of inventing labels.
# NOTE(review): the -99.0 values visible in the preview look like
# missing-value placeholders that this step does not touch — confirm.
train_imputed = train_data.bfill(axis=0).fillna(0)
train_imputed.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.0,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.66556,...,45.033197,239.836388,2730.310605,42.084666,BA,Medium,2.217542,0.314065,24.281689,6.766521
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.0,-99.0,44.104919,46.25887,2372.384119,78.129803,...,45.033197,337.944723,1780.2072,107.888643,A2,Medium,4.210346,0.448494,27.262139,5.966275
2,WM_39146,2019-09-14 14:03:20,95.484724,-99.0,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,45.033197,227.850294,1666.0499,-42.931459,ABC,Medium,2.719475,0.302321,27.366127,2.874342
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.0,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,44.827154,492.08152,1964.502895,42.744596,ABC,Medium,4.857385,0.36714,24.287767,14.851089
4,WM_21521,2019-05-04 03:13:20,10.72289,30.326226,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,-99.0,259.274601,1177.516152,13.387289,AAA,Medium,2.504098,0.453374,27.97165,3.519074


In [47]:
# Impute missing values in the test set the same way as the training set:
# back-fill every column from the next row that has a value, then set anything
# still missing to 0. DataFrame.bfill() is used instead of the deprecated
# fillna(method='bfill') spelling; the result is the same.
# NOTE(review): the -99.0 values visible in the preview look like
# missing-value placeholders that this step does not touch — confirm.
test_imputed = test_data.bfill(axis=0).fillna(0)
test_imputed.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,area_temperature(°C),windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m)
0,WM_19817,2019-04-17 08:53:20,94.324266,17.641186,89.714193,51.146788,40.46056,39.594734,1073.202715,66.830037,...,24.004812,43.756693,445.976992,1664.222023,21.912243,BA,Medium,3.185837,0.403965,25.572431
1,WM_18723,2019-03-30 07:43:20,10.08887,13.978119,43.272846,46.516394,40.027788,41.17686,517.43643,37.284163,...,29.431813,42.728174,499.595287,1165.111992,-35.050093,A,Medium,3.016603,0.444755,24.371823
2,WM_34552,2019-08-10 11:33:20,347.15209,31.423035,41.07664,26.931602,43.109122,43.439556,1480.716492,70.010762,...,29.924235,43.256122,245.432231,1667.720491,27.195302,B2,Medium,2.611941,0.387368,27.654677
3,WM_28570,2019-06-26 03:53:20,24.471997,-99.0,14.375078,66.513953,13.741253,15.577472,887.979475,41.445258,...,23.886434,13.501595,442.425744,1329.74474,15.245757,BBB,Low,2.866805,0.450478,24.189426
4,WM_36934,2019-08-27 16:43:20,96.997026,33.281836,41.405192,1.843112,121.572907,43.934587,2053.916354,68.007787,...,35.906889,-99.0,442.425744,691.408996,34.257024,A,Low,3.549672,0.368355,4.88544


In [48]:
# Print summary statistics (count, mean, std, min, quartiles, max) of wind speed
wind_speed_stats = train_imputed['wind_speed(m/s)'].describe()
print(wind_speed_stats)

count    28200.000000
mean        69.014553
std         76.231504
min       -496.211029
25%         20.902307
50%         93.300739
75%         95.268399
max        601.455670
Name: wind_speed(m/s), dtype: float64


In [49]:
# Column groups used by the preprocessing step.
features_num = ['wind_speed(m/s)', 'atmospheric_temperature(°C)', 'shaft_temperature(°C)', 'blades_angle(°)',
                'gearbox_temperature(°C)', 'engine_temperature(°C)', 'motor_torque(N-m)', 'generator_temperature(°C)',
                'atmospheric_pressure(Pascal)', 'area_temperature(°C)', 'windmill_body_temperature(°C)', 'wind_direction(°)',
                'resistance(ohm)', 'rotor_torque(N-m)', 'blade_length(m)', 'blade_breadth(m)', 'windmill_height(m)']
features_cat = ['cloud_level']
# Declared but not passed to the preprocessor below, so 'turbine_status' is
# currently not part of the model input.
features_lab = ['turbine_status']

# Standardise the numeric columns and one-hot encode the categorical one.
# Columns not listed above (ids, datetime, turbine_status) are dropped.
# handle_unknown='ignore' keeps transform() from failing if the test set
# contains a category that was never seen during fitting.
preprocessor = make_column_transformer(
    (StandardScaler(), features_num),
    (OneHotEncoder(handle_unknown='ignore'), features_cat),
)

# Separate the target WITHOUT mutating train_imputed, so this cell can be
# re-run safely (an in-place pop() raises KeyError on the second run).
target_col = 'windmill_generated_power(kW/h)'
y = train_imputed[target_col]
train_features = train_imputed.drop(columns=[target_col])

# Learn the scaling statistics and category set from the training data only.
X = preprocessor.fit_transform(train_features)

# Apply the already-fitted preprocessor to the test data. Re-fitting here
# (fit_transform) would scale/encode the test set with its own statistics,
# so its features would not match what the models were trained on.
test_X = preprocessor.transform(test_imputed)

train_features.columns

Index(['tracking_id', 'datetime', 'wind_speed(m/s)',
       'atmospheric_temperature(°C)', 'shaft_temperature(°C)',
       'blades_angle(°)', 'gearbox_temperature(°C)', 'engine_temperature(°C)',
       'motor_torque(N-m)', 'generator_temperature(°C)',
       'atmospheric_pressure(Pascal)', 'area_temperature(°C)',
       'windmill_body_temperature(°C)', 'wind_direction(°)', 'resistance(ohm)',
       'rotor_torque(N-m)', 'turbine_status', 'cloud_level', 'blade_length(m)',
       'blade_breadth(m)', 'windmill_height(m)'],
      dtype='object')

In [50]:
# Hold out 20% of the preprocessed training rows for validation (fixed seed)
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [51]:
# Show the first preprocessed row of the training split
first_row = train_X[0]
print(first_row)

[ 0.37982819 -2.24868305  0.20265802  0.20167612  0.12553187  0.22905639
  1.14264877  0.89414891 -1.46719915  1.44755477  0.49423948  1.55614776
  0.63356324  0.60025019  0.12047891 -0.2974015  -0.10351123  0.
  1.          0.        ]


In [58]:
# Build and train five random forests on the same training split; they differ
# only in the number of trees and the random seed. fit() returns the estimator
# itself, so construction and training are chained.
rf_model1 = RandomForestRegressor(n_estimators=100, random_state=1).fit(train_X, train_y)
rf_model2 = RandomForestRegressor(n_estimators=50, random_state=2).fit(train_X, train_y)
rf_model3 = RandomForestRegressor(n_estimators=150, random_state=3).fit(train_X, train_y)
rf_model4 = RandomForestRegressor(n_estimators=200, random_state=4).fit(train_X, train_y)
rf_model5 = RandomForestRegressor(n_estimators=250, random_state=5).fit(train_X, train_y)

# Predict the held-out validation split with each trained forest
val_preds1 = rf_model1.predict(val_X)
val_preds2 = rf_model2.predict(val_X)
val_preds3 = rf_model3.predict(val_X)
val_preds4 = rf_model4.predict(val_X)
val_preds5 = rf_model5.predict(val_X)

In [59]:
# Report the validation mean absolute error of each forest, one line per model
for mae_label, model_val_preds in (
    ("MAE OF THE MODEL 1: ", val_preds1),
    ("MAE OF THE MODEL 2: ", val_preds2),
    ("MAE OF THE MODEL 3: ", val_preds3),
    ("MAE OF THE MODEL 4: ", val_preds4),
    ("MAE OF THE MODEL 5: ", val_preds5),
):
    print(mae_label, metrics.mean_absolute_error(val_y, model_val_preds))

MAE OF THE MODEL 1:  0.33005020640040805
MAE OF THE MODEL 2:  0.33228078466957844
MAE OF THE MODEL 3:  0.33154590171735104
MAE OF THE MODEL 4:  0.33083021883268277
MAE OF THE MODEL 5:  0.3288178986932939


In [60]:
# Predict the test set with every forest
test_preds1 = rf_model1.predict(test_X)
test_preds2 = rf_model2.predict(test_X)
test_preds3 = rf_model3.predict(test_X)
test_preds4 = rf_model4.predict(test_X)
test_preds5 = rf_model5.predict(test_X)

# Ensemble: per-row arithmetic mean of the five models' predictions
test_preds = [
    (p1 + p2 + p3 + p4 + p5) / 5
    for p1, p2, p3, p4, p5 in zip(
        test_preds1, test_preds2, test_preds3, test_preds4, test_preds5
    )
]

In [61]:
# Build the submission frame in the format used for competition scoring:
# the identifying columns come from the raw test set, the power column from
# the averaged ensemble predictions.
output = pd.DataFrame({'tracking_id': test_data.tracking_id,
                       'datetime': test_data.datetime,
                       'windmill_generated_power(kW/h)': test_preds})

# Write the submission file without the DataFrame index column
output.to_csv('submission_windy.csv', index=False)

# Preview as the cell's last expression so the notebook actually renders it
# (a bare expression in the middle of a cell is evaluated and then discarded).
output.head()