In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
# NOTE: 'ignore' with no category silences every warning for the rest of the session,
# including any deprecation notices raised by the library calls further down.
warnings.filterwarnings('ignore')

# Basic Data Reading and eyeballing

In [None]:
# Read the training and test CSV files into two DataFrames.
train,test=(
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/machine-hack-and-renew-energy-hackathon/train.csv',
        '../input/machine-hack-and-renew-energy-hackathon/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training data.
train.head(5)

In [None]:
# Column names, dtypes and non-null counts of the training frame
train.info()

In [None]:
# Count the missing entries in every column.
train.isna().sum()
# Author's observation from the output: the data has no missing values.

In [None]:
# Average value of the Target column.
train['Target'].mean()

In [None]:
# Summary statistics, transposed so that each column of the data becomes a row.
train.describe().transpose()
# Observations from the output:
# - Some of the values are positive, some are zero and some are negative.
# - The columns are on different scales, which can make patterns harder to capture,
#   so the data should be scaled or transformed.
# - Candidate transformations: log, power, Box-Cox.

In [None]:
# Summary of the object-dtype columns, one row per column.
train.describe(include=object).transpose()

In [None]:
# Pairwise correlation of the numeric columns only. The frame also carries object-dtype
# columns (they are summarised with describe(include=object) above), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 instead of silently skipping it.
train.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix on a fixed [-1, 1] colour scale.
# Only numeric columns are correlated: DataFrame.corr() raises on object-dtype columns
# in pandas >= 2.0 instead of silently skipping them.
sns.heatmap(train.select_dtypes(include='number').corr(),vmin=-1,vmax=1,cmap='YlGnBu')
plt.show()

# Exploratory Data Analysis

In [None]:
# Look at the raw Target values before plotting their distribution.
train['Target']

In [None]:
# Density histogram of Target with a KDE overlay.
# sns.distplot is deprecated and scheduled for removal; this histplot call is the
# replacement suggested by seaborn (stat='density' and cut=3 reproduce distplot's defaults).
sns.histplot(train.Target,kde=True,stat='density',kde_kws=dict(cut=3))
plt.show()

In [None]:
# Re-check column names and dtypes before plotting the per-column histograms
train.info()

In [None]:
# Histogram grid: DataFrame.hist draws one panel per numeric column
train.hist()
plt.show()

In [None]:
# Display the training frame before the date/time features are added
train

In [None]:
# Derive calendar features from the timestamp: Day, Month, Year, hour, minute, weekday.
# The timestamp column is parsed once and reused instead of being re-parsed for every feature.
train_ts=pd.DatetimeIndex(train.timestamp)
train['Day']=train_ts.day
train['Month']=train_ts.month
train['Year']=train_ts.year
train['hour']=train_ts.hour
train['minute']=train_ts.minute
train['weekday']=train_ts.weekday  # Monday=0 ... Sunday=6

In [None]:
# Display the training frame again to inspect the newly added date/time columns
train

In [None]:
# Derive the same calendar features for the test set as for the training set.
# The timestamp column is parsed once and reused instead of being re-parsed for every feature.
test_ts=pd.DatetimeIndex(test.timestamp)
test['Day']=test_ts.day
test['Month']=test_ts.month
test['Year']=test_ts.year
test['hour']=test_ts.hour
test['minute']=test_ts.minute
test['weekday']=test_ts.weekday  # Monday=0 ... Sunday=6

In [None]:
# Model inputs: the training features without the id, the label, the raw timestamp and Year.
newtrain=train.drop(columns=['turbine_id','Target','timestamp','Year'])
# Label to predict.
target=train.Target
# Rebind instead of dropping in place, and tolerate columns that are already gone,
# so re-running this cell does not raise a KeyError. turbine_id is kept in test.
test=test.drop(columns=['timestamp','Year'],errors='ignore')

# Feature Engineering

In [None]:
# List the feature columns currently in newtrain
newtrain.columns

In [None]:
# Add three "difference" features to both frames. Each one is the first source column minus
# the second; the columns are created in the same order in newtrain and test.
for new_col,minuend,subtrahend in (
    ('difference_active_power','active_power_raw','active_power_calculated_by_converter'),
    ('difference_nacelle','nacelle_temp','nc1_inside_temp'),
    ('difference_react','reactive_power','reactice_power_calculated_by_converter'),
):
    for frame in (newtrain,test):
        frame[new_col]=frame[minuend]-frame[subtrahend]

In [None]:
# List the columns again to confirm the three difference_* features were added
newtrain.columns

In [None]:
# Tree-based regressors: random forest, extra trees, LightGBM, XGBoost and CatBoost
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import catboost as cboost

In [None]:
# Baseline estimators. Each one gets a fixed random_state so the fit (and the submission
# built from it) is reproducible; the seeds match the ones used when these models are
# re-created later in the notebook.
rf=RandomForestRegressor(random_state=42)
etr=ExtraTreesRegressor(random_state=123)
lgbm=LGBMRegressor(random_state=494)
xgbr=XGBRegressor(random_state=500)
cbr=cboost.CatBoostRegressor(random_state=0)

In [None]:
# Compare (rows, columns) of the two frames. test still contains turbine_id at this point,
# whereas newtrain was built without it.
newtrain.shape,test.shape

In [None]:
# Fit the random forest on the training features and predict the test set.
# test is indexed by newtrain's columns so the model receives exactly the features it was
# trained on, in the same order. This also leaves out turbine_id, which newtrain does not have.
pred_rf=rf.fit(newtrain,target).predict(test[newtrain.columns])

In [None]:
# Inspect the baseline predictions
pred_rf

In [None]:
# Wrap the baseline predictions in a one-column frame named Target.
submission=pd.Series(pred_rf,name='Target').to_frame()

In [None]:
# Write the baseline submission. to_csv keeps its default index=True here, so the row index
# is written as an unnamed first column next to Target.
submission.to_csv('CompSub.csv') #rf: mape= 0.0726

# Log Transformation

In [None]:
# Columns that receive the log transformation in the cells below.
features=[
    'active_power_calculated_by_converter',
    'active_power_raw',
    'ambient_temperature',
    'generator_speed',
    'generator_winding_temp_max',
    'grid_power10min_average',
    'nc1_inside_temp',
    'nacelle_temp',
    'reactice_power_calculated_by_converter',
    'reactive_power',
    'wind_direction_raw',
    'wind_speed_raw',
    'wind_speed_turbulence',
    'difference_active_power',
    'difference_nacelle',
    'difference_react',
]

In [None]:
# The features hold negative, zero and positive values, and log is undefined for values <= 0.
# Every non-positive entry is therefore replaced by 1e-6 before taking the log, which means
# all of them end up as the same constant, log(1e-6).
# NOTE: the columns are overwritten, so running this cell twice applies the log twice.
for feature_name in features:
    raw_values=newtrain[feature_name]
    newtrain[feature_name]=np.log(raw_values.mask(raw_values<=0,0.000001))

In [None]:

# Same log transformation as for the training features: entries <= 0 become 1e-6 first,
# then the natural log is taken.
# NOTE: the columns are overwritten, so running this cell twice applies the log twice.
for feature_name in features:
    raw_values=test[feature_name]
    test[feature_name]=np.log(raw_values.mask(raw_values<=0,0.000001))

In [None]:
# Preview the training features after the log transformation
newtrain.head()

In [None]:
# Distinct turbine ids present in the training data.
train['turbine_id'].unique()

In [None]:
# Distinct turbine ids present in the test data, for comparison with the training ids.
test['turbine_id'].unique()

In [None]:
# Target encoding table: each turbine_id mapped to the mean Target observed for it in train.
encoded=train['Target'].groupby(train['turbine_id']).mean().to_dict()

In [None]:
# Replace each turbine_id in train with its mean-Target encoding.
# NOTE: this overwrites the original ids. Running the cell a second time maps the
# already-encoded values through `encoded` again, and values that are not keys of the
# dict become NaN.
train['turbine_id']=train['turbine_id'].map(encoded)

In [None]:
# Copy the encoded turbine_id from train into the feature frame. newtrain was built
# without this column, so it is added as a new column here.
newtrain['turbine_id']=train['turbine_id']
# Encode the test ids with the table learned from train; an id that is not a key of
# `encoded` becomes NaN.
test['turbine_id']=test['turbine_id'].map(encoded)

In [None]:
# Preview the training features with the encoded turbine_id column added
newtrain.head()

# Feature Selection

In [None]:
# using RFE
from sklearn.feature_selection import RFECV

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Base estimator for recursive feature elimination. The seed is fixed so the feature
# ranking is reproducible between runs.
dtree=DecisionTreeRegressor(random_state=42)

In [None]:
# Recursive feature elimination with 5-fold cross-validation: one feature is removed per
# step, at least five are kept, and candidates are scored by negative MAPE.
selector=RFECV(
    estimator=dtree,
    step=1,
    min_features_to_select=5,
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
    verbose=5,
)

In [None]:
# Fit the selector and list the optimal features (left disabled; uncomment to re-run the search)
#selector.fit(newtrain,target)
# feat_names=list(selector.get_feature_names_out())
# print(feat_names)

In [None]:
# Keep only the six selected features for training
# (presumably the outcome of an earlier RFECV run -- TODO confirm).
rfetrain=newtrain[['nacelle_temp','ambient_temperature','turbine_id','hour','Day','Month']]

In [None]:
# Select the same six features, in the same order, from the test set.
rfetest=test[['nacelle_temp','ambient_temperature','turbine_id','hour','Day','Month']]

In [None]:
# Display the reduced test frame
rfetest

In [None]:
# Re-create the estimators, each with a fixed seed, for the reduced feature set.
rf = RandomForestRegressor(random_state=42)
etr = ExtraTreesRegressor(n_estimators=200, random_state=123)
lgbm = LGBMRegressor(random_state=494)
xgbr = XGBRegressor(random_state=500)
cbr = cboost.CatBoostRegressor(random_state=0)

In [None]:
# Train the random forest on the selected features, then predict the reduced test set.
rf.fit(rfetrain,target)
pred=rf.predict(rfetest)

In [None]:
# Build the second submission (one Target column) and write it to disk.
submission=pd.Series(pred,name='Target').to_frame()
submission.to_csv('Sub2.csv')

# Summary of the models
* Extracting the date and time parts and creating the new features gave us 0.0074 MAPE
* Applying encoding + log transformation + feature selection gave us 0.00316 MAPE
* Next steps: tune the hyperparameters of the random forest model and explore the LightGBM model