In [85]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import t
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [86]:
data = pd.read_csv("T1.csv")

In [87]:
data.tail()

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
50525,31 12 2018 23:10,2963.980957,11.40403,3397.190793,80.502724
50526,31 12 2018 23:20,1684.353027,7.332648,1173.055771,84.062599
50527,31 12 2018 23:30,2201.106934,8.435358,1788.284755,84.7425
50528,31 12 2018 23:40,2515.694092,9.421366,2418.382503,84.297913
50529,31 12 2018 23:50,2820.466064,9.979332,2779.184096,82.27462


In [88]:
#data = data[(data["Wind Speed (m/s)"] >= 0) & (data["LV ActivePower (kW)"] >= 0) & (data["Theoretical_Power_Curve (KWh)"]>=0) & (data["Wind Direction (°)"]>=0)]

# In this part we get rid of the values which under 0 because all the data must be greater than 0 or equal to 0
# theoretically, these values cannot be less than 0

In [89]:

data['Date/Time'] = pd.to_datetime(data['Date/Time'])


print(data['Date/Time'].head())

# Extract month, day of the week, and year
data['Month'] = data['Date/Time'].dt.month
data['DayOfWeek'] = data['Date/Time'].dt.dayofweek
data['Year'] = data['Date/Time'].dt.year
data.drop(columns=['Date/Time'], inplace=True)


0   2018-01-01 00:00:00
1   2018-01-01 00:10:00
2   2018-01-01 00:20:00
3   2018-01-01 00:30:00
4   2018-01-01 00:40:00
Name: Date/Time, dtype: datetime64[ns]


In [90]:
print(data['DayOfWeek'].unique())

[0 3 6 1 4 2 5]


In [91]:
data.head()

Unnamed: 0,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°),Month,DayOfWeek,Year
0,380.047791,5.311336,416.328908,259.994904,1,0,2018
1,453.769196,5.672167,519.917511,268.641113,1,0,2018
2,306.376587,5.216037,390.900016,272.564789,1,0,2018
3,419.645904,5.659674,516.127569,271.258087,1,0,2018
4,380.650696,5.577941,491.702972,265.674286,1,0,2018


In [92]:
data.dtypes

LV ActivePower (kW)              float64
Wind Speed (m/s)                 float64
Theoretical_Power_Curve (KWh)    float64
Wind Direction (°)               float64
Month                              int64
DayOfWeek                          int64
Year                               int64
dtype: object

In [93]:
dummy = pd.get_dummies(data[["Month","DayOfWeek"]])
dummy.head()
# These values are already in categorical form. We dont need get_dummies function of pandas.

Unnamed: 0,Month,DayOfWeek
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


# Standarization (StandartScaler)

In [94]:
scaler = StandardScaler()

scaled_features = scaler.fit_transform(data)

scaled_data = pd.DataFrame(scaled_features,columns=data.columns)

print(scaled_data.head())


   LV ActivePower (kW)  Wind Speed (m/s)  Theoretical_Power_Curve (KWh)  \
0            -0.706800         -0.531476                      -0.786435   
1            -0.650629         -0.446115                      -0.710712   
2            -0.762932         -0.554021                      -0.805023   
3            -0.676628         -0.449071                      -0.713483   
4            -0.706340         -0.468406                      -0.731337   

   Wind Direction (°)     Month  DayOfWeek  Year  
0            1.458725 -1.639044  -1.488416   0.0  
1            1.551254 -1.639044  -1.488416   0.0  
2            1.593244 -1.639044  -1.488416   0.0  
3            1.579260 -1.639044  -1.488416   0.0  
4            1.519504 -1.639044  -1.488416   0.0  


In [95]:
scaled_data.describe()

Unnamed: 0,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°),Month,DayOfWeek,Year
count,50530.0,50530.0,50530.0,50530.0,50530.0,50530.0,50530.0
mean,8.324585e-17,-1.349933e-16,-4.499776e-18,2.5873710000000003e-17,2.069897e-16,-1.392118e-16,0.0
std,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,0.0
min,-0.9982548,-1.787965,-1.090768,-1.323671,-1.639044,-1.488416,0.0
25%,-0.9577584,-0.7940521,-0.9728382,-0.7959106,-0.7554798,-0.9874292,0.0
50%,-0.367136,-0.1072499,-0.3131562,-0.5348146,0.1280845,0.01454478,0.0
75%,0.8951401,0.648684,1.076602,0.8348332,1.011649,1.016519,0.0
max,1.760871,4.174956,1.540802,2.528927,1.600692,1.517506,0.0


# Split data into training and testing subsets

In [96]:
X = scaled_data.drop(["LV ActivePower (kW)"],axis=1)
y = scaled_data[["LV ActivePower (kW)"]]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 123)

# Data were splitting like the above because LV ActivePower (kW) is our target. we are going to predict these values.'
# Due to this reason, Data were splitted like that.'

In [97]:
X_train.head()

Unnamed: 0,Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°),Month,DayOfWeek,Year
37617,-0.337097,-0.601163,1.805038,0.717127,-1.488416,0.0
21282,-0.255487,-0.507873,-0.905304,-1.344523,-0.987429,0.0
64,-0.614418,-0.852164,1.132778,-1.639044,-1.488416,0.0
34178,1.735378,1.540802,-0.610201,0.422606,0.515532,0.0
45471,3.040045,1.540802,0.761748,1.30617,-1.488416,0.0
