In [23]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import t
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [24]:
# Read T1.csv (path relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="T1.csv")

In [25]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
0,01 01 2018 00:00,380.047791,5.311336,416.328908,259.994904
1,01 01 2018 00:10,453.769196,5.672167,519.917511,268.641113
2,01 01 2018 00:20,306.376587,5.216037,390.900016,272.564789
3,01 01 2018 00:30,419.645904,5.659674,516.127569,271.258087
4,01 01 2018 00:40,380.650696,5.577941,491.702972,265.674286


In [26]:
# Keep only rows in which every measurement is non-negative; a row with any
# negative (or missing) value in these columns is dropped.
non_negative_cols = [
    "Wind Speed (m/s)",
    "LV ActivePower (kW)",
    "Theoretical_Power_Curve (KWh)",
    "Wind Direction (°)",
]
# .copy() gives `data` its own memory, so later cells that modify `data` do not
# operate on a slice of the loaded frame (which can raise SettingWithCopyWarning).
data = data[(data[non_negative_cols] >= 0).all(axis=1)].copy()

# In this part we remove the rows that contain a value below 0, because every measurement must be greater than or equal to 0.

# Creating dummy features

In [27]:
# List each column's dtype to spot the non-numeric columns.
data.dtypes

Date/Time                         object
LV ActivePower (kW)              float64
Wind Speed (m/s)                 float64
Theoretical_Power_Curve (KWh)    float64
Wind Direction (°)               float64
dtype: object

In [28]:
# Pull out just the object-typed columns of `data` into their own frame.
dfo = data.select_dtypes(include="object")

In [29]:
# Preview the first five rows of the object-typed columns.
dfo.head(n=5)

Unnamed: 0,Date/Time
0,01 01 2018 00:00
1,01 01 2018 00:10
2,01 01 2018 00:20
3,01 01 2018 00:30
4,01 01 2018 00:40


In [30]:
# One-hot encode the object-typed columns and join them to the remaining columns.
# Date/Time holds a distinct string per row, so this yields about one dummy
# column per row; sparse=True stores only the non-zero entries instead of
# allocating a dense rows x rows block.
# The columns to drop are named explicitly rather than passing the frame itself.
df = pd.concat(
    [
        data.drop(columns=dfo.columns),
        pd.get_dummies(dfo, sparse=True),
    ],
    axis=1,
)

In [31]:
# Preview the first five rows of the dummy-encoded frame.
df.head(n=5)

Unnamed: 0,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°),Date/Time_01 01 2018 00:00,Date/Time_01 01 2018 00:10,Date/Time_01 01 2018 00:20,Date/Time_01 01 2018 00:30,Date/Time_01 01 2018 00:40,Date/Time_01 01 2018 00:50,...,Date/Time_31 12 2018 22:20,Date/Time_31 12 2018 22:30,Date/Time_31 12 2018 22:40,Date/Time_31 12 2018 22:50,Date/Time_31 12 2018 23:00,Date/Time_31 12 2018 23:10,Date/Time_31 12 2018 23:20,Date/Time_31 12 2018 23:30,Date/Time_31 12 2018 23:40,Date/Time_31 12 2018 23:50
0,380.047791,5.311336,416.328908,259.994904,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,453.769196,5.672167,519.917511,268.641113,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,306.376587,5.216037,390.900016,272.564789,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,419.645904,5.659674,516.127569,271.258087,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,380.650696,5.577941,491.702972,265.674286,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Check the (rows, columns) size of the dummy-encoded frame.
df.shape

(50473, 50477)

In [33]:
# Our data has no categorical columns: Date/Time is a timestamp that is unique to each row, so get_dummies
# only creates one extra column per row (see df.shape above). Using it makes no sense in this project.

In [34]:
# Drop the Date/Time column so that only the numeric columns remain.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns="Date/Time", errors="ignore")

In [35]:
# Preview the first five rows after removing the Date/Time column.
data.head(n=5)

Unnamed: 0,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
0,380.047791,5.311336,416.328908,259.994904
1,453.769196,5.672167,519.917511,268.641113
2,306.376587,5.216037,390.900016,272.564789
3,419.645904,5.659674,516.127569,271.258087
4,380.650696,5.577941,491.702972,265.674286


In [37]:
# Sanity check: show the Python type of `data`.
type(data)

pandas.core.frame.DataFrame

# Standardization (StandardScaler)

In [38]:
# Standardize every column of `data` with StandardScaler.
# NOTE(review): the scaler is fitted on all rows (and on the target column)
# before the train/test split made further down, so statistics from the future
# test rows influence the scaling. Consider fitting on the training rows only.
scaler = StandardScaler()

scaled_features = scaler.fit_transform(data)

# Rebuild a labelled frame from the transformed values. Passing index=data.index
# keeps the row labels of `data`, so scaled rows can still be matched to their
# source rows.
scaled_data = pd.DataFrame(scaled_features, columns=data.columns, index=data.index)

# Rich display of the first rows (last expression of the cell) instead of print().
scaled_data.head()


   LV ActivePower (kW)  Wind Speed (m/s)  Theoretical_Power_Curve (KWh)  \
0            -0.707923         -0.532828                      -0.787732   
1            -0.651752         -0.447454                      -0.712003   
2            -0.764055         -0.555376                      -0.806322   
3            -0.677752         -0.450410                      -0.714773   
4            -0.707464         -0.469748                      -0.732629   

   Wind Direction (°)  
0            1.459106  
1            1.551633  
2            1.593621  
3            1.579638  
4            1.519883  


In [39]:
# Summary statistics of the scaled columns, to check that the means are ~0 and the standard deviations ~1.
scaled_data.describe()

Unnamed: 0,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
count,50473.0,50473.0,50473.0,50473.0
mean,-2.702915e-17,-9.009715e-18,-7.207772000000001e-17,-2.950682e-16
std,1.00001,1.00001,1.00001,1.00001
min,-0.997494,-1.789513,-1.092095,-1.323211
25%,-0.9573093,-0.7933185,-0.97287,-0.7955788
50%,-0.366608,-0.1068105,-0.3119332,-0.5349401
75%,0.8954262,0.6483174,1.076802,0.8350957
max,1.75974,4.174336,1.53973,2.529277


# Split data into training and testing subsets

In [46]:
# Separate the target column from the feature columns.
target_col = "LV ActivePower (kW)"
y = scaled_data[[target_col]]  # double brackets keep y as a one-column DataFrame
X = scaled_data.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
)

# The data were split as above because LV ActivePower (kW) is our target: these are the values we are going to predict.
# For this reason, the target (y) is separated from the remaining feature columns (X) before the train/test split.