In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import accuracy_score,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline
warnings.filterwarnings('ignore')

# Load the auction and demand dataset from the CSV in the working directory.
data = pd.read_csv(filepath_or_buffer='Auction_and_demand_data_MAIN.csv')

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# sns.pairplot(data)

In [None]:
# Joint distribution of NSL_FLOW against the target, with a fitted regression line.
sns.jointplot(
    data=data,
    x='NSL_FLOW',
    y='Average base prices',
    kind='reg',
)

# Plotting the flow of data over time

In [None]:
# Parse 'Date' into timezone-aware UTC timestamps and make it the index so the
# frame can be plotted and sliced as a time series.
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
# now the default behaviour), so it is no longer passed.
# The guard keeps the cell re-runnable: once 'Date' is the index it is no
# longer a column, and a second run would otherwise raise a KeyError.
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'], utc=True)
    data = data.set_index('Date')

In [None]:
def plot_series(df=None, column=None, series=None,
                label=None, ylabel=None, title=None, start=0, end=None):
    """Draw a DataFrame column and/or a standalone series on one wide figure.

    Parameters
    ----------
    df : DataFrame-like, optional
        Source frame; required when ``column`` is given.
    column : hashable, optional
        Label of the column in ``df`` to plot. Any label other than ``None``
        is accepted (including falsy ones such as ``0``).
    series : sequence-like, optional
        Extra series to plot as-is. ``None`` or an empty series is skipped.
        A series made only of zeros is plotted (the previous ``series.any()``
        check silently dropped it).
    label : str, optional
        Legend label applied to every plotted line; a legend is drawn only
        when a label is given.
    ylabel : str, optional
        Y-axis label, applied when at least one line is plotted.
    title : str, optional
        Figure title.
    start, end : optional
        Bounds of the slice ``df[column][start:end]``.

    Returns
    -------
    The axes object the lines were drawn on.

    Raises
    ------
    ValueError
        If ``column`` is given without ``df``.
    """
    if column is not None and df is None:
        raise ValueError("`df` must be provided when `column` is given")

    sns.set()
    fig, ax = plt.subplots(figsize=(30, 12))
    ax.set_xlabel('Time', fontsize=16)

    drew_line = False
    if column is not None:
        ax.plot(df[column][start:end], label=label)
        drew_line = True
    # Test for emptiness with len(): truthiness of the values must not decide
    # whether the series is drawn.
    if series is not None and len(series) > 0:
        ax.plot(series, label=label)
        drew_line = True
    if drew_line:
        ax.set_ylabel(ylabel, fontsize=16)

    if label:
        ax.legend(fontsize=16)
    if title:
        ax.set_title(title, fontsize=24)
    ax.grid(True)
    return ax

In [None]:
# Plot the '1_Hour' column across the whole datetime index.
hourly_plot_options = dict(
    df=data,
    column='1_Hour',
    ylabel='Average base prices',
    title='cost of price per hour',
)
ax = plot_series(**hourly_plot_options)
plt.show()

# Checking for correlation between the target (Average base prices) and the other features

In [None]:
# Compute the pairwise correlation matrix once and reuse it; the original cell
# computed it twice and discarded the first result.
corr_matrix = data.corr()

# Show only the columns from position 24 onwards.
# NOTE(review): presumably this is where the non-hourly columns (including the
# target) start; confirm against `data.columns`.
corr_matrix.iloc[:, 24:]

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis='index', how='any')

In [None]:
# data['NSL_FLOW'].value_counts()

In [None]:
# data['Average base prices'].isna().sum()

# Manually selecting the features for prediction

In [None]:
# Columns left out of the feature matrix by hand, plus the target itself.
excluded_columns = [
    'PUMP_STORAGE_PUMPING', 'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW',
    'EAST_WEST_FLOW', 'NEMO_FLOW', 'ELECLINK_FLOW', 'Average base prices', 'Week',
    'Month', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION',
    'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION', 'EMBEDDED_SOLAR_CAPACITY',
    'NON_BM_STOR',
]

# 'Date' is moved into the index by the earlier `set_index('Date')` cell, so
# asking `drop` for it as a column raises a KeyError on a fresh run. Drop it
# separately and only if it is still a column; the other names stay strict so
# that a typo in the list still fails loudly.
X = data.drop(columns=excluded_columns).drop(columns=['Date'], errors='ignore')
y = data['Average base prices']

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

# checking the dimension of the data with plot

In [None]:
# One line plot of the target against each of the two features of interest.
for feature_name in ('1_Hour', 'NSL_FLOW'):
    data.plot.line(x=feature_name, y='Average base prices')

In [None]:
# Print a summary of the feature matrix: column names, dtypes and non-null counts.
X.info()

# Standardizing the data in order to bring all features onto the same scale

In [None]:
# Standardize every feature to zero mean and unit variance.
# The original cell called `fit_transform` without keeping the result, so the
# scaled values were lost as soon as the cell finished. Keep them in a named
# frame with the original column labels and index.
# Note: the split cell that follows still uses the unscaled `X`; the models in
# this notebook are tree-based, which do not depend on feature scale.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_scaled.head()

# Splitting the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
    test_size=0.2,
)

In [None]:
# Report the shapes of the feature and target arrays for each side of the split.
# The second line was previously labelled 'x_train' although it prints the test shapes.
print(f'x_train: {x_train.shape}, {y_train.shape}')
print(f'x_test: {x_test.shape}, {y_test.shape}')

# Using RandomForest to train the model 

In [None]:
# Random forest with default hyper-parameters; the seed makes the fit reproducible.
model = RandomForestRegressor(random_state=8)

# Fitting the model

In [None]:
# Train the forest on the training portion of the data.
model.fit(X=x_train, y=y_train)

In [None]:
# R^2 on the data the model was trained on.
model.score(X=x_train, y=y_train)

In [None]:
# R^2 on the held-out test rows.
model.score(X=x_test, y=y_test)

In [None]:
# Predict the target for the held-out rows and display the predictions.
y_pred = model.predict(X=x_test)
y_pred

In [None]:
# Root-mean-squared error of the random forest on the test set.
# `squared=False` is deprecated in recent scikit-learn releases and later
# removed, so take the square root of the MSE explicitly; this form works on
# every version.
np.sqrt(mean_squared_error(y_test, y_pred))

# Using Feature selection to get us the best features to use for prediction

In [None]:
# Keep the 10 features with the strongest univariate relationship to the target.
# SelectKBest defaults to `f_classif`, a classification score that treats every
# distinct target value as its own class; the target here is continuous, so
# the regression F-test is the appropriate scoring function.
sel = SelectKBest(score_func=f_regression, k=10)

In [None]:
# Fit the selector on the training rows only. Fitting on the full `X`, `y`
# lets the held-out test rows influence which features are chosen, which
# leaks information into the later evaluation.
sel.fit(x_train, y_train)

In [None]:
# Boolean mask over the fitted feature columns: True where the selector kept the feature.
sel.get_support()

In [None]:
# Translate the selector's boolean mask into the names of the retained columns.
kept_mask = sel.get_support()
X.columns[kept_mask]

In [None]:
# Build the reduced feature matrix from whatever the fitted selector kept,
# instead of a hand-copied list of column names that silently goes stale when
# the data, `k`, or the scoring function changes.
selected_features = X.columns[sel.get_support()]
new_x = X[selected_features]

In [None]:
# Preview the first five rows of the reduced feature matrix.
new_x.head(n=5)

In [None]:
# Split the reduced feature matrix with the same test size and seed as the full split.
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(
    new_x,
    y,
    random_state=2,
    test_size=0.2,
)

# Building a Decision Tree model for prediction

In [None]:
# Depth-limited decision tree; the seed makes the fit reproducible.
model2 = DecisionTreeRegressor(max_depth=6, random_state=4)

In [None]:
# Train the tree on the training portion of the reduced feature set.
model2.fit(X=new_x_train, y=new_y_train)

In [None]:
# Predict the target for the held-out rows of the reduced feature set and display the result.
new_y_pred = model2.predict(X=new_x_test)
new_y_pred

In [None]:
# Root-mean-squared error of the decision tree on its own test split.
# Score against `new_y_test`, the targets produced by the same split as
# `new_x_test`, rather than `y_test` from the earlier split of the full
# feature matrix. `squared=False` is deprecated in recent scikit-learn
# releases and later removed, so take the square root explicitly.
np.sqrt(mean_squared_error(new_y_test, new_y_pred))

In [None]:
# R^2 of the decision tree: training score is printed, test score is the cell output.
tree_train_r2 = model2.score(new_x_train, new_y_train)
print(tree_train_r2)
model2.score(new_x_test, new_y_test)