In [1]:
#Importing relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression  #Subclass Linear Regression in Class Linear Model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import datetime
import plotly.graph_objects as go

In [2]:
df = pd.read_csv("../input/brent-oil-prices/BrentOilPrices.csv")
df.describe(include = "all")

Unnamed: 0,Date,Price
count,8554,8554.0
unique,8554,
top,20-May-87,
freq,1,
mean,,46.352962
std,,32.165282
min,,9.1
25%,,18.85
50%,,33.24
75%,,66.21


In [3]:
df = df.reset_index()
df = df.dropna()
print(df.head())
print(df.tail())
print(df.dtypes)

   index       Date  Price
0      0  20-May-87  18.63
1      1  21-May-87  18.45
2      2  22-May-87  18.55
3      3  25-May-87  18.60
4      4  26-May-87  18.63
      index          Date  Price
8549   8549  Jan 19, 2021  55.38
8550   8550  Jan 20, 2021  55.66
8551   8551  Jan 21, 2021  55.68
8552   8552  Jan 22, 2021  55.22
8553   8553  Jan 25, 2021  55.44
index      int64
Date      object
Price    float64
dtype: object


In [4]:
df["Date"] = pd.to_datetime(df.Date)
df.dtypes

index             int64
Date     datetime64[ns]
Price           float64
dtype: object

#### Preprocessing the data

In [5]:
#Function to prepare training, testing and forecasting data
#Function arguments are dataframe, column to be forecasted, no. of days to be predicted in future,test data size
def prepare_ttfdata(df,forecast_col,forecast_out,test_size):
    label = df[forecast_col].shift(-forecast_out) #creating the label column and 
    #expanding the dataset by a given number of rows having value NaN(not a number)
    #print(f"Length of the Label is /n {label[15:]}")
    print(label[-15:])
    X = np.array(df[[forecast_col]]) #creating the feature array
    X = preprocessing.scale(X) #preprocessing the feature array to reduce the biasness among input features
    X_lately = X[-forecast_out:] #dropping the last n rows which are newly created for forecasting
    # and X_lately column will be used later in predicting method
    X = X[:-forecast_out] #training and test data
    label.dropna(inplace = True) #dropping the NA values
    y = np.array(label) #assigning the output y
    
    #Creating training and test data
    x_train = X[0:int(len(X)*(1-test_size))]
    x_test = X[-int(len(X)*test_size):]
    y_train = y[0:int(len(y)*(1-test_size))]
    y_test = y[-int(len(y)*test_size):]
    response = [x_train,x_test,y_train,y_test,X_lately]
    #list of lists : list comprising of training i/p, test set, training o/p, test o/p, final 10 days that we need to predict as forecasting values
    return response


In [6]:
forecast_col = 'Price' #column to be forecasted
forecast_out = 10 # how far to forecast i.e forecasting for 10 days
test_size = 0.2 # size of test data

x_train,x_test,y_train,y_test,X_lately = prepare_ttfdata(df,forecast_col,forecast_out,test_size)

8539    55.38
8540    55.66
8541    55.68
8542    55.22
8543    55.44
8544      NaN
8545      NaN
8546      NaN
8547      NaN
8548      NaN
8549      NaN
8550      NaN
8551      NaN
8552      NaN
8553      NaN
Name: Price, dtype: float64


In [7]:
def train_model(model_class,x_train,y_train,x_test,y_test,X_lately):
    learner = model_class #Algorithm we want to initialise
    learner.fit(x_train,y_train) #learning happens here when the model is trained (training input and output variables)
    #y = f(x), function in the LR equation is given by learner.fit(x_train,y_train)
    score = learner.score(x_test,y_test) #testing the model
    
    #Prediction: We'll predict for a given set of dates
    #Forecast: We'll forecast for dates which are in Future
    
    forecast = learner.predict(X_lately) #set that'll contain forcasted data
    
    response = {} #creating json object/dictionary
    response['test_score'] = score #first key is test_score
    #measures the squared error b/w actual value and predicted value and provides a score b/w 0 and 1
    response['forecast_set'] = forecast #passing the forecasted values
    print(response)
    
    return learner

In [8]:
learner = train_model(model_class = LinearRegression(),x_train = x_train,y_train = y_train
                      ,x_test = x_test,y_test = y_test,X_lately = X_lately)

{'test_score': 0.9338452756377739, 'forecast_set': array([56.07999625, 55.62115149, 55.86054875, 54.90295968, 54.3144414 ,
       55.48150308, 55.76079989, 55.78074966, 55.3219049 , 55.5413524 ])}


For Linear Regression:
*     93% is RMSE (lesser the RSME, closer to 0 , better the results)
*     Practically we'll not be able to predict exactly similar value
*     Any number can have infinite decimal values 

For classification : 
*     93% is accuracy
*     For spam or genuine email, 93% is likely to be score of 1 i.e. accurate

#### Plotting the interactive graphs

In [9]:
fig = go.Figure()
#add_trace: plotting line graph 
#Scatter: plots points b/w 2 points where y is in date format
#plotting training data represented by teal
fig.add_trace(go.Scatter(x = df["Date"], y = df["Price"],
                        mode = 'lines',
                        name = 'Price', line = dict(color='teal')))
#plotting prediction of date represented by blue
#green colored is actual value and blue color is predicted value
fig.add_trace(go.Scatter(x = df["Date"][-len(x_test):], y = learner.predict(x_test),
                        mode = 'lines',
                        name = 'Predicted Price', line = dict(color='blue')))
#plotting forecast data represented by red (future forecast of 10 days)
fig.add_trace(go.Scatter(x = pd.date_range('2022-05-15','2022-05-24',
                                           freq = pd.tseries.offsets.BDay()),
                        y = learner.predict(X_lately),
                        mode = 'lines',
                        name = 'Forecasted Price', line = dict(color='yellow')))
#DateOffset: Standard kind of date increment used for a date range
#BDay: DateOffset subclass representing possibly n business days

fig.update_layout(title = 'Brent Oil Prediction & Forecasting over 10 days',
                 plot_bgcolor = 'black',
                 xaxis = dict(showgrid = False), yaxis = dict(showgrid = False),
                 xaxis_title = "Date",
                 yaxis_title = "Brent Oil Prices ($)")

fig.show()

In [10]:
# y = b0 + b1*x
print(f"The intercept for LR equation is : {learner.intercept_}")
print(f"The slope for LR equation is : {learner.coef_}")

The intercept for LR equation is : 46.47713578207336
The slope for LR equation is : [32.08262758]


#### y_hat = learner.intercept_ + learner.coef_ * X

##### Interpretation 1
* Sign of slope is +ve, with increase in values of X, y_hat will increase
* Sign of slope is -ve, with increase in yesterday's, today's value is likely to decrease

##### Interpretation 2
* With increase in 1 unit or 1$ in price, the chances of increase in today's value is an increase by 32.08