In [1]:
pip install ipynb-py-convert

Collecting ipynb-py-convert
  Downloading ipynb-py-convert-0.4.6.tar.gz (3.9 kB)
Building wheels for collected packages: ipynb-py-convert
  Building wheel for ipynb-py-convert (setup.py): started
  Building wheel for ipynb-py-convert (setup.py): finished with status 'done'
  Created wheel for ipynb-py-convert: filename=ipynb_py_convert-0.4.6-py3-none-any.whl size=4630 sha256=8c91d3f5e8b115e8a2c1e8425a5f88e7a6c10aeb2e8f389304aee79628e77d8c
  Stored in directory: c:\users\1\appdata\local\pip\cache\wheels\af\31\a9\761b134adbbca3c92d491eff3bad785c0a0c0079695d6f0504
Successfully built ipynb-py-convert
Installing collected packages: ipynb-py-convert
Successfully installed ipynb-py-convert-0.4.6
Note: you may need to restart the kernel to use updated packages.


## Crude Oil Price Forecasting using timeseries analysis

data source : https://www.investing.com/commodities/crude-oil-historical-data

model type : recurrent neural network (RNN) with long short term memory (LSTM)

pre-requisite : https://chromedriver.chromium.org/downloads 

In [1]:
# import necessary libraries
import pandas as pd
from selenium import webdriver
import time
import matplotlib.pyplot as plt
import numpy as np

### Data Mining

In [2]:
# to scrap a dynamic webpage , selenimum webdriver is required
# for the function to work on other system, download webdriver for your platform from 
# https://chromedriver.chromium.org/downloads 
# unzip the downloaded file and place the chromewebdriver and copy the path
# Update the line browser = webdriver.Chrome('path_to_webdriver')

# following function scraps webdata and return pandas dataframe
def scrap_historical_data():
    # instantiate chrome webdriver
    browser = webdriver.Chrome(r'C:\Users\1\Desktop\קורס DS\שיעור 1\demos\demos\chromedriver.exe')
    url = 'https://www.investing.com/commodities/crude-oil-historical-data'

    browser.get(url)
    # add wait time for the webpage to load
    time.sleep(10)
    # remove any popups
    try:
        popup = browser.find_element_by_xpath('//*[@id="PromoteSignUpPopUp"]/div[2]/i')
        popup.click()
    except:
        pass
    field = browser.find_element_by_xpath('//*[@id="widgetFieldDateRange"]')
    field.click()
    # add wait time for the webpage to load
    time.sleep(2)
    # set date limit to which historical data is considered
    start_date = browser.find_element_by_xpath('//*[@id="startDate"]')
    start_date.clear()
    start_date.send_keys('01/01/2000')
    enter_key = browser.find_element_by_xpath('//*[@id="applyBtn"]')
    enter_key.click()
    # add wait time for the webpage to load
    time.sleep(10)
    # remove any popups
    try:
        popup = browser.find_element_by_xpath('//*[@id="PromoteSignUpPopUp"]/div[2]/i')
        popup.click()
    except:
        pass
    # load the data from webpage
    try:
        tabel = browser.find_element_by_xpath('//*[@id="curr_table"]').get_attribute('outerHTML')
    except:
        # remove any popups
        try:
            popup = browser.find_element_by_xpath('//*[@id="PromoteSignUpPopUp"]/div[2]/i')
            popup.click()
        except:
            pass
        # load the data from webpage
        tabel = browser.find_element_by_xpath('//*[@id="curr_table"]').get_attribute('outerHTML')
    df  = pd.read_html(tabel)
    dataset = df[0]
    # save a copy of data scarpped into local directory
    dataset.to_csv('crude_oil_price_history.csv')
    return dataset

In [3]:
# invoke scrapping function
main_dataset = scrap_historical_data()
# parse data in scrapped data to standard timestamp format
main_dataset.loc[:,'Date'] = pd.to_datetime(main_dataset.loc[:,'Date'])
# Drop columns that are not needed to do prediction
main_dataset.drop(columns=['Open','High','Low','Vol.','Change %'], inplace=True, axis=1)
# Display the sample to see how dataframe looks now
main_dataset.head()

Unnamed: 0,Date,Price
0,2019-08-23,54.17
1,2019-08-22,55.35
2,2019-08-21,55.68
3,2019-08-20,56.34
4,2019-08-19,56.21


### Data Cleansing

In [4]:
# set index of the datafarme as timestamp
cleaned_dataset = main_dataset.set_index('Date')
# downsample the data to fix the data frequency 
# dataframe now contains crudeoil price for once in 3 days
# upsample is not done since it will add noise to the data
resample = cleaned_dataset.resample('3D').ffill()

# isolate test data from training data
real_data = resample.iloc[-12:,:]
test_data = resample.iloc[-24:-12,:]

# create the masked sample which doesn't contain the test data
masked_resample = resample.iloc[:-24,:]

In [5]:
real_data.head()

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2019-07-20,55.63
2019-07-23,56.77
2019-07-26,56.2
2019-07-29,56.87
2019-08-01,53.95


In [6]:
test_data.head()

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2019-06-14,52.51
2019-06-17,51.93
2019-06-20,56.65
2019-06-23,57.43
2019-06-26,59.38


### Feature Engineering

In [7]:
# adding sliding window for creation of forecasting model
# the forecasting model will take 12 previous data points of crude 
# oil price and forecast future 12 datapoints
masked_resample['Price+1'] = masked_resample['Price'].shift(-1)
masked_resample['Price+2'] = masked_resample['Price'].shift(-2)
masked_resample['Price+3'] = masked_resample['Price'].shift(-3)
masked_resample['Price+4'] = masked_resample['Price'].shift(-4)
masked_resample['Price+5'] = masked_resample['Price'].shift(-5)
masked_resample['Price+6'] = masked_resample['Price'].shift(-6)
masked_resample['Price+7'] = masked_resample['Price'].shift(-7)
masked_resample['Price+8'] = masked_resample['Price'].shift(-8)
masked_resample['Price+9'] = masked_resample['Price'].shift(-9)
masked_resample['Price+10'] = masked_resample['Price'].shift(-10)
masked_resample['Price+11'] = masked_resample['Price'].shift(-11)
masked_resample['Price+12'] = masked_resample['Price'].shift(-12)
masked_resample['Price+13'] = masked_resample['Price'].shift(-13)
masked_resample['Price+14'] = masked_resample['Price'].shift(-14)
masked_resample['Price+15'] = masked_resample['Price'].shift(-15)
masked_resample['Price+16'] = masked_resample['Price'].shift(-16)
masked_resample['Price+17'] = masked_resample['Price'].shift(-17)
masked_resample['Price+18'] = masked_resample['Price'].shift(-18)
masked_resample['Price+19'] = masked_resample['Price'].shift(-19)
masked_resample['Price+20'] = masked_resample['Price'].shift(-20)
masked_resample['Price+21'] = masked_resample['Price'].shift(-21)
masked_resample['Price+22'] = masked_resample['Price'].shift(-22)
masked_resample['Price+23'] = masked_resample['Price'].shift(-23)
# drop the trailing row which contains NaN and cleanup the masked sample
masked_resample.dropna(inplace=True)
masked_resample.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

Unnamed: 0_level_0,Price,Price+1,Price+2,Price+3,Price+4,Price+5,Price+6,Price+7,Price+8,Price+9,...,Price+14,Price+15,Price+16,Price+17,Price+18,Price+19,Price+20,Price+21,Price+22,Price+23
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-04,25.55,24.22,24.67,26.69,28.02,29.54,28.2,28.28,27.22,27.64,...,30.06,29.51,29.51,29.97,30.35,31.77,31.51,34.13,31.76,32.02
2000-01-07,24.22,24.67,26.69,28.02,29.54,28.2,28.28,27.22,27.64,28.03,...,29.51,29.51,29.97,30.35,31.77,31.51,34.13,31.76,32.02,31.09
2000-01-10,24.67,26.69,28.02,29.54,28.2,28.28,27.22,27.64,28.03,28.82,...,29.51,29.97,30.35,31.77,31.51,34.13,31.76,32.02,31.09,30.91
2000-01-13,26.69,28.02,29.54,28.2,28.28,27.22,27.64,28.03,28.82,28.77,...,29.97,30.35,31.77,31.51,34.13,31.76,32.02,31.09,30.91,27.46
2000-01-16,28.02,29.54,28.2,28.28,27.22,27.64,28.03,28.82,28.77,29.44,...,30.35,31.77,31.51,34.13,31.76,32.02,31.09,30.91,27.46,28.02


### Feature Normalization

In [8]:
#normalize all the features for the machine learning model to learn fast
from sklearn.preprocessing import MinMaxScaler
sc_X = MinMaxScaler()
sc_y = MinMaxScaler()
X = sc_X.fit_transform(masked_resample.iloc[:,0:12])
y = sc_y.fit_transform(masked_resample.iloc[:,12:])

### Define Independant and Dependant Variables

In [9]:
# reshape the input features (X) to a 3D input for the RNN (recurrent neural network) to feed on
X_train = X.reshape(X.shape[0],1,X.shape[1])
y_train = y

### Deep Learning Model Definition

In [10]:
# Recurrent Neural Netowrk (RNN) with Long Short Term Memory (LSTM)
# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# Initialising the RNN
regressor = Sequential()

# Adding the input layer and the LSTM layer
regressor.add(LSTM(units = 30, return_sequences = True, input_shape = (None, X_train.shape[2])))
# Adding a second LSTM layer
regressor.add(LSTM(units = 30, return_sequences = True))
# Adding a third LSTM layer
regressor.add(LSTM(units = 30, return_sequences = True))
# Adding a fourth LSTM layer
regressor.add(LSTM(units = 30))
# Adding the output layer
regressor.add(Dense(units = y_train.shape[1]))

# Compiling the RNN
regressor.compile(optimizer = 'rmsprop', loss = 'mean_squared_error')
regressor.summary()

ModuleNotFoundError: No module named 'keras'

### Model Training

In [None]:
# Fitting the RNN to the Training set (actual training happens here)
regressor.fit(X_train, y_train, epochs = 32, batch_size = 64)

### Model Testing and Test Result Vizualization

In [None]:
# do prediction on test data
test_input = sc_X.transform(np.array(test_data['Price']).reshape(1,-1))
prediction = sc_y.inverse_transform(regressor.predict(test_input.reshape(test_input.shape[0],1,test_input.shape[1])))

In [None]:
# vizulaization on test data prediction
real_data['prediction'] = prediction.reshape(-1,1)
plotting = resample[['Price']]
plotting['predicted'] = real_data['prediction'] 
plotting.columns = ['real price', 'predicted price']
plotting.iloc[-100:,:].plot.line()
plt.show

### Forecast for next 12 datapoints

In [None]:
letest_prices = sc_X.transform(np.array(real_data['Price']).reshape(1,-1))
forecast = sc_y.inverse_transform(regressor.predict(letest_prices.reshape(test_input.shape[0],1,letest_prices.shape[1])))

### Forecast Vizualization

In [None]:
forecast_df = pd.DataFrame(forecast.reshape(-1,1))
forecast_df.index = real_data.index + 12
forecast_df.columns=['prediction']
forecast_df['Price'] = np.nan
f_plotting = resample[['Price']]
f_plotting['prediction'] = np.nan
f_plotting = pd.concat([f_plotting, forecast_df], axis=0)
f_plotting.iloc[-100:,:].plot.line()
plt.show

In [None]:
forecast_df.drop(columns=['Price'], axis=1).to_csv('forecast.csv')