# **Brent Oil Price Prediction Model**

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Read the Brent dataset from a CSV path given relative to the notebook's working directory
df = pd.read_csv('../Datasets/brent.csv')

In [4]:
# Preview the first rows of the raw dataset to check the column layout
df.head()

Unnamed: 0,Date,OpenBrent,HighBrent,LowBrent,CloseBrent,VolumeBrent
0,2000-01-04,23.9,24.7,23.89,24.39,32509
1,2000-01-05,24.25,24.37,23.7,23.73,30310
2,2000-01-06,23.55,24.22,23.35,23.62,44662
3,2000-01-07,23.57,23.98,23.05,23.09,34826
4,2000-01-10,23.04,23.78,23.04,23.73,26388


In [5]:
# Check the shape of the dataset as (rows, columns)
df.shape

(5764, 6)

In [6]:
# Show the first rows again for reference before columns are removed
df.head()

Unnamed: 0,Date,OpenBrent,HighBrent,LowBrent,CloseBrent,VolumeBrent
0,2000-01-04,23.9,24.7,23.89,24.39,32509
1,2000-01-05,24.25,24.37,23.7,23.73,30310
2,2000-01-06,23.55,24.22,23.35,23.62,44662
3,2000-01-07,23.57,23.98,23.05,23.09,34826
4,2000-01-10,23.04,23.78,23.04,23.73,26388


In [7]:
# Remove the columns that are not used as model inputs.
# Rebinding `df` to the returned frame (rather than inplace=True) and ignoring
# labels that are already gone keeps this cell idempotent: running it a second
# time no longer raises KeyError.
df = df.drop(columns=['Date', 'VolumeBrent'], errors='ignore')

In [8]:
# Display the first few rows of the modified dataset
df.head()

Unnamed: 0,OpenBrent,HighBrent,LowBrent,CloseBrent
0,23.9,24.7,23.89,24.39
1,24.25,24.37,23.7,23.73
2,23.55,24.22,23.35,23.62
3,23.57,23.98,23.05,23.09
4,23.04,23.78,23.04,23.73


In [9]:
# Calculate the correlation matrix of the remaining price columns
df.corr()

Unnamed: 0,OpenBrent,HighBrent,LowBrent,CloseBrent
OpenBrent,1.0,0.999521,0.999396,0.998965
HighBrent,0.999521,1.0,0.999189,0.999434
LowBrent,0.999396,0.999189,1.0,0.999579
CloseBrent,0.998965,0.999434,0.999579,1.0


In [10]:
# Import the train/test split helper
from sklearn.model_selection import train_test_split

In [11]:
# Split into training and testing sets: CloseBrent is the target, every other
# remaining column is a feature, and 20% of the rows are held out for testing.
# NOTE(review): train_test_split shuffles rows unless told otherwise, so test rows
# are interleaved in time with training rows - confirm this is intended.
xTrain, xTest, yTrain, yTest = train_test_split(
    df.drop(columns='CloseBrent'),
    df['CloseBrent'],
    test_size=0.2,
    random_state=42,
)

In [12]:
# Import the regression models
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [13]:
# Initialize the models.
# The tree-based learners are seeded (same value as the train/test split) so the
# metrics printed further down are reproducible after a kernel restart; without
# a seed the Random Forest's scores change on every run.
model1 = LinearRegression()
model2 = XGBRegressor(random_state=42)
model3 = RandomForestRegressor(random_state=42)

In [14]:
# Train the models
# All three estimators are fitted on the same training split so their test
# metrics can be compared directly.
model1.fit(xTrain, yTrain)
model2.fit(xTrain, yTrain)
model3.fit(xTrain, yTrain)

In [15]:
# Score the held-out test features with each fitted model, in model order
yPred1, yPred2, yPred3 = (
    fitted.predict(xTest) for fitted in (model1, model2, model3)
)

In [16]:
# Import evaluation metrics - root mean squared error (RMSE)
from sklearn.metrics import mean_squared_error

In [17]:
# Report each model's root mean squared error (RMSE) on the test set; lower is better
for label, predictions in (
    ('Linear Regression: ', yPred1),
    ('XGBoost: ', yPred2),
    ('Random Forest: ', yPred3),
):
    print(label, np.sqrt(mean_squared_error(yTest, predictions)))

Linear Regression:  0.5682050325346023
XGBoost:  0.7018633752809779
Random Forest:  0.6570915929770714


In [18]:
# Import evaluation metrics -  R-squared (coefficient of determination)
from sklearn.metrics import r2_score

In [19]:
# Report each model's R-squared (coefficient of determination) on the test set
r2_labels = ('Linear Regression: ', 'XGBoost: ', 'Random Forest: ')
for label, predictions in zip(r2_labels, (yPred1, yPred2, yPred3)):
    print(label, r2_score(yTest, predictions))

Linear Regression:  0.9996312357822562
XGBoost:  0.9994373429265748
Random Forest:  0.9995068370483443


In [20]:
# Make a prediction with the Linear Regression model, which has the lowest RMSE
# and the highest R-squared of the three models on the test set.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# carrying the same column names; passing a bare nested list makes scikit-learn
# warn that X does not have valid feature names.
sample = pd.DataFrame([[23.9, 24.7, 23.89]], columns=xTrain.columns)
model1.predict(sample)



array([24.53337913])

In [21]:
from sklearn.metrics import accuracy_score

threshold = 24.0  # Adjust the threshold as per your requirement

# Binarise the regression output: 1 when the predicted close is at or above the threshold
yPred1_binary = [int(predicted >= threshold) for predicted in yPred1]

# Binarise the actual closes the same way
yTest_binary = [int(actual >= threshold) for actual in yTest]

# Share of test rows where prediction and actual land on the same side of the threshold
accuracy = accuracy_score(yTest_binary, yPred1_binary)
print("Accuracy:", accuracy)

Accuracy: 0.9991326973113617


In [22]:
# Save the trained model using pickle
import pickle

In [23]:
# Persist the fitted Linear Regression model.
# The context manager closes the file handle as soon as the dump finishes, so the
# pickle is flushed to disk and no open descriptor is left behind in the kernel.
with open("../Models/brent_prediction.pkl", 'wb') as model_file:
    pickle.dump(model1, model_file)

In [None]:
# Compare actual and predicted closing prices across the test set.
# yTest is a pandas Series that still carries its shuffled original row labels;
# plotting it directly uses those labels as x-coordinates, while the prediction
# arrays are drawn against 0..n-1. Converting it to a NumPy array puts all four
# lines on the same positional x-axis.
plt.plot(yTest.to_numpy(), label='Actual')
plt.plot(yPred1, label='Linear Regression')
plt.plot(yPred2, label='XGBoost')
plt.plot(yPred3, label='Random Forest')
plt.xlabel('Data Point')
# The target is CloseBrent; the previous 'Close Heating Value' label did not match it.
plt.ylabel('Close Brent Price')
plt.title('Comparison of Actual and Predicted Values')
plt.legend()
plt.show()

In [None]:
# Actual vs. predicted scatter: one point cloud per model, drawn in model order
scatter_series = (
    (yPred1, 'Linear Regression'),
    (yPred2, 'XGBoost'),
    (yPred3, 'Random Forest'),
)
for predictions, model_label in scatter_series:
    plt.scatter(yTest, predictions, label=model_label)

plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of Actual vs. Predicted Values')
plt.legend()
plt.show()

In [None]:
# Residual plot: residuals are computed as (predicted - actual) and drawn against
# the predicted value; the dashed line marks zero error.
residual_series = (
    (yPred1, 'Linear Regression'),
    (yPred2, 'XGBoost'),
    (yPred3, 'Random Forest'),
)
for predictions, model_label in residual_series:
    plt.scatter(predictions, predictions - yTest, label=model_label)

plt.axhline(y=0, color='black', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.legend()
plt.show()

In [None]:
# Overlay the distributions of the actual closes and each model's predictions.
# alpha=0.5 keeps the overlapping histograms visible through one another.
plt.hist(yTest, bins=30, alpha=0.5, label='Actual')
plt.hist(yPred1, bins=30, alpha=0.5, label='Linear Regression')
plt.hist(yPred2, bins=30, alpha=0.5, label='XGBoost')
plt.hist(yPred3, bins=30, alpha=0.5, label='Random Forest')
# The plotted quantity is CloseBrent; the previous 'Close Heating Value' label did not match it.
plt.xlabel('Close Brent Price')
plt.ylabel('Frequency')
plt.title('Histogram of Actual and Predicted Values')
plt.legend()
plt.show()

In [30]:
from sklearn.metrics import classification_report

# Threshold that turns the regression output into a two-class label
threshold = 24.0

# 1 when the value is at or above the threshold, 0 otherwise (kept as plain lists)
yPred1_binary = np.where(np.asarray(yPred1) >= threshold, 1, 0).tolist()
yTest_binary = np.where(np.asarray(yTest) >= threshold, 1, 0).tolist()

# Per-class precision / recall / F1 for the thresholded Linear Regression output
# NOTE(review): with this threshold the two classes may be heavily imbalanced,
# so read the per-class rows rather than the overall accuracy.
classification_rep = classification_report(yTest_binary, yPred1_binary)

# Display the classification report
print("Classification Report:")
print(classification_rep)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        44
           1       1.00      1.00      1.00      1109

    accuracy                           1.00      1153
   macro avg       0.99      1.00      0.99      1153
weighted avg       1.00      1.00      1.00      1153

