In [1]:
# Import dependencies
import datetime as dt
import os

import pandas as pd

In [2]:
# Import and read the NYMEX Crude Oil Historical Futures Prices data
file_path = "../Resources/raw_datasets/"
# os.path.join only inserts a separator when one is missing, so the trailing
# slash on file_path no longer produces a doubled "//" in the resulting path.
futures_price_file = os.path.join(file_path, "NYMEX_Crude_Oil_Historical_Futures_Prices.csv")

# skiprows=2 skips the first two lines of the file; the next line is read as the header row.
futures_price_df = pd.read_csv(futures_price_file, skiprows=2)
futures_price_df.head()

Unnamed: 0,Date,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)",Unnamed: 5
0,"Mar 30, 1983",,,29.35,,
1,"Mar 31, 1983",,,29.24,,
2,"Apr 04, 1983",29.44,,29.1,,
3,"Apr 05, 1983",29.71,,29.35,,
4,"Apr 06, 1983",29.92,,29.5,,


In [3]:
# List every column name that came in from the CSV
list(futures_price_df.columns)

['Date',
 'Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)',
 'Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)',
 'Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)',
 'Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)',
 'Unnamed: 5']

In [4]:
# Keep only the date and the four futures-contract price columns
columns_to_keep = [
    "Date",
    "Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)",
    "Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)",
    "Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)",
    "Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)",
]
futures_price_df = futures_price_df[columns_to_keep]

futures_price_df.head()

Unnamed: 0,Date,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)"
0,"Mar 30, 1983",,,29.35,
1,"Mar 31, 1983",,,29.24,
2,"Apr 04, 1983",29.44,,29.1,
3,"Apr 05, 1983",29.71,,29.35,
4,"Apr 06, 1983",29.92,,29.5,


In [5]:
# Check the datatype of every column to see which ones still need conversion
futures_price_df.dtypes

Date                                                             object
Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)    float64
Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)    float64
Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)    float64
Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)    float64
dtype: object

In [6]:
# Number of rows that were loaded
futures_price_df.shape[0]

9743

In [7]:
# Count the missing values in each column
futures_price_df.isna().sum()

Date                                                              1
Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)      6
Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)    443
Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)      2
Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)    443
dtype: int64

In [8]:
# Remove every row that has a missing value in at least one column
futures_price_df = futures_price_df.dropna(axis=0, how="any")

In [9]:
# Confirm that no missing values remain in any column
futures_price_df.isna().sum()

Date                                                            0
Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)    0
Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)    0
Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)    0
Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)    0
dtype: int64

In [10]:
# Convert data types: parse the "Date" strings into datetime values.
# .assign() returns a new DataFrame rather than writing a column into the
# frame produced by the earlier column selection / dropna(), which avoids
# pandas' SettingWithCopyWarning for chained assignment.
futures_price_df = futures_price_df.assign(Date=pd.to_datetime(futures_price_df["Date"]))
futures_price_df.head()

Unnamed: 0,Date,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)"
441,1985-01-02,25.92,25.81,25.69,25.63
442,1985-01-03,25.84,25.79,25.68,25.65
443,1985-01-04,25.18,25.19,25.16,25.13
444,1985-01-07,25.56,25.6,25.54,25.5
445,1985-01-08,25.48,25.51,25.49,25.46


In [11]:
# Keep only rows dated from 1986-01-01 through 2021-10-31 (both bounds inclusive)
on_or_after_start = futures_price_df["Date"] >= "1986-01-01"
on_or_before_end = futures_price_df["Date"] <= "2021-10-31"
futures_price_df = futures_price_df.loc[on_or_after_start & on_or_before_end]
futures_price_df.head()

Unnamed: 0,Date,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)"
691,1986-01-02,25.56,24.55,23.79,23.21
692,1986-01-03,25.97,25.01,24.22,23.62
693,1986-01-06,26.57,25.59,24.81,24.18
694,1986-01-07,26.2,25.2,24.38,23.75
695,1986-01-08,25.93,24.83,24.01,23.4


In [12]:
# Inspect the last five rows to confirm the upper date bound
futures_price_df.tail(5)

Unnamed: 0,Date,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)"
9688,2021-10-25,83.76,82.61,81.21,79.95
9689,2021-10-26,84.65,83.36,81.89,80.56
9690,2021-10-27,82.66,81.48,80.09,78.84
9691,2021-10-28,82.81,81.46,79.95,78.62
9692,2021-10-29,83.57,81.78,79.98,78.46


In [13]:
# Calculate the average for each month and year for all price columns.
# Build the grouping key: each row's date truncated to a monthly period (e.g. 1986-01).
avg_future_contract = futures_price_df["Date"].dt.to_period("M")

# numeric_only=True keeps the datetime "Date" column out of the aggregation.
# pandas 2.0+ no longer drops non-numeric columns silently, so without it the
# result would carry an averaged "Date" column that clashes with the "Date"
# index name when the index is later converted back into a column.
futures_price_df = futures_price_df.groupby(avg_future_contract).mean(numeric_only=True)
futures_price_df.head()

Unnamed: 0_level_0,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)"
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1986-01,22.976818,22.37,21.935909,21.663636
1986-02,15.457368,15.705263,15.831579,15.942105
1986-03,12.615,12.95,13.1185,13.2845
1986-04,12.753636,12.620909,12.568182,12.622727
1986-05,15.264286,14.755238,14.44,14.285238


In [14]:
# Round every value to one decimal place
futures_price_df = futures_price_df.round(1)
futures_price_df.head()

Unnamed: 0_level_0,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)"
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1986-01,23.0,22.4,21.9,21.7
1986-02,15.5,15.7,15.8,15.9
1986-03,12.6,13.0,13.1,13.3
1986-04,12.8,12.6,12.6,12.6
1986-05,15.3,14.8,14.4,14.3


In [15]:
# Replace the period index with string labels formatted like "Jan-1986"
month_labels = futures_price_df.index.strftime("%b-%Y")
futures_price_df.index = month_labels
futures_price_df.head()

Unnamed: 0_level_0,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)"
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan-1986,23.0,22.4,21.9,21.7
Feb-1986,15.5,15.7,15.8,15.9
Mar-1986,12.6,13.0,13.1,13.3
Apr-1986,12.8,12.6,12.6,12.6
May-1986,15.3,14.8,14.4,14.3


In [16]:
# Move the "Date" index back into a regular column
futures_price_df = futures_price_df.reset_index()
futures_price_df.head()

Unnamed: 0,Date,"Cushing, OK Crude Oil Future Contract 1 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 2 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 3 (Dollars per Barrel)","Cushing, OK Crude Oil Future Contract 4 (Dollars per Barrel)"
0,Jan-1986,23.0,22.4,21.9,21.7
1,Feb-1986,15.5,15.7,15.8,15.9
2,Mar-1986,12.6,13.0,13.1,13.3
3,Apr-1986,12.8,12.6,12.6,12.6
4,May-1986,15.3,14.8,14.4,14.3


In [17]:
# Write the cleaned DataFrame to a new CSV file, leaving out the row index
cleaned_csv_path = "../Resources/clean_datasets/Cleaned_NYMEX_Crude_Oil_Historical_Futures_Prices.csv"
futures_price_df.to_csv(cleaned_csv_path, index=False)