In [1]:
# Import dependencies
import pandas as pd
import datetime as dt

In [2]:
# Import and read the Crude Oil Historical Refinery Utilization EIA data.
# `file_path` already ends with "/", so the file name is appended to it
# without adding another separator.
file_path = "../Resources/raw_datasets/"
refinement_file = f"{file_path}Crude_Oil_Historical_Refinery_Utilization_EIA.csv"

# Skip the first two lines of the file; the following line supplies the column headers.
refinement_df = pd.read_csv(refinement_file, skiprows=2)
refinement_df.head()

Unnamed: 0,Date,U.S. Gross Inputs to Refineries (Thousand Barrels Per Day),U. S. Operable Crude Oil Distillation Capacity (Thousand Barrels per Calendar Day),U. S. Operating Crude Oil Distillation Capacity (Thousand Barrels per Day),U. S. Idle Crude Oil Distillation Capacity (Thousand Barrels per Day),U.S. Percent Utilization of Refinery Operable Capacity,Unnamed: 6
0,Jan-1985,11583.0,15659.0,14361.0,1298.0,74.0,
1,Feb-1985,11485.0,15559.0,14293.0,1266.0,73.8,
2,Mar-1985,11484.0,15582.0,14268.0,1314.0,73.7,
3,Apr-1985,11969.0,15640.0,14605.0,1035.0,76.5,
4,May-1985,12269.0,15658.0,14338.0,1320.0,78.4,


In [3]:
# List every column label that came in with the import
list(refinement_df.columns)

['Date',
 'U.S. Gross Inputs to Refineries (Thousand Barrels Per Day)',
 'U. S. Operable Crude Oil Distillation Capacity  (Thousand Barrels per Calendar Day)',
 'U. S. Operating Crude Oil Distillation Capacity  (Thousand Barrels per Day)',
 'U. S. Idle Crude Oil Distillation Capacity  (Thousand Barrels per Day)',
 'U.S. Percent Utilization of Refinery Operable Capacity',
 'Unnamed: 6']

In [4]:
# Keep only the date and the five refinery measures,
# leaving out the "Unnamed: 6" column
refinement_df = refinement_df.loc[
    :,
    [
        "Date",
        "U.S. Gross Inputs to Refineries (Thousand Barrels Per Day)",
        "U. S. Operable Crude Oil Distillation Capacity  (Thousand Barrels per Calendar Day)",
        "U. S. Operating Crude Oil Distillation Capacity  (Thousand Barrels per Day)",
        "U. S. Idle Crude Oil Distillation Capacity  (Thousand Barrels per Day)",
        "U.S. Percent Utilization of Refinery Operable Capacity",
    ]
]

refinement_df.head()

Unnamed: 0,Date,U.S. Gross Inputs to Refineries (Thousand Barrels Per Day),U. S. Operable Crude Oil Distillation Capacity (Thousand Barrels per Calendar Day),U. S. Operating Crude Oil Distillation Capacity (Thousand Barrels per Day),U. S. Idle Crude Oil Distillation Capacity (Thousand Barrels per Day),U.S. Percent Utilization of Refinery Operable Capacity
0,Jan-1985,11583.0,15659.0,14361.0,1298.0,74.0
1,Feb-1985,11485.0,15559.0,14293.0,1266.0,73.8
2,Mar-1985,11484.0,15582.0,14268.0,1314.0,73.7
3,Apr-1985,11969.0,15640.0,14605.0,1035.0,76.5
4,May-1985,12269.0,15658.0,14338.0,1320.0,78.4


In [5]:
# Rename the columns by mapping each original EIA header to a shorter label.
# Mapping by name (rather than assigning a positional list) keeps every label
# attached to the right data even if the column order changes. "Date" keeps
# its name, so it is not listed. Headers that are not present are left as-is
# (pandas' default for rename), so re-running this cell is a no-op.
refinement_df = refinement_df.rename(columns={
    "U.S. Gross Inputs to Refineries (Thousand Barrels Per Day)":
        "Gross Inputs to Refineries (1k Bar./Day)",
    "U. S. Operable Crude Oil Distillation Capacity  (Thousand Barrels per Calendar Day)":
        "Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day)",
    "U. S. Operating Crude Oil Distillation Capacity  (Thousand Barrels per Day)":
        "Operating Crude Oil Distillation Capacity (1k Bar./Day)",
    "U. S. Idle Crude Oil Distillation Capacity  (Thousand Barrels per Day)":
        "Idle Crude Oil Distillation Capacity (1k Bar./Day)",
    "U.S. Percent Utilization of Refinery Operable Capacity":
        "Percent Utilization of Refinery Operable Capacity",
})
refinement_df.head()

Unnamed: 0,Date,Gross Inputs to Refineries (1k Bar./Day),Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day),Operating Crude Oil Distillation Capacity (1k Bar./Day),Idle Crude Oil Distillation Capacity (1k Bar./Day),Percent Utilization of Refinery Operable Capacity
0,Jan-1985,11583.0,15659.0,14361.0,1298.0,74.0
1,Feb-1985,11485.0,15559.0,14293.0,1266.0,73.8
2,Mar-1985,11484.0,15582.0,14268.0,1314.0,73.7
3,Apr-1985,11969.0,15640.0,14605.0,1035.0,76.5
4,May-1985,12269.0,15658.0,14338.0,1320.0,78.4


In [6]:
# Check the datatype of each column; at this point "Date" is still an
# object (string) column and the five measures are float64
refinement_df.dtypes

Date                                                                object
Gross Inputs to Refineries (1k Bar./Day)                           float64
Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day)    float64
Operating Crude Oil Distillation Capacity (1k Bar./Day)            float64
Idle Crude Oil Distillation Capacity (1k Bar./Day)                 float64
Percent Utilization of Refinery Operable Capacity                  float64
dtype: object

In [7]:
# Count the rows that were loaded (first entry of the frame's shape)
refinement_df.shape[0]

443

In [8]:
# Count the missing values in each column
refinement_df.isnull().sum(axis=0)

Date                                                               1
Gross Inputs to Refineries (1k Bar./Day)                           1
Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day)    1
Operating Crude Oil Distillation Capacity (1k Bar./Day)            1
Idle Crude Oil Distillation Capacity (1k Bar./Day)                 1
Percent Utilization of Refinery Operable Capacity                  1
dtype: int64

In [9]:
# Remove every row that contains at least one missing value
refinement_df = refinement_df.dropna(axis=0, how="any")

In [10]:
# Confirm that no missing values remain after dropping the null rows
refinement_df.isnull().sum(axis=0)

Date                                                               0
Gross Inputs to Refineries (1k Bar./Day)                           0
Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day)    0
Operating Crude Oil Distillation Capacity (1k Bar./Day)            0
Idle Crude Oil Distillation Capacity (1k Bar./Day)                 0
Percent Utilization of Refinery Operable Capacity                  0
dtype: int64

In [11]:
# Parse the "Mon-YYYY" strings in "Date" (e.g. "Jan-1985") into datetime64 values.
# The explicit format avoids relying on pandas' format inference, and `assign`
# returns a new frame instead of writing a column into the result of dropna(),
# which can raise SettingWithCopyWarning.
refinement_df = refinement_df.assign(
    Date=pd.to_datetime(refinement_df["Date"], format="%b-%Y")
)
refinement_df.head()

Unnamed: 0,Date,Gross Inputs to Refineries (1k Bar./Day),Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day),Operating Crude Oil Distillation Capacity (1k Bar./Day),Idle Crude Oil Distillation Capacity (1k Bar./Day),Percent Utilization of Refinery Operable Capacity
0,1985-01-01,11583.0,15659.0,14361.0,1298.0,74.0
1,1985-02-01,11485.0,15559.0,14293.0,1266.0,73.8
2,1985-03-01,11484.0,15582.0,14268.0,1314.0,73.7
3,1985-04-01,11969.0,15640.0,14605.0,1035.0,76.5
4,1985-05-01,12269.0,15658.0,14338.0,1320.0,78.4


In [12]:
# Keep only the rows dated from 1986-01-01 through 2021-10-31 (both ends inclusive)
refinement_df = refinement_df.loc[
    lambda frame: (frame["Date"] >= "1986-01-01") & (frame["Date"] <= "2021-10-31")
]
refinement_df.head()

Unnamed: 0,Date,Gross Inputs to Refineries (1k Bar./Day),Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day),Operating Crude Oil Distillation Capacity (1k Bar./Day),Idle Crude Oil Distillation Capacity (1k Bar./Day),Percent Utilization of Refinery Operable Capacity
12,1986-01-01,12583.0,15459.0,14639.0,820.0,81.4
13,1986-02-01,12068.0,15485.0,14538.0,947.0,77.9
14,1986-03-01,11759.0,15485.0,14517.0,968.0,75.9
15,1986-04-01,12603.0,15473.0,14550.0,923.0,81.5
16,1986-05-01,13314.0,15484.0,14805.0,679.0,86.0


In [13]:
# Show the last five rows to confirm where the filtered range ends
refinement_df.tail(n=5)

Unnamed: 0,Date,Gross Inputs to Refineries (1k Bar./Day),Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day),Operating Crude Oil Distillation Capacity (1k Bar./Day),Idle Crude Oil Distillation Capacity (1k Bar./Day),Percent Utilization of Refinery Operable Capacity
437,2021-06-01,16743.0,18128.0,17910.0,218.0,92.4
438,2021-07-01,16482.0,18129.0,17943.0,187.0,90.9
439,2021-08-01,16377.0,18130.0,17914.0,216.0,90.3
440,2021-09-01,15797.0,18130.0,15800.0,2331.0,87.1
441,2021-10-01,15581.0,18132.0,17133.0,999.0,85.9


In [14]:
# Format the "Date" column as abbreviated Month-Year text (e.g. "Jan-1986").
# NOTE: the result is a string (object) column, not a datetime, so it no
# longer sorts or compares chronologically; "%b" uses the current locale's
# month abbreviations. `assign` builds a new frame, which avoids
# SettingWithCopyWarning from writing a column into the date-filtered frame.
refinement_df = refinement_df.assign(
    Date=pd.to_datetime(refinement_df["Date"]).dt.strftime("%b-%Y")
)
refinement_df.head()

Unnamed: 0,Date,Gross Inputs to Refineries (1k Bar./Day),Operable Crude Oil Distillation Capacity (1k Bar./Calendar Day),Operating Crude Oil Distillation Capacity (1k Bar./Day),Idle Crude Oil Distillation Capacity (1k Bar./Day),Percent Utilization of Refinery Operable Capacity
12,Jan-1986,12583.0,15459.0,14639.0,820.0,81.4
13,Feb-1986,12068.0,15485.0,14538.0,947.0,77.9
14,Mar-1986,11759.0,15485.0,14517.0,968.0,75.9
15,Apr-1986,12603.0,15473.0,14550.0,923.0,81.5
16,May-1986,13314.0,15484.0,14805.0,679.0,86.0


In [15]:
# Write the cleaned DataFrame to a new CSV file, leaving the index out
refinement_df.to_csv(
    path_or_buf="../Resources/clean_datasets/Cleaned_Crude_Oil_Historical_Refinery_Utilization_EIA.csv",
    index=False,
)