In [1]:
# Import dependencies
import pandas as pd
import datetime as dt

In [2]:
# Load the raw EIA crude-oil historical production CSV.
# `file_path` already ends with "/", so the file name is appended directly;
# inserting another "/" would build a ".../raw_datasets//Crude_..." path.
file_path = "../Resources/raw_datasets/"
production_file = f"{file_path}Crude_Oil_Historical_Production_EIA.csv"

# skiprows=2 discards the first two lines of the file, so the third line
# supplies the column headers.
production_df = pd.read_csv(production_file, skiprows=2)
production_df.head()

Unnamed: 0,Date,U.S. Field Production of Crude Oil (Thousand Barrels),East Coast (PADD 1) Field Production of Crude Oil (Thousand Barrels),Florida Field Production of Crude Oil (Thousand Barrels),New York Field Production of Crude Oil (Thousand Barrels),Pennsylvania Field Production of Crude Oil (Thousand Barrels),Virginia Field Production of Crude Oil (Thousand Barrels),West Virginia Field Production of Crude Oil (Thousand Barrels),Midwest (PADD 2) Field Production of Crude Oil (Thousand Barrels),Illinois Field Production of Crude Oil (Thousand Barrels),...,Wyoming Field Production of Crude Oil (Thousand Barrels),West Coast (PADD 5) Field Production of Crude Oil (Thousand Barrels),Alaska Field Production of Crude Oil (Thousand Barrels),Alaska South Field Production of Crude Oil (Thousand Barrels),Alaska North Slope Crude Oil Production (Thousand Barrels),Arizona Field Production of Crude Oil (Thousand Barrels),California Field Production of Crude Oil (Thousand Barrels),Nevada Field Production of Crude Oil (Thousand Barrels),Federal Offshore PADD 5 Field Production of Crude Oil (Thousand Barrels),Unnamed: 43
0,Jan-1920,34008.0,,,,,,,,,...,,,,,,,,,,
1,Feb-1920,33193.0,,,,,,,,,...,,,,,,,,,,
2,Mar-1920,36171.0,,,,,,,,,...,,,,,,,,,,
3,Apr-1920,34945.0,,,,,,,,,...,,,,,,,,,,
4,May-1920,36622.0,,,,,,,,,...,,,,,,,,,,


In [3]:
# List every column header that came in from the raw file
list(production_df.columns)

['Date',
 'U.S. Field Production of Crude Oil (Thousand Barrels)',
 'East Coast (PADD 1) Field Production of Crude Oil (Thousand Barrels)',
 'Florida Field Production of Crude Oil (Thousand Barrels)',
 'New York Field Production of Crude Oil (Thousand Barrels)',
 'Pennsylvania Field Production of Crude Oil (Thousand Barrels)',
 'Virginia Field Production of Crude Oil (Thousand Barrels)',
 'West Virginia Field Production of Crude Oil (Thousand Barrels)',
 'Midwest (PADD 2) Field Production of Crude Oil (Thousand Barrels)',
 'Illinois Field Production of Crude Oil (Thousand Barrels)',
 'Indiana Field Production of Crude Oil (Thousand Barrels)',
 'Kansas Field Production of Crude Oil (Thousand Barrels)',
 'Kentucky Field Production of Crude Oil (Thousand Barrels)',
 'Michigan Field Production of Crude Oil (Thousand Barrels)',
 'Missouri Field Production of Crude Oil (Thousand Barrels)',
 'Nebraska Field Production of Crude Oil (Thousand Barrels)',
 'North Dakota Field Production of Crude 

In [4]:
# Narrow the frame to the date and the U.S. production column
columns_to_keep = ["Date", "U.S. Field Production of Crude Oil (Thousand Barrels)"]
production_df = production_df.loc[:, columns_to_keep]
production_df.head()

Unnamed: 0,Date,U.S. Field Production of Crude Oil (Thousand Barrels)
0,Jan-1920,34008.0
1,Feb-1920,33193.0
2,Mar-1920,36171.0
3,Apr-1920,34945.0
4,May-1920,36622.0


In [5]:
# Replace the headers positionally: "Date" is unchanged, the production
# column gets a shorter unit suffix
renamed_columns = [
    "Date",
    "U.S. Field Production of Crude Oil (1k Bar.)",
]
production_df.columns = renamed_columns
production_df.head()

Unnamed: 0,Date,U.S. Field Production of Crude Oil (1k Bar.)
0,Jan-1920,34008.0
1,Feb-1920,33193.0
2,Mar-1920,36171.0
3,Apr-1920,34945.0
4,May-1920,36622.0


In [6]:
# Check the datatypes of both columns. In the recorded output "Date" is still
# an object (string) column here and the production column is float64.
production_df.dtypes

Date                                             object
U.S. Field Production of Crude Oil (1k Bar.)    float64
dtype: object

In [7]:
# Count the rows that were loaded
production_df.shape[0]

1223

In [8]:
# Count missing values per column
production_df.isna().sum()

Date                                            1
U.S. Field Production of Crude Oil (1k Bar.)    1
dtype: int64

In [9]:
# Remove every row that has a missing value in any column
production_df = production_df.dropna(axis="index", how="any")

In [10]:
# Re-count missing values per column to confirm none remain after the drop
production_df.isna().sum()

Date                                            0
U.S. Field Production of Crude Oil (1k Bar.)    0
dtype: int64

In [11]:
# Parse the "Mon-YYYY" strings (e.g. "Jan-1920") in "Date" into datetimes.
# An explicit format removes any reliance on pandas guessing the layout.
# `assign` builds a new frame instead of writing a column into the result of
# `dropna`, which can raise pandas' SettingWithCopyWarning.
production_df = production_df.assign(
    Date=pd.to_datetime(production_df["Date"], format="%b-%Y")
)
production_df.head()

Unnamed: 0,Date,U.S. Field Production of Crude Oil (1k Bar.)
0,1920-01-01,34008.0
1,1920-02-01,33193.0
2,1920-03-01,36171.0
3,1920-04-01,34945.0
4,1920-05-01,36622.0


In [12]:
# Keep only the rows whose date falls inside the window (both ends inclusive)
start_date = "1986-01-01"
end_date = "2021-10-31"
on_or_after_start = production_df["Date"] >= start_date
on_or_before_end = production_df["Date"] <= end_date
production_df = production_df[on_or_after_start & on_or_before_end]
production_df.head()

Unnamed: 0,Date,U.S. Field Production of Crude Oil (1k Bar.)
792,1986-01-01,283248.0
793,1986-02-01,256855.0
794,1986-03-01,279413.0
795,1986-04-01,265917.0
796,1986-05-01,273964.0


In [13]:
# Show the last rows to confirm where the filtered date range ends
production_df.tail()

Unnamed: 0,Date,U.S. Field Production of Crude Oil (1k Bar.)
1217,2021-06-01,338645.0
1218,2021-07-01,351228.0
1219,2021-08-01,347393.0
1220,2021-09-01,324654.0
1221,2021-10-01,355670.0


In [14]:
# Format the "Date" column as abbreviated Month-Year text (e.g. "Jan-1986").
# Note: after this step "Date" holds strings, not datetimes.
# `assign` returns a new frame rather than writing a column into the
# boolean-filtered frame, which can raise pandas' SettingWithCopyWarning.
production_df = production_df.assign(
    Date=pd.to_datetime(production_df["Date"]).dt.strftime("%b-%Y")
)
production_df.head()

Unnamed: 0,Date,U.S. Field Production of Crude Oil (1k Bar.)
792,Jan-1986,283248.0
793,Feb-1986,256855.0
794,Mar-1986,279413.0
795,Apr-1986,265917.0
796,May-1986,273964.0


In [15]:
# Write the cleaned frame to the clean_datasets folder, leaving out the index
output_file = "../Resources/clean_datasets/Cleaned_Crude_Oil_Historical_Production.csv"
production_df.to_csv(output_file, index=False)