In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Import and read the Historical Crude Petroleum Product Supplied data.
# `file_path` already ends with a path separator, so the file name is appended
# directly to it; inserting another "/" would yield a doubled "//" in the path.
file_path = "../Resources/raw_datasets/"
product_supplied_file = f"{file_path}Historical_Crude_Petroleum_Product_Supplied.csv"

# skiprows=2 skips the first two lines of the file, so the third line is read
# as the column-header row.
product_supplied_df = pd.read_csv(product_supplied_file, skiprows=2)
product_supplied_df.head()

Unnamed: 0,Date,U.S. Product Supplied of Crude Oil and Petroleum Products (Thousand Barrels),U.S. Product Supplied of Crude Oil (Thousand Barrels),U.S. Product Supplied of Hydrocarbon Gas Liquids (Thousand Barrels),U.S. Product Supplied of Natural Gas Liquids (Thousand Barrels),U.S. Product Supplied of Ethane (Thousand Barrels),U.S. Product Supplied of Propane (Thousand Barrels),U.S. Product Supplied of Normal Butane (Thousand Barrels),U.S. Product Supplied of Isobutane (Thousand Barrels),U.S. Product Supplied of Natural Gasoline (Thousand Barrels),...,U.S. Product Supplied of Other Oils for Petrochemical Feedstock Use (Thousand Barrels),U.S. Product Supplied of Special Naphthas (Thousand Barrels),U.S. Product Supplied of Lubricants (Thousand Barrels),U.S. Product Supplied of Waxes (Thousand Barrels),U.S. Product Supplied of Petroleum Coke (Thousand Barrels),U.S. Product Supplied of Petroleum Coke Marketable (Thousand Barrels),U.S. Product Supplied of Petroleum Coke Catalyst (Thousand Barrels),U.S. Product Supplied of Asphalt and Road Oil (Thousand Barrels),U.S. Product Supplied of Still Gas (Thousand Barrels),U.S. Product Supplied of Miscellaneous Petroleum Products (Thousand Barrels)
0,Jan-1936,,,,,,,,,,...,,,,,,,,,,
1,Feb-1936,,,,,,,,,,...,,,,,,,,,,
2,Mar-1936,,,,,,,,,,...,,,,,,,,,,
3,Apr-1936,,,,,,,,,,...,,,,,,,,,,
4,May-1936,,,,,,,,,,...,,,,,,,,,,


In [3]:
# List every column label that came in from the raw file
list(product_supplied_df.columns)

['Date',
 'U.S. Product Supplied of Crude Oil and Petroleum Products (Thousand Barrels)',
 'U.S. Product Supplied of Crude Oil (Thousand Barrels)',
 'U.S. Product Supplied of Hydrocarbon Gas Liquids (Thousand Barrels)',
 'U.S. Product Supplied of Natural Gas Liquids (Thousand Barrels)',
 'U.S. Product Supplied of Ethane (Thousand Barrels)',
 'U.S. Product Supplied of Propane (Thousand Barrels)',
 'U.S. Product Supplied of Normal Butane (Thousand Barrels)',
 'U.S. Product Supplied of Isobutane (Thousand Barrels)',
 'U.S. Product Supplied of Natural Gasoline (Thousand Barrels)',
 'U.S. Product Supplied of Refinery Olefins (Thousand Barrels)',
 'U.S. Product Supplied of Ethylene (Thousand Barrels)',
 'U.S. Product Supplied of Propylene (Thousand Barrels)',
 'U.S. Product Supplied of Normal Butylene (Thousand Barrels)',
 'U.S. Product Supplied of Isobutylene (Thousand Barrels)',
 'U.S. Product Supplied of Pentanes Plus (Thousand Barrels)',
 'U.S. Product Supplied of Liquified Petroleum Gas

In [4]:
# Keep only the date column and the total product-supplied column;
# every other product-level column is left out of the result.
product_supplied_df = product_supplied_df.loc[
    :,
    [
        "Date",
        "U.S. Product Supplied of Crude Oil and Petroleum Products (Thousand Barrels)",
    ],
]
product_supplied_df.head()

Unnamed: 0,Date,U.S. Product Supplied of Crude Oil and Petroleum Products (Thousand Barrels)
0,Jan-1936,
1,Feb-1936,
2,Mar-1936,
3,Apr-1936,
4,May-1936,


In [5]:
# Shorten the total product-supplied column label; "Date" keeps its name.
product_supplied_df = product_supplied_df.rename(
    columns={
        "U.S. Product Supplied of Crude Oil and Petroleum Products (Thousand Barrels)":
            "Product Supplied of Crude Oil and Petroleum Products (1k Bar.)",
    }
)
product_supplied_df.head()

Unnamed: 0,Date,Product Supplied of Crude Oil and Petroleum Products (1k Bar.)
0,Jan-1936,
1,Feb-1936,
2,Mar-1936,
3,Apr-1936,
4,May-1936,


In [6]:
# Check the datatype of each column before any type conversion is applied
# (at this point "Date" is still text and has not been parsed into datetimes).
product_supplied_df.dtypes

Date                                                               object
Product Supplied of Crude Oil and Petroleum Products (1k Bar.)    float64
dtype: object

In [7]:
# Number of rows that were read in (first element of the frame's shape)
product_supplied_df.shape[0]

1031

In [8]:
# Count the missing values in each column
product_supplied_df.isna().sum()

Date                                                                1
Product Supplied of Crude Oil and Petroleum Products (1k Bar.)    541
dtype: int64

In [9]:
# Remove every row that has a missing value in any column
product_supplied_df = product_supplied_df.dropna(axis="index", how="any")

In [10]:
# Re-count missing values per column to confirm that none remain after the drop
product_supplied_df.isna().sum()

Date                                                              0
Product Supplied of Crude Oil and Petroleum Products (1k Bar.)    0
dtype: int64

In [11]:
# Convert the "Date" column from text such as "Jan-1936" to datetime values.
# The format is given explicitly (%b = abbreviated month name, %Y = four-digit
# year) so parsing does not depend on pandas' format inference; with no day in
# the text, each value is parsed as the first day of its month.
# `assign` returns a new frame rather than setting a column on a frame that
# was produced by filtering rows.
product_supplied_df = product_supplied_df.assign(
    Date=pd.to_datetime(product_supplied_df["Date"], format="%b-%Y")
)
product_supplied_df.head()

Unnamed: 0,Date,Product Supplied of Crude Oil and Petroleum Products (1k Bar.)
540,1981-01-01,571321.0
541,1981-02-01,475689.0
542,1981-03-01,493111.0
543,1981-04-01,460490.0
544,1981-05-01,475943.0


In [12]:
# Keep only the rows whose "Date" lies between 1986-01-01 and 2021-10-31,
# with both end points included.
product_supplied_df = product_supplied_df.loc[
    lambda frame: (frame["Date"] >= "1986-01-01") & (frame["Date"] <= "2021-10-31")
]
product_supplied_df.head()

Unnamed: 0,Date,Product Supplied of Crude Oil and Petroleum Products (1k Bar.)
600,1986-01-01,498728.0
601,1986-02-01,453209.0
602,1986-03-01,504565.0
603,1986-04-01,478339.0
604,1986-05-01,495789.0


In [13]:
# Show the last five rows to check the upper end of the date filter
product_supplied_df.tail(n=5)

Unnamed: 0,Date,Product Supplied of Crude Oil and Petroleum Products (1k Bar.)
1025,2021-06-01,616115.0
1026,2021-07-01,616714.0
1027,2021-08-01,635828.0
1028,2021-09-01,606706.0
1029,2021-10-01,616639.0


In [14]:
# Render "Date" as abbreviated "Mon-YYYY" text (e.g. "Jan-1986").
# After this step the column holds strings, not datetime values.
product_supplied_df = product_supplied_df.assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]).dt.strftime("%b-%Y")
)
product_supplied_df.head()

Unnamed: 0,Date,Product Supplied of Crude Oil and Petroleum Products (1k Bar.)
600,Jan-1986,498728.0
601,Feb-1986,453209.0
602,Mar-1986,504565.0
603,Apr-1986,478339.0
604,May-1986,495789.0


In [15]:
# Write the cleaned DataFrame to a new CSV file; index=False leaves the row
# index out of the file.
product_supplied_df.to_csv(
    path_or_buf="../Resources/clean_datasets/Cleaned_Historical_Crude_Petroleum_Product_Supplied.csv",
    index=False,
)