In [65]:
# This cell loops through all AEO price files (AEO21*.xlsx) in the working folder and builds one harmonized dataframe
# with all prices. For each Excel file it splits the data into three dataframes (row keys, fuel names, yearly prices),
# derives the Region / Currency / Sector fields from the row keys, and concatenates the pieces back together.

# AEO Price File link: https://www.eia.gov/outlooks/aeo/tables_ref.php
#  -- see "Energy Prices by Sector", tables 3.1 to 3.9


# Field explanations
# Region

# Maps AEO region codes to division names: the i-th name below gets the code 'PRC00i' (PRC001 .. PRC009).
Region_dict = {
    f'PRC{position:03d}': division
    for position, division in enumerate(
        (
            'New England',
            'Middle Atlantic',
            'East North Central',
            'West North Central',
            'South Atlantic',
            'East South Central',
            'West South Central',
            'Mountain',
            'Pacific',
        ),
        start=1,
    )
}

#PRC001: 'New England'
#PRC002: 'Middle Atlantic'
#PRC003: 'East North Central'
#PRC004: 'West North Central'
#PRC005: 'South Atlantic'
#PRC006: 'East South Central'
#PRC007: 'West South Central'
#PRC008: 'Mountain'
#PRC009: 'Pacific'

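# Currency
#real: real prices (assigned to every row whose key does not carry the 'nom' tag)
#nom: nominal prices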

# Sector
#ba: Residential real
#ca: Commercial real
#da: Industrial real
#ea: Transportation real
#ga: Electric Power real
#ha: Average real price to all users    
#R: Residential nominal
#C: Commercial nominal
#I: Industrial nominal
#T: Transportation nominal
#E: Electric power nominal
#Avg: Average price to all users nominal




# for reading the excel files
import glob
import pandas as pd
import numpy as np

from pandas import Series, DataFrame

# Lookups for regions

# Not used anywhere in this cell; kept so that any other cell referring to it keeps working.
Region_list = ()

# Glob pattern (relative to the working directory) selecting the AEO price workbooks to load
AEO_PRICE_FILE_PATTERN = 'AEO21*.xlsx'


def _split_unique_id(unique_id):
    """Turn AEO row keys into 'Region', 'Currency' and 'Sector' columns.

    Each key is split on ':' and '_'. When the second token is 'nom' the row is
    nominal and its sector is the third token; otherwise the row is labelled
    'real' and its sector is the second token.

    Parameters
    ----------
    unique_id : pandas.Series
        String keys taken from the 'UniqueID' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Region', 'Currency', 'Sector'. Real rows come first, then
        nominal rows; every row keeps its original index label.
    """
    # reindex guarantees tokens 0..2 exist even when no key in the file splits that far
    tokens = unique_id.str.split(":|_", n=3, expand=True).reindex(columns=range(3))
    is_nominal = tokens[1] == 'nom'

    # rename() returns a new frame, so the insert below never touches a view of `tokens`
    real_keys = tokens.loc[~is_nominal, [0, 1]].rename(columns={0: 'Region', 1: 'Sector'})
    real_keys.insert(1, 'Currency', 'real')

    nominal_keys = tokens.loc[is_nominal, [0, 1, 2]].rename(
        columns={0: 'Region', 1: 'Currency', 2: 'Sector'}
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    return pd.concat([real_keys, nominal_keys])


def _load_aeo_price_file(path):
    """Read one AEO price workbook and return its harmonized price table.

    Parameters
    ----------
    path : str
        Path of the Excel file to read.

    Returns
    -------
    pandas.DataFrame
        'Region', 'Currency', 'Sector', 'Fuel' followed by the price columns
        converted to float64.
    """
    # NOTE(review): the original call also passed skiplines=16. That is not a read_excel
    # parameter (current pandas raises TypeError on it) and the year-named columns in the
    # saved output suggest it was silently ignored, so it is dropped here rather than
    # replaced by skiprows=16 -- confirm against a real workbook.
    raw = pd.read_excel(path, na_values='- -')

    # Keep only fully populated rows, then drop the first remaining row and the last column.
    # NOTE(review): presumably a repeated header row and a trailing summary column -- confirm.
    table = raw.dropna().reset_index().iloc[1:, :-1]

    # After reset_index(), column 0 is the old row index, column 1 the row key, column 2 the fuel name
    table = table.rename(columns={table.columns[1]: 'UniqueID', table.columns[2]: 'Fuel'})

    keys = _split_unique_id(table['UniqueID'])
    fuel = table['Fuel']
    # astype returns a new frame; the result has to be kept for the conversion to take effect
    prices = table.iloc[:, 3:].astype(np.float64)

    # Column-wise concat aligns the three pieces on the shared row index
    return pd.concat([keys, fuel, prices], axis=1)


# Sorted so that the row order of the combined dataframe does not depend on the file system
price_files = sorted(glob.glob(AEO_PRICE_FILE_PATTERN))
if not price_files:
    raise FileNotFoundError(
        f"No AEO price files matching {AEO_PRICE_FILE_PATTERN!r} found in the working directory"
    )

# Build the combined dataframe in one step so that re-running the cell starts from scratch
DF_AEOprices = pd.concat([_load_aeo_price_file(path) for path in price_files]).reset_index()

# Assign the result back: an in-place replace on a column selection does not reliably update the dataframe
DF_AEOprices['Region'] = DF_AEOprices['Region'].replace(Region_dict)
DF_AEOprices.head()


Unnamed: 0,index,Region,Currency,Sector,Fuel,2020,2021,2022,2023,2024,...,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,1,East North Central,real,ba,Distillate Fuel Oil,14.7832,14.1076,14.3432,14.860775,15.0048,...,18.746496,18.904196,19.246132,19.26008,19.291626,19.589476,19.676483,19.676479,19.816244,19.746124
1,2,East North Central,real,ba,Natural Gas,7.74562,8.2971,8.08272,8.079117,8.01765,...,9.403335,9.408748,9.411675,9.409809,9.436717,9.447354,9.485736,9.519991,9.583446,9.629267
2,3,East North Central,real,ba,Electricity,37.0477,37.7227,38.1454,38.132877,37.9005,...,35.146564,35.035698,34.831573,34.642075,34.513695,34.405235,34.250187,34.101852,33.855801,33.556526
3,4,East North Central,real,ca,Propane,11.7056,12.5412,13.2244,13.506413,13.902,...,17.678133,17.790768,17.965965,18.073204,18.054075,18.256695,18.443087,18.532257,18.618753,18.705418
4,5,East North Central,real,ca,Distillate Fuel Oil,16.3728,16.1542,16.0197,16.543488,16.3388,...,19.022861,19.166958,19.440187,19.440559,19.462996,19.762249,19.874834,19.858763,19.983566,19.943867


In [55]:
# Shows the first rows of DF_AEOprices; this cell does not build or query anything itself.
# NOTE(review): the saved output below still lists raw region codes (e.g. PRC003) and this cell's execution
# count (55) is lower than the build cell's (65), so the output looks stale -- re-run after the build cell.
# TODO: the original note here read "Creates dataframe by querying needed prices"; that query is not implemented.

DF_AEOprices.head()

Unnamed: 0,index,Region,Currency,Sector,Fuel,2020,2021,2022,2023,2024,...,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,1,PRC003,real,ba,Distillate Fuel Oil,14.7832,14.1076,14.3432,14.860775,15.0048,...,18.746496,18.904196,19.246132,19.26008,19.291626,19.589476,19.676483,19.676479,19.816244,19.746124
1,2,PRC003,real,ba,Natural Gas,7.74562,8.2971,8.08272,8.079117,8.01765,...,9.403335,9.408748,9.411675,9.409809,9.436717,9.447354,9.485736,9.519991,9.583446,9.629267
2,3,PRC003,real,ba,Electricity,37.0477,37.7227,38.1454,38.132877,37.9005,...,35.146564,35.035698,34.831573,34.642075,34.513695,34.405235,34.250187,34.101852,33.855801,33.556526
3,4,PRC003,real,ca,Propane,11.7056,12.5412,13.2244,13.506413,13.902,...,17.678133,17.790768,17.965965,18.073204,18.054075,18.256695,18.443087,18.532257,18.618753,18.705418
4,5,PRC003,real,ca,Distillate Fuel Oil,16.3728,16.1542,16.0197,16.543488,16.3388,...,19.022861,19.166958,19.440187,19.440559,19.462996,19.762249,19.874834,19.858763,19.983566,19.943867
