In [1]:
from usagrid import s3
import pandas as pd
from pathlib import Path

# Display floats with thousands separators and two decimals, padded to width 20.
pd.options.display.float_format = '{:20,.2f}'.format

# Listing returned by the project helper for the power-plant landing folder
# of the "usagrid" bucket.
paths = s3.list_files_in_folder("usagrid", "landingarea/powerplant")

# Keep only the entries whose name has a file extension
# (presumably this drops folder placeholders — TODO confirm).
files = [
    p
    for p in paths
    if Path(p).suffix
]

# Read each remaining entry from the bucket and convert it to a pandas frame.
dfs = [
    s3.read_pyarrow_df_from_s3("usagrid", f).to_pandas()
    for f in files
]

# Stack the per-file frames row-wise; each frame keeps its own index labels.
df = pd.concat(dfs)

In [2]:
def set_dtypes(df):
    """Return a new frame with the power-plant columns cast to fixed dtypes.

    ``period`` becomes ``datetime64[ns]``, the measurement columns become
    ``float64`` and the identifier/description/unit columns become ``object``.
    Columns not named here are carried over unchanged, and the input frame is
    left unmodified.

    Raises:
        KeyError: if any of the expected columns is missing from ``df``.
    """
    text_columns = [
        'plantCode',
        'plantName',
        'fuel2002',
        'fuelTypeDescription',
        'state',
        'stateDescription',
        'primeMover',
        'average-heat-content-units',
        'consumption-for-eg-units',
        'total-consumption-units',
        'consumption-for-eg-btu-units',
        'generation-units',
        'gross-generation-units',
        'total-consumption-btu-units',
    ]
    numeric_columns = [
        'average-heat-content',
        'consumption-for-eg',
        'consumption-for-eg-btu',
        'generation',
        'gross-generation',
        'total-consumption',
        'total-consumption-btu',
    ]

    # One column -> dtype mapping, applied in a single astype call
    # (astype returns a new frame rather than modifying its input).
    schema = {'period': "datetime64[ns]"}
    schema.update(dict.fromkeys(text_columns, "object"))
    schema.update(dict.fromkeys(numeric_columns, "float64"))

    return df.astype(schema)

def filter_out_ALL_fuel(df):
    """Return the rows of ``df`` whose ``fuel2002`` value is not ``"ALL"``."""
    not_all_fuels = 'fuel2002 != "ALL"'
    return df.query(not_all_fuels)

def filter_out_ALL_prime_movers(df):
    """Return the rows of ``df`` whose ``primeMover`` value is not ``"ALL"``."""
    not_all_movers = 'primeMover != "ALL"'
    return df.query(not_all_movers)


def extract_year(df):
    """Return a copy of ``df`` with a ``year`` column taken from ``period``.

    ``period`` must already be a datetime column, since the ``.dt`` accessor
    is used to read the year.
    """
    years = df.period.dt.year
    return df.assign(year=years)

In [17]:
# Cast the concatenated frame to the expected dtypes, then add the `year` column.
modified = (
    df
    .pipe(set_dtypes)
    .pipe(extract_year)
)

# The output goes in the same folder as the first input file, with "landingarea"
# replaced by "bronze" in the key.
# `.parent` gives that folder without slicing `Path.parents` (slice support needs
# Python 3.10+), and `.as_posix()` keeps "/" separators in the key on every OS.
# NOTE(review): "powperplants" looks like a typo for "powerplants"; left unchanged
# because renaming the object would break anything reading the existing key.
write_path = (
    Path(files[0]).parent
    .joinpath("powperplants_concat.arrow")
    .as_posix()
    .replace("landingarea", "bronze")
)

s3.write_data_to_s3_pyarrow(
    bucket_name="usagrid",
    object_key=write_path,
    data=modified,
)