# 2022-54 OPIS Fuel Price Data

Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={c0fd7f23-7faa-4f2e-b7fb-79e477e131a6}&action=edit&wd=target%282022-41.one%7C5c70225e-b636-4090-a946-7545acf4abc3%2FTest%20Plan%7C683def87-dc70-4863-8cad-59d9ef21bd00%2F%29

In [1]:
import datetime

# Note, this notebook will be opening Excel spreadsheets, which means that pandas requires openpyxl 
# This can be installed in your environment using "pip install openpyxl"
import pandas as pd
import sqlalchemy as sql

from pathlib import Path

# SQLAlchemy engine for the dpoe_stage database on server DDAMWSQL16 (mssql dialect, pymssql
# driver). It is handed to download_SQL_data() further down in the notebook.
ddam = sql.create_engine('mssql+pymssql://DDAMWSQL16/dpoe_stage')

## Download Data

In [2]:
def download_raw_data(user):
    """
    Load the four raw data files into dataframes. Note that copies of these files were put into
    SharePoint; they are read here from a folder under "C:/Users/{user}/".
    Also note that the file "San Diego County Retail Margin History.csv" is not actually formatted
    as a csv file in the sense that the delimiter is "|" instead of ","

    :param user:    The user name used to build the "C:/Users/{user}/..." data folder path. This
                    is mostly here so that it is easy for anyone to run the code
    :returns:       Tuple containing four dataframes. In order, data contained comes from the file:
                    "San Diego County Retail Margin History.csv"
                    "OPIS_FUEL_010105to123113.xlsx"
                    "San Diego County Monthly Jan 2018-May 2019.xlsx"
                    "SDAG_OPIS retail margin history_020618.xlsx"
    :raises ValueError: If a listed file has an extension other than ".csv" or ".xlsx". Without
                        this the file would be silently skipped and the returned tuple would be
                        shorter than the four values callers unpack
    """
    # The folder where raw data is stored
    data_dir = Path(f"C:/Users/{user}/San Diego Association of Governments/SANDAG QA QC - Documents/Service Requests/2022/2022-54 OPIS Fuel Price Data QC/data/")

    # The four raw data files we are getting. The order here is the order of the returned tuple
    raw_files = [
        Path("San Diego County Retail Margin History.csv"),
        Path("OPIS_FUEL_010105to123113.xlsx"),
        Path("San Diego County Monthly Jan 2018-May 2019.xlsx"),
        Path("SDAG_OPIS retail margin history_020618.xlsx"),
    ]

    # Read each file with the reader matching its extension
    raw_data = []
    for file in raw_files:
        if file.suffix == ".csv":
            # Despite the extension, the file is pipe-delimited
            raw_data.append(pd.read_csv(data_dir / file, sep="|"))
        elif file.suffix == ".xlsx":
            raw_data.append(pd.read_excel(data_dir / file))
        else:
            # Fail loudly instead of returning fewer dataframes than documented
            raise ValueError(f"Unsupported raw data file type: {file}")

    # Return the four raw data files in tuple format
    return tuple(raw_data)

In [3]:
def download_SQL_data(connection):
    """
    Fetch both fuel price tables in full and return them joined into a single dataframe

    :param connection:  sqlalchemy connection to DDAMWSQL16/dpoe_stage
    :returns:           One dataframe holding the rows of
                        [dpoe_stage].[fuel_price_opis].[price_fact]
                        matched on "date_id" to the rows of
                        [dpoe_stage].[fuel_price_opis].[date_dim]
                        with the "date_code" column converted to datetime.date values
    """
    price_fact_query = """
        SELECT * FROM [dpoe_stage].[fuel_price_opis].[price_fact]
        """
    date_dim_query = """
        SELECT * FROM [dpoe_stage].[fuel_price_opis].[date_dim]
        """

    # Both tables are small, so each one is pulled down whole and kept in memory
    price_fact = pd.read_sql_query(price_fact_query, con=connection)
    date_dim = pd.read_sql_query(date_dim_query, con=connection)

    # The join is done in pandas rather than in SQL, using the shared "date_id" column
    combined = pd.merge(price_fact, date_dim, on="date_id")

    # Parse "date_code" and keep only the calendar date so it can be compared against
    # datetime.date values
    parsed_dates = pd.to_datetime(combined["date_code"])
    combined["date_code"] = parsed_dates.dt.date

    return combined

## Test Function(s)

In [4]:
def test(raw_df, sql_df):
    """
    Placeholder for the checks specified in the test plan (there should be a link to the test
    plan at the top of this notebook). The checks are not implemented yet: the function currently
    does nothing. The intent is for it to print test results directly (and the differences if a
    test fails) rather than return them.

    :param raw_df:  The df containing raw data
    :param sql_df:  The df containing SQL data. It is assumed that this df has already been
                    transformed, outside of this function, into the correct format with the
                    correct columns
    :returns:       None
    """
    # TODO: implement the comparisons from the test plan
    return

## Running Tests

In [5]:
# Get the raw data and sql data
# The four dataframes are unpacked in the order documented by download_raw_data:
#   cost_2019_2020  <- "San Diego County Retail Margin History.csv"
#   OPIS_2005_2013  <- "OPIS_FUEL_010105to123113.xlsx"
#   cost_2018_2019  <- "San Diego County Monthly Jan 2018-May 2019.xlsx"
#   OPIS_2014_2017  <- "SDAG_OPIS retail margin history_020618.xlsx"
# NOTE(review): "eli" is inserted into the "C:/Users/{user}/..." data path; change it to your own
# user name when running this on another machine
cost_2019_2020, OPIS_2005_2013, cost_2018_2019, OPIS_2014_2017 = download_raw_data("eli")
# price_fact joined to date_dim, downloaded through the ddam engine
sql_data = download_SQL_data(ddam)

In [6]:
# Tests on the file:
# "R:\DPOE\Fuel Price\OPIS\2020\Source\San Diego County Retail Margin History.csv"

# Reshape the SQL data so it can be compared against the file above

# Date window used for this comparison, inclusive on both ends (year, month, day)
start_date = datetime.date(2019, 7, 1)
end_date = datetime.date(2020, 7, 1)

# Work on a deep copy so the original SQL download is left unchanged, then keep only the rows
# whose "date_code" falls inside the window
sql_cost_2019_2020 = sql_data.copy(deep=True)
sql_cost_2019_2020 = sql_cost_2019_2020[
    sql_cost_2019_2020["date_code"].between(start_date, end_date)
]
sql_cost_2019_2020

# Get the correct columns


# Run tests
# test(cost_2019_2020, sql_cost_2019_2020)

Unnamed: 0,opis_id,region,date_id,product,station_count,retail_avg,wholesale_avg,tax_avg,freight_avg,margin_avg,net_avg,date_code,yr,mnth,dy,qtr,season


In [7]:
# Tests on the file:
# "R:\DPOE\Fuel Price\OPIS\2019\Source\OPIS_FUEL_010105to123113.xlsx"  

In [8]:
# Tests on the file:
# "R:\DPOE\Fuel Price\OPIS\2019\Source\San Diego County Monthly Jan 2018-May 2019.xlsx"   

In [9]:
# Tests on the file:
# "R:\DPOE\Fuel Price\OPIS\2019\Source\SDAG_OPIS retail margin history_020618.xlsx"  