# 2022-54 OPIS Fuel Price Data

Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={c0fd7f23-7faa-4f2e-b7fb-79e477e131a6}&action=edit&wd=target%282022-41.one%7C5c70225e-b636-4090-a946-7545acf4abc3%2FTest%20Plan%7C683def87-dc70-4863-8cad-59d9ef21bd00%2F%29

In [1]:
import pandas as pd
import sqlalchemy as sql

from pathlib import Path

# SQLAlchemy engine for the "dpoe_stage" database on server DDAMWSQL16, using the pymssql driver.
# NOTE(review): no credentials appear in the URL -- presumably this relies on integrated/Windows
# authentication; confirm before running on another machine.
ddam = sql.create_engine('mssql+pymssql://DDAMWSQL16/dpoe_stage')

## Download Data

In [2]:
def download_raw_data(user):
    """
    Read the two raw OPIS Excel workbooks from the SharePoint document folder under the given
    user's "C:/Users/<user>" profile (copies of the source files were placed in SharePoint).

    :param user:    Name of the profile folder under "C:/Users" that holds the SharePoint
                    documents. Passing it in lets anyone run the notebook from their own machine
    :returns:       Tuple containing two dataframes, one per workbook, in this order:
                    "Copy of San Diego Association of Governments.xlsx"
                    "Copy of SanDiegoCountyJune2019.xlsx"
    """
    # Local folder holding the raw workbooks
    data_folder = Path(f"C:/Users/{user}/San Diego Association of Governments/SANDAG QA QC - Documents/Service Requests/2022/2022-54 OPIS Fuel Price Data QC/data/")

    # Workbook file names, in the order they are returned
    workbook_names = (
        "Copy of San Diego Association of Governments.xlsx",
        "Copy of SanDiegoCountyJune2019.xlsx",
    )

    # Both files are Excel workbooks, so each one is read the same way (default sheet, default
    # read_excel options)
    return tuple(pd.read_excel(data_folder / Path(name)) for name in workbook_names)

In [3]:
def download_SQL_data(connection):
    """
    Download the fuel price fact table and its date dimension, joined into a single dataframe.

    :param connection:  sqlalchemy connection to DDAMWSQL16/dpoe_stage
    :returns:           One dataframe containing the rows of
                        [dpoe_stage].[fuel_price_opis].[price_fact]
                        joined to
                        [dpoe_stage].[fuel_price_opis].[date_dim]
                        on "date_id". The join is pandas' default (inner) merge, so fact rows
                        without a matching date_dim row are not returned. The "date_code" column
                        holds plain Python datetime.date objects (not pandas timestamps)
    """
    fact_query = """
        SELECT * FROM [dpoe_stage].[fuel_price_opis].[price_fact]
        """
    dim_query = """
        SELECT * FROM [dpoe_stage].[fuel_price_opis].[date_dim]
        """

    # Both tables are small, so each one is pulled in full and the join is done in pandas rather
    # than in SQL
    fact_rows = pd.read_sql_query(fact_query, con=connection)
    dim_rows = pd.read_sql_query(dim_query, con=connection)

    # Attach the date attributes to every fact row via the shared "date_id" key
    joined = pd.merge(fact_rows, dim_rows, on="date_id")

    # Parse "date_code" and keep only the calendar date part (drops any time component)
    return joined.assign(date_code=pd.to_datetime(joined["date_code"]).dt.date)

## Running Tests

In [4]:
# Load the two raw workbooks (read from the SharePoint documents folder of user "eli") ...
raw_frames = download_raw_data("eli")
retail_7_2019_to_4_2022, region_6_2019 = raw_frames

# ... and the joined price_fact/date_dim contents currently loaded in SQL
source_data = download_SQL_data(ddam)

Tests on the file: "R:/DPOE/Fuel Price/OPIS/2021/Source/Copy of San Diego Association of Governments.xlsx"

In [5]:
# Check that the correct years and months were loaded (August 2020 to April 2022 (see email))

# The correct years and months. Note "MS" is a frequency of every month start
correct_date_range = pd.date_range(start="2020-08-01", end="2022-04-01", freq="MS")

# The distinct months actually present in the raw file. value_counts() orders its index by
# count, not by date, so this index must never be compared position-by-position
csv_date_range = retail_7_2019_to_4_2022["Start Date"].value_counts().index

# Compare the two sets of dates independent of order. (An elementwise `==` between the two
# indexes only raises ValueError when their lengths differ, so wrapping it in try/except would
# report success for any same-length index, even one holding entirely different dates.)
dates_only_in_csv = csv_date_range.difference(correct_date_range)
dates_only_in_expected = correct_date_range.difference(csv_date_range)

# Run test
# NOTE: in the messages below "SQL" refers to the expected date range defined above
if dates_only_in_csv.empty and dates_only_in_expected.empty:
    print("Correct years/months were loaded from SQL")
else:
    print("Incorrect years/months were loaded from SQL")
    print(f"{'Dates in csv but not in SQL:': <32}", list(dates_only_in_csv))
    print(f"{'Dates in SQL but not in csv:': <32}", list(dates_only_in_expected))

Incorrect years/months were loaded from SQL
Dates in csv but not in SQL:     [Timestamp('2019-07-01 00:00:00')]
Dates in SQL but not in csv:     []


In [6]:
# Check the correct number of rows were loaded

# The raw file holds rows for dates outside the expected range; drop those rows first
unexpected_csv_dates = csv_date_range.difference(correct_date_range)
csv_has_unexpected_date = retail_7_2019_to_4_2022["Start Date"].isin(unexpected_csv_dates)
csv_data = retail_7_2019_to_4_2022.loc[~csv_has_unexpected_date]

# SQL also holds rows for other dates; keep only the rows inside the expected range.
# "date_code" holds plain date objects, so convert it to timestamps before matching
sql_in_expected_range = pd.to_datetime(source_data["date_code"]).isin(correct_date_range)
sql_data = source_data.loc[sql_in_expected_range].copy(deep=True)

# Test if the number of rows are the same
test_result = len(csv_data) == len(sql_data)
if not test_result:
    print("Incorrect number of rows were loaded from SQL")
    print(f"{'Rows in csv file:': <20}", len(csv_data))
    print(f"{'Rows in SQL:': <20}", len(sql_data))
else:
    print("Correct number of rows were loaded from SQL")

Correct number of rows were loaded from SQL


In [7]:
# Check each date has the correct number of rows associated with it
# Just use the same dfs as before, with the proper date filters already done

# Rows per date on each side. Both date columns are converted to timestamps first because
# "date_code" holds plain date objects while "Start Date" comes straight from the Excel file;
# without this the two indexes would not hold comparable labels
csv_counts = pd.to_datetime(csv_data["Start Date"]).value_counts()
sql_counts = pd.to_datetime(sql_data["date_code"]).value_counts()

# value_counts() orders by count, so line both series up on the full, sorted set of dates before
# comparing. A date missing from one side counts as 0 rows there instead of raising an error
all_dates = csv_counts.index.union(sql_counts.index)
csv_counts = csv_counts.reindex(all_dates, fill_value=0)
sql_counts = sql_counts.reindex(all_dates, fill_value=0)

# Test if the number of rows are the same
test_result = csv_counts == sql_counts
if test_result.all():
    print("Correct number of rows for each date were loaded from SQL")
else:
    print("Incorrect number of rows for each date were loaded from SQL")
    print("Differences are as follows")
    # Show only the dates whose counts disagree, with both counts side by side
    print(pd.DataFrame({"csv_rows": csv_counts, "sql_rows": sql_counts}).loc[~test_result])

Correct number of rows for each date were loaded from SQL


In [8]:
# Check that column names match
# NOTE: only the names are compared here; dtypes are not checked by this cell

# The correct columns, which come from SQL
correct_columns = source_data.columns

# The actual columns, which come from CSV
csv_columns = retail_7_2019_to_4_2022.columns

# Compare the two sets of names independent of order. (An elementwise `==` between the two
# indexes only raises ValueError when their lengths differ, so wrapping it in try/except would
# report success for any two frames that merely have the same number of columns.)
columns_only_in_csv = csv_columns.difference(correct_columns)
columns_only_in_sql = correct_columns.difference(csv_columns)

# Test the column names match
# TODO(review): the raw file uses display names (e.g. "Retail Average") while SQL uses short
# snake_case names (e.g. "retail_avg"), so a rename mapping is presumably needed before this
# test can pass -- confirm the intended mapping
if columns_only_in_csv.empty and columns_only_in_sql.empty:
    print("Correct column names were loaded from SQL")
else:
    print("Incorrect column names were loaded from SQL")
    print(f"{'Columns in csv but not in SQL:': <32}", list(columns_only_in_csv))
    print(f"{'Columns in SQL but not in csv:': <32}", list(columns_only_in_sql))

Incorrect column names were loaded from SQL
Columns in csv but not in SQL:   ['Freight Average', 'Margin Average', 'Net Average', 'Region Name', 'Retail Average', 'Retail Product Name', 'Start Date', 'Tax Average', 'Wholesale Average']
Columns in SQL but not in csv:   ['date_code', 'date_id', 'dy', 'freight_avg', 'margin_avg', 'mnth', 'net_avg', 'opis_id', 'product', 'qtr', 'region', 'retail_avg', 'season', 'station_count', 'tax_avg', 'wholesale_avg', 'yr']
