# OPIS QC

In [3]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Downloading Data

## Source Data

In [46]:
SANDAG = pd.read_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-54 OPIS Fuel Price Data QC\data\Copy of San Diego Association of Governments.xlsx')
June_data = pd.read_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-54 OPIS Fuel Price Data QC\data\Copy of SanDiegoCountyJune2019.xlsx')

In [47]:
SANDAG

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
0,"County - CA, San Diego",Midgrade Gas,2019-07-01,3.878256,2.495961,0.757949,0.015,0.609346,3.105307
1,"County - CA, San Diego",Unleaded Gas,2019-07-01,3.719346,2.355909,0.753572,0.015,0.594865,2.950774
2,"County - CA, San Diego",Diesel,2019-07-01,3.907932,2.338376,0.980837,0.015,0.573719,2.912095
3,"County - CA, San Diego",Premium Gas,2019-07-01,3.995260,2.575163,0.760255,0.015,0.644842,3.220006
4,"County - CA, San Diego",Premium Gas,2020-08-01,3.506908,2.151249,0.781632,0.015,0.559027,2.710276
...,...,...,...,...,...,...,...,...,...
83,"County - CA, San Diego",Premium Gas,2022-03-01,6.019278,4.676671,0.862529,0.015,0.465078,5.141749
84,"County - CA, San Diego",Premium Gas,2022-04-01,6.121857,4.391234,0.854100,0.015,0.861523,5.252757
85,"County - CA, San Diego",Midgrade Gas,2022-04-01,5.992657,4.244345,0.849617,0.015,0.883695,5.128040
86,"County - CA, San Diego",Unleaded Gas,2022-04-01,5.794604,4.050425,0.844127,0.015,0.885052,4.935477


## SQL Data

In [42]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=dpoe_stage;'
                    'Trusted_Connection=yes;')

query ="SELECT\
      [region]\
      ,[product]\
	  ,dim.date_code\
      ,[retail_avg]\
      ,[wholesale_avg]\
      ,[tax_avg]\
      ,[freight_avg]\
      ,[margin_avg]\
      ,[net_avg]\
  FROM [dpoe_stage].[fuel_price_opis].[price_fact] AS fact\
  INNER JOIN [dpoe_stage].[fuel_price_opis].[date_dim] AS dim\
  ON fact.date_id = dim.date_id\
  WHERE dim.yr >= 2019"

sql_data =  pd.read_sql_query(query, conn)

# Data Type Checks 

In [52]:
sql_data.columns

Index(['region', 'product', 'date_code', 'retail_avg', 'wholesale_avg',
       'tax_avg', 'freight_avg', 'margin_avg', 'net_avg'],
      dtype='object')

In [51]:
June_data.columns

Index(['Region Name', 'Retail Product Name ', 'Start Date', 'Retail Average',
       'Wholesale Average', 'Tax Average', 'Freight Average', 'Margin Average',
       'Net Average'],
      dtype='object')

# Data Cleaning 

In [6]:
sql_data.columns = SANDAG.columns

In [16]:
June_data.columns = SANDAG.columns # Retail product name has an extra space in the column header

In [17]:
June_data['Retail Product Name'] = [string.strip() for string in June_data['Retail Product Name']]

In [13]:
SANDAG['Retail Product Name'] = [string.strip() for string in SANDAG['Retail Product Name']] # Some of the data inside the product name raw files has excessive spaces inside

In [18]:
# SQL Data Cleaning 
sql_data_sorted = sql_data.sort_values(by='Retail Average').reset_index(drop=True)
sql_data_sorted['Start Date'] = pd.to_datetime(sql_data_sorted['Start Date'])

In [21]:
# SANDAG Data Cleaning 
SANDAG_data_sorted = SANDAG.sort_values(by='Retail Average').reset_index(drop=True)
SANDAG_data_sorted['Start Date'] = pd.to_datetime(SANDAG_data_sorted['Start Date'])

In [23]:
# June Data Cleaning
june_data_sorted = June_data.sort_values(by='Retail Average').reset_index(drop=True)
june_data_sorted['Start Date'] = pd.to_datetime(june_data_sorted['Start Date']) 

# Data Comparison

In [24]:
SANDAG.sum()

  SANDAG.sum()


Region Name            County - CA, San DiegoCounty - CA, San DiegoCo...
Retail Product Name    Midgrade GasUnleaded GasDieselPremium GasPremi...
Retail Average                                                374.315147
Wholesale Average                                             245.668631
Tax Average                                                     76.95694
Freight Average                                                     1.32
Margin Average                                                 50.369569
Net Average                                                   296.038203
dtype: object

In [55]:
sql_data.sum()

  sql_data.sum()


Region Name            County - CA, San DiegoCounty - CA, San DiegoCo...
Retail Product Name    Premium GasMidgrade GasUnleaded GasDieselPremi...
Retail Average                                                374.593941
Wholesale Average                                             245.791765
Tax Average                                                    76.783018
Freight Average                                                     1.32
Margin Average                                                 50.699153
Net Average                                                    296.49092
dtype: object

# Checking June Data

In [25]:
june_data_sorted

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
0,"County - CA, San Diego",Unleaded Gas,2019-06-01,3.78903,2.42469,0.699437,0.015,0.649903,3.074593
1,"County - CA, San Diego",Midgrade Gas,2019-06-01,3.949064,2.562529,0.703723,0.015,0.667812,3.230341
2,"County - CA, San Diego",Diesel,2019-06-01,3.981497,2.254755,0.969351,0.015,0.742391,2.997146
3,"County - CA, San Diego",Premium Gas,2019-06-01,4.059997,2.646569,0.70618,0.015,0.692249,3.338818


In [28]:
# Grabbing the individual Retail Average values 
june_retail_values = list(june_data_sorted['Retail Average'])

In [36]:
# Sorting the SQL Dataframe to only include retail values that match
sql_june_sort = sql_data_sorted[sql_data_sorted['Retail Average'].isin(june_retail_values)].reset_index(drop=True)
sql_june_sort

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
0,"County - CA, San Diego",Unleaded Gas,2019-06-01,3.78903,2.42469,0.699437,0.015,0.649903,3.074593
1,"County - CA, San Diego",Midgrade Gas,2019-06-01,3.949064,2.562529,0.703723,0.015,0.667812,3.230341
2,"County - CA, San Diego",Diesel,2019-06-01,3.981497,2.254755,0.969351,0.015,0.742391,2.997146
3,"County - CA, San Diego",Premium Gas,2019-06-01,4.059997,2.646569,0.70618,0.015,0.692249,3.338818


In [40]:
june_data_sorted['Net Average'][0] 

3.07459306668028

In [41]:
sql_june_sort['Net Average'][0] 

3.074593067

In [39]:
june_data_sorted == sql_june_sort

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
0,True,True,True,True,True,False,True,False,False
1,True,True,True,True,True,False,True,False,False
2,True,True,True,True,True,False,True,False,False
3,True,True,True,True,True,False,True,False,False


# Testing of other dataset

In [85]:
not_equivalent_index_vals = []
for num_col in range(len(SANDAG)): 
    equivalent_test = pd.DataFrame(SANDAG_data_sorted.iloc[num_col,:]).T == pd.DataFrame(sql_data_sorted.iloc[num_col,:]).T
    sum_val = equivalent_test.iloc[0].sum()
    if sum_val != 9:
        not_equivalent_index_vals.append(num_col) 

not_equivalent_index_vals


[25, 26, 27, 30, 31, 32, 33, 34, 35]

In [86]:
pd.DataFrame(SANDAG_data_sorted.iloc[25,:]).T == pd.DataFrame(sql_data_sorted.iloc[25,:]).T

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
25,True,False,False,False,False,False,True,False,False


In [90]:
pd.DataFrame(SANDAG_data_sorted.iloc[25,:]).T

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
25,"County - CA, San Diego",Unleaded Gas,2019-07-01,3.719346,2.355909,0.753572,0.015,0.594865,2.950774


In [89]:
pd.DataFrame(sql_data_sorted.iloc[25,:]).T

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
25,"County - CA, San Diego",Diesel,2021-02-01,3.720077,2.273301,1.027826,0.015,0.403949,2.677251
