# OPIS QC

In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Downloading Data

## Source Data

In [43]:
SANDAG = pd.read_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-54 OPIS Fuel Price Data QC\data\Copy of San Diego Association of Governments.xlsx')
June_data = pd.read_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-54 OPIS Fuel Price Data QC\data\Copy of SanDiegoCountyJune2019.xlsx')

In [46]:
SANDAG

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
0,"County - CA, San Diego",Midgrade Gas,2019-07-01,3.878256,2.495961,0.757949,0.015,0.609346,3.105307
1,"County - CA, San Diego",Unleaded Gas,2019-07-01,3.719346,2.355909,0.753572,0.015,0.594865,2.950774
2,"County - CA, San Diego",Diesel,2019-07-01,3.907932,2.338376,0.980837,0.015,0.573719,2.912095
3,"County - CA, San Diego",Premium Gas,2019-07-01,3.995260,2.575163,0.760255,0.015,0.644842,3.220006
4,"County - CA, San Diego",Premium Gas,2020-08-01,3.506908,2.151249,0.781632,0.015,0.559027,2.710276
...,...,...,...,...,...,...,...,...,...
83,"County - CA, San Diego",Premium Gas,2022-03-01,6.019278,4.676671,0.862529,0.015,0.465078,5.141749
84,"County - CA, San Diego",Premium Gas,2022-04-01,6.121857,4.391234,0.854100,0.015,0.861523,5.252757
85,"County - CA, San Diego",Midgrade Gas,2022-04-01,5.992657,4.244345,0.849617,0.015,0.883695,5.128040
86,"County - CA, San Diego",Unleaded Gas,2022-04-01,5.794604,4.050425,0.844127,0.015,0.885052,4.935477


## SQL Data

In [28]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=dpoe_stage;'
                    'Trusted_Connection=yes;')

query ="SELECT\
      [region]\
      ,[product]\
	  ,dim.date_code\
      ,[retail_avg]\
      ,[wholesale_avg]\
      ,[tax_avg]\
      ,[freight_avg]\
      ,[margin_avg]\
      ,[net_avg]\
  FROM [dpoe_stage].[fuel_price_opis].[price_fact] AS fact\
  INNER JOIN [dpoe_stage].[fuel_price_opis].[date_dim] AS dim\
  ON fact.date_id = dim.date_id\
  WHERE dim.yr >= 2019"

sql_data =  pd.read_sql_query(query, conn)

# Data Cleaning 

In [45]:
sql_data.columns = SANDAG.columns

In [62]:
sql_data.columns

Index(['Region Name', 'Retail Product Name', 'Start Date', 'Retail Average',
       'Wholesale Average', 'Tax Average', 'Freight Average', 'Margin Average',
       'Net Average'],
      dtype='object')

In [53]:
SANDAG['Retail Product Name'] = [string.strip() for string in SANDAG['Retail Product Name']] # Some of the data inside the product name raw files has excessive spaces inside

In [73]:
sql_data_sorted = sql_data.sort_values(by='Retail Average').reset_index(drop=True)

In [92]:
sql_data_sorted.columns

Index(['Region Name', 'Retail Product Name', 'Start Date', 'Retail Average',
       'Wholesale Average', 'Tax Average', 'Freight Average', 'Margin Average',
       'Net Average'],
      dtype='object')

In [95]:
sql_data_sorted['Start Date'] = pd.to_datetime(sql_data_sorted['Start Date'])

In [74]:
SANDAG_data_sorted = SANDAG.sort_values(by='Retail Average').reset_index(drop=True)

In [96]:
SANDAG_data_sorted['Start Date'] = pd.to_datetime(SANDAG_data_sorted['Start Date'])

# Data Comparison

In [54]:
SANDAG.sum()

  SANDAG.sum()


Region Name            County - CA, San DiegoCounty - CA, San DiegoCo...
Retail Product Name    Midgrade GasUnleaded GasDieselPremium GasPremi...
Retail Average                                                374.315147
Wholesale Average                                             245.668631
Tax Average                                                     76.95694
Freight Average                                                     1.32
Margin Average                                                 50.369569
Net Average                                                   296.038203
dtype: object

In [55]:
sql_data.sum()

  sql_data.sum()


Region Name            County - CA, San DiegoCounty - CA, San DiegoCo...
Retail Product Name    Premium GasMidgrade GasUnleaded GasDieselPremi...
Retail Average                                                374.593941
Wholesale Average                                             245.791765
Tax Average                                                    76.783018
Freight Average                                                     1.32
Margin Average                                                 50.699153
Net Average                                                    296.49092
dtype: object

In [85]:
not_equivalent_index_vals = []
for num_col in range(len(SANDAG)): 
    equivalent_test = pd.DataFrame(SANDAG_data_sorted.iloc[num_col,:]).T == pd.DataFrame(sql_data_sorted.iloc[num_col,:]).T
    sum_val = equivalent_test.iloc[0].sum()
    if sum_val != 9:
        not_equivalent_index_vals.append(num_col) 

not_equivalent_index_vals


[25, 26, 27, 30, 31, 32, 33, 34, 35]

In [86]:
pd.DataFrame(SANDAG_data_sorted.iloc[25,:]).T == pd.DataFrame(sql_data_sorted.iloc[25,:]).T

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
25,True,False,False,False,False,False,True,False,False


In [90]:
pd.DataFrame(SANDAG_data_sorted.iloc[25,:]).T

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
25,"County - CA, San Diego",Unleaded Gas,2019-07-01,3.719346,2.355909,0.753572,0.015,0.594865,2.950774


In [89]:
pd.DataFrame(sql_data_sorted.iloc[25,:]).T

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
25,"County - CA, San Diego",Diesel,2021-02-01,3.720077,2.273301,1.027826,0.015,0.403949,2.677251


In [83]:
for num_col in range(len(SANDAG)): 
    print(num_col)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87


In [81]:
for i in range(1,10):
    print(i)

1
2
3
4
5
6
7
8
9


In [78]:
(pd.DataFrame(SANDAG_data_sorted.iloc[0,:]).T == pd.DataFrame(sql_data_sorted.iloc[0,:]).T).iloc[0].sum()

9

In [68]:
pd.DataFrame(sql_data_sorted.iloc[0,:]).T

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
16,"County - CA, San Diego",Unleaded Gas,2020-11-01,3.151355,1.747771,0.769642,0.015,0.618942,2.366713


In [30]:
sql_data.columns = SANDAG.columns

In [33]:
sql_data

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
0,"County - CA, San Diego",Premium Gas,2019-06-01,4.059997,2.646569,0.706180,0.015,0.692249,3.338818
1,"County - CA, San Diego",Midgrade Gas,2019-06-01,3.949064,2.562529,0.703723,0.015,0.667812,3.230341
2,"County - CA, San Diego",Unleaded Gas,2019-06-01,3.789030,2.424690,0.699437,0.015,0.649903,3.074593
3,"County - CA, San Diego",Diesel,2019-06-01,3.981497,2.254755,0.969351,0.015,0.742391,2.997146
4,"County - CA, San Diego",Premium Gas,2020-08-01,3.506908,2.151249,0.781632,0.015,0.559027,2.710276
...,...,...,...,...,...,...,...,...,...
83,"County - CA, San Diego",Premium Gas,2022-03-01,6.019278,4.676671,0.862529,0.015,0.465078,5.141749
84,"County - CA, San Diego",Premium Gas,2022-04-01,6.121857,4.391234,0.854100,0.015,0.861523,5.252757
85,"County - CA, San Diego",Midgrade Gas,2022-04-01,5.992657,4.244345,0.849617,0.015,0.883695,5.128040
86,"County - CA, San Diego",Unleaded Gas,2022-04-01,5.794604,4.050425,0.844127,0.015,0.885052,4.935477


In [34]:
sql_data.sort_values(by='Retail Average')

Unnamed: 0,Region Name,Retail Product Name,Start Date,Retail Average,Wholesale Average,Tax Average,Freight Average,Margin Average,Net Average
16,"County - CA, San Diego",Unleaded Gas,2020-11-01,3.151355,1.747771,0.769642,0.015,0.618942,2.366713
23,"County - CA, San Diego",Unleaded Gas,2020-12-01,3.177152,1.823218,0.771942,0.015,0.566992,2.390210
14,"County - CA, San Diego",Unleaded Gas,2020-10-01,3.178574,1.724378,0.768999,0.015,0.670197,2.394575
5,"County - CA, San Diego",Unleaded Gas,2020-08-01,3.211559,1.838607,0.772384,0.015,0.585569,2.424175
8,"County - CA, San Diego",Unleaded Gas,2020-09-01,3.214100,1.774930,0.770494,0.015,0.653676,2.428606
...,...,...,...,...,...,...,...,...,...
85,"County - CA, San Diego",Midgrade Gas,2022-04-01,5.992657,4.244345,0.849617,0.015,0.883695,5.128040
82,"County - CA, San Diego",Diesel,2022-03-01,6.000822,4.163553,1.302629,0.015,0.519641,4.683194
83,"County - CA, San Diego",Premium Gas,2022-03-01,6.019278,4.676671,0.862529,0.015,0.465078,5.141749
84,"County - CA, San Diego",Premium Gas,2022-04-01,6.121857,4.391234,0.854100,0.015,0.861523,5.252757


In [None]:
def download_data(path, year):
    '''This function downloads both the source and staging data'''
    # Source Data (R Drive)
    files_list = glob.glob(path + f"/*{year}*")
    df_list = []
    for filename in files_list:
        df = pd.read_csv(filename, index_col=None, header=0)
        df_list.append(df)

    # Staging Data (SQL)
    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=dpoe_stage;'
                      'Trusted_Connection=yes;')
    
    query ="SELECT *"\
        "FROM [dpoe_stage].[veh_reg_dmv].[fact]"\
        f"WHERE yr = {year};"

    return pd.concat(df_list, axis=0, ignore_index=True), pd.read_sql_query(query, conn)

In [None]:
def clean_data(source_data, sql_data):
    '''This function cleans the data to make them equivalent'''
    sql_data = sql_data.drop('dmv_registration_id', axis=1) #This column doesn't exist in source data
    source_data.columns = sql_data.columns #This is making the assumption formatting does not change, if it does, our analysis portin should catch errors
    sql_data['own'] = sql_data['own'].str.rstrip("\r") #Documented in findings 
    return source_data, sql_data

In [None]:
def analyze_data(source_data, sql_data):
    '''This function runs the QC test. If this function returns 'True', all tests were passed'''
    length_boolean = len(source_data) == len(sql_data) #Checking for identical lengths
    personal_owned_boolean = (len(source_data[source_data['own']=='Personal']) == len(sql_data[sql_data['own']=='Personal']))
    commercial_owned_boolean = (len(source_data[source_data['own']=='Commercial']) == len(sql_data[sql_data['own']=='Commercial']))
    government_owned_boolean = (len(source_data[source_data['own']=='Government']) == len(sql_data[sql_data['own']=='Government']))
    identical_shapes_boolean = sql_data.shape == source_data.shape
    return (length_boolean and personal_owned_boolean and commercial_owned_boolean and government_owned_boolean and identical_shapes_boolean)