# Data Manipulation Functions
This notebook houses all of the data manipulation functions. 

Data Manipulation Functions 
1. Diff Files 
2. Both Files

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import yaml
import os
import pyodbc
import glob
import copy
import traceback
import warnings
warnings.filterwarnings('ignore')

# Input Data

In [16]:
# Information
# Datasource ids (as strings) to load; each id is used to pick input files by name
# and becomes a top-level key of `all_data`.
DATASOURCE_IDS = ['38', '42']
# Folder scanned for input CSVs. The trailing separator is kept so that plain string
# concatenation with a file name yields a valid path.
data_input_folder = 'J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_13_outputs_CSV_data/aggregated_data/'
# Root output folder; the functions below append "/both_files/..." and "/diff_files/..." to it.
data_output_folders = 'J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_13_outputs_CSV_data'
# Geography levels of interest. NOTE(review): not referenced by the cells visible in
# this notebook section — confirm whether a driver loop elsewhere consumes it.
geography_levels = ['taz']

# Download Data

In [17]:
# Load every CSV that belongs to each datasource into all_data[ds_id][file_name].
# The file name (not the full path) is the inner key, because the functions below
# look files up as f"{level}_DS{ds_id}_ind_QA.csv".
all_data = {}
for ds_id in DATASOURCE_IDS:
    # Compare against whole underscore-separated tokens so that '38' matches
    # 'taz_DS38_ind_QA.csv' but not 'taz_DS138_ind_QA.csv' or a name that merely
    # contains the digits "38" somewhere else.
    ds_token = f"DS{ds_id}"
    temp_dict = {}
    for file in sorted(os.listdir(data_input_folder)):
        stem, extension = os.path.splitext(file)
        # Skip sub-folders and non-CSV files instead of handing them to read_csv.
        if extension.lower() == '.csv' and ds_token in stem.split('_'):
            # TODO: Have two folders input and output and have this grab from the input folder 
            temp_dict[file] = pd.read_csv(os.path.join(data_input_folder, file))
    all_data[ds_id] = temp_dict

# Both Files

In [12]:
def create_both_file(DS1, DS2, level, send_to_j_drive):
    """Build a side-by-side ("both") table of two datasources at one geography level.

    Parameters
    ----------
    DS1, DS2 : str
        Datasource ids; used as keys into the module-level ``all_data`` dict and
        as column suffixes (``_DS<id>``) in the result.
    level : str
        Geography level. Selects the input file ``f"{level}_DS{id}_ind_QA.csv"``
        and, unless it is ``'region'``, is also the geography join column.
    send_to_j_drive : str
        When equal to ``'Y'`` the result is also written to
        ``data_output_folders + "/both_files/..."``.

    Returns
    -------
    pandas.DataFrame
        DS1 left-joined with DS2, summed per ``year`` (region) or per
        ``(level, year)``; overlapping columns carry the ``_DS<id>`` suffixes.

    Raises
    ------
    KeyError
        If a datasource/file is missing from ``all_data`` or a join column is
        absent from either frame.
    """
    df1 = all_data[DS1][f"{level}_DS{DS1}_ind_QA.csv"]
    df2 = all_data[DS2][f"{level}_DS{DS2}_ind_QA.csv"]

    # Fix the year id issue. Each frame is checked independently: with an
    # if/elif only one of them was renamed when both used 'yr_id', and the
    # merge on 'year' then failed.
    if 'yr_id' in df1.columns:
        df1 = df1.rename(columns={'yr_id': 'year'})
    if 'yr_id' in df2.columns:
        df2 = df2.rename(columns={'yr_id': 'year'})

    # Diagnostics only (nothing is filtered here). The checks are independent so
    # every mismatch is reported, and each one looks in both directions.
    if df1.shape != df2.shape:
        print(f"Shapes are not identical. DF1 is shape {df1.shape} and DF2 is shape {df2.shape}")

    cols_only_df1 = [x for x in df1.columns if x not in df2.columns]
    cols_only_df2 = [x for x in df2.columns if x not in df1.columns]
    if cols_only_df1 or cols_only_df2:
        print(f"There is a difference in columns. The columns that exist in DF1 but not in DF2 are {cols_only_df1}. The columns that exist in DF2 but not in DF1 are {cols_only_df2}")

    if 'year' in df1.columns and 'year' in df2.columns:
        years_df1 = set(df1['year'])
        years_df2 = set(df2['year'])
        if years_df1 != years_df2:
            print(f"There is a difference in the years of these dataframes. The years that exist in DF1 but not in DF2 are {sorted(years_df1 - years_df2)}. The years that exist in DF2 but not in DF1 are {sorted(years_df2 - years_df1)}")

    # Region files are joined on year alone; every other level is joined on
    # (level, year).
    join_keys = ['year'] if level == 'region' else [f'{level}', 'year']

    # NOTE(review): this is a left join, so keys that exist only in DS2 are
    # dropped, and duplicate keys in either input would be multiplied by the
    # merge before being summed — confirm inputs are unique per key.
    output = df1.merge(df2, on=join_keys, how='left', suffixes=(f'_DS{DS1}', f'_DS{DS2}'))
    output = output.groupby(join_keys).sum()

    if send_to_j_drive == 'Y':
        output.to_csv(data_output_folders + f"/both_files/{level}_both_DS{DS1}_DS{DS2}_QA.csv", index=True)

    return output

In [13]:
# both_data = create_both_file('38', '42', 'taz', send_to_j_drive='Y')
# both_data

Unnamed: 0_level_0,Unnamed: 1_level_0,hs_DS38,hs_sf_DS38,hs_mf_DS38,hs_mh_DS38,hh_DS38,hh_sf_DS38,hh_mf_DS38,hh_mh_DS38,gq_civ_DS38,gq_mil_DS38,...,luz_id_DS42,truckregiontype_DS42,district27_DS42,milestocoast_DS42,acres_DS42,effective_acres_DS42,land_acres_DS42,MicroAccessTime_DS42,remoteAVParking_DS42,refueling_stations_DS42
taz,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
13,2016,2,2,0,0,1,1,0,0,0,0,...,6,2,2,7.7545,20.387422,7.799114,20.387422,240,0,1
13,2018,2,2,0,0,1,1,0,0,0,0,...,6,2,2,7.7545,20.387422,7.799114,20.387422,240,0,1
13,2020,2,2,0,0,1,1,0,0,0,0,...,6,2,2,7.7545,20.387422,7.799114,20.387422,240,0,1
13,2023,2,2,0,0,1,1,0,0,0,0,...,6,2,2,7.7545,20.387422,7.799114,20.387422,240,0,1
13,2025,2,2,0,0,1,1,0,0,0,0,...,6,2,2,7.7545,20.387422,7.799114,20.387422,240,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,2032,105,0,105,0,91,0,91,0,0,0,...,229,1,11,5.3225,22.853249,22.853249,22.853249,3,0,0
4996,2035,105,0,105,0,92,0,92,0,0,0,...,229,1,11,5.3225,22.853249,22.853249,22.853249,3,0,0
4996,2040,210,0,210,0,109,0,109,0,0,0,...,229,1,11,5.3225,22.853249,22.853249,22.853249,3,0,0
4996,2045,210,0,210,0,120,0,120,0,0,0,...,229,1,11,5.3225,22.853249,22.853249,22.853249,3,0,0


# Diff Functions

In [19]:
# TODO: Consider moving the small amount of QC done here into a separate function altogether.
def create_diff_file(DS1, DS2, level, send_to_j_drive):
    """Compute DS2 minus DS1 for one geography level.

    Parameters
    ----------
    DS1, DS2 : str
        Datasource ids; used as keys into the module-level ``all_data`` dict.
    level : str
        Geography level. Selects the input file ``f"{level}_DS{id}_ind_QA.csv"``
        and is the geography grouping column.
    send_to_j_drive : str
        When equal to ``'Y'`` the result is also written to
        ``data_output_folders + "/diff_files/..."``.

    Returns
    -------
    pandas.DataFrame
        ``DS2 - DS1`` over the columns and years the two datasources share,
        indexed by ``(level, year)`` (or by ``year`` alone for a region file
        that has no ``'region'`` column).

    Raises
    ------
    KeyError
        If a datasource/file is missing from ``all_data`` or a grouping column
        is absent from either frame.
    """
    df1 = all_data[DS1][f"{level}_DS{DS1}_ind_QA.csv"]
    df2 = all_data[DS2][f"{level}_DS{DS2}_ind_QA.csv"]

    # Fix the year id issue. Each frame is checked independently: with an
    # if/elif only one of them was renamed when both used 'yr_id'.
    if 'yr_id' in df1.columns:
        df1 = df1.rename(columns={'yr_id': 'year'})
    if 'yr_id' in df2.columns:
        df2 = df2.rename(columns={'yr_id': 'year'})

    # First check to see if shapes are identical
    if df1.shape != df2.shape:
        print(f"Shapes are not identical. DF1 is shape {df1.shape} and DF2 is shape {df2.shape}")

    # Keep only the shared columns. The check looks in both directions: a
    # one-way difference missed columns present only in DF2, which then came
    # out of the subtraction as all-NaN columns.
    cols_only_df1 = [x for x in df1.columns if x not in df2.columns]
    cols_only_df2 = [x for x in df2.columns if x not in df1.columns]
    if cols_only_df1 or cols_only_df2:
        print(f"There is a difference in columns. The columns that exist in DF1 but not in DF2 are {cols_only_df1}. The columns that exist in DF2 but not in DF1 are {cols_only_df2}")
        shared_columns = [x for x in df1.columns if x in df2.columns]
        df1 = df1[shared_columns]
        df2 = df2[shared_columns]

    # Keep only the shared years, again checking both directions so years
    # present only in DF2 do not produce NaN rows.
    years_df1 = set(df1['year'])
    years_df2 = set(df2['year'])
    if years_df1 != years_df2:
        print(f"There is a difference in the years of these dataframes. The years that exist in DF1 but not in DF2 are {sorted(years_df1 - years_df2)}. The years that exist in DF2 but not in DF1 are {sorted(years_df2 - years_df1)}")
        shared_years = sorted(years_df1 & years_df2)
        df1 = df1[df1['year'].isin(shared_years)]
        df2 = df2[df2['year'].isin(shared_years)]

    # Mirror create_both_file: a region file without a 'region' column is keyed
    # on year alone (grouping on the missing column would raise KeyError).
    if level == 'region' and level not in df1.columns:
        group_keys = ['year']
    else:
        group_keys = [f'{level}', 'year']

    df1 = df1.groupby(group_keys).sum()
    df2 = df2.groupby(group_keys).sum()

    # Index-aligned subtraction: a geography present in only one datasource
    # still yields NaN for that row.
    diff_df = df2 - df1

    if send_to_j_drive == 'Y':
        diff_df.to_csv(data_output_folders + f"/diff_files/{level}_diff_DS{DS2}_minus_DS{DS1}_QA.csv", index=True)

    return diff_df

In [20]:
# diff_output = create_diff_file('38', '42', 'taz', send_to_j_drive='Y')
# diff_output

Unnamed: 0_level_0,Unnamed: 1_level_0,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,gq_civ,gq_mil,...,luz_id,truckregiontype,district27,milestocoast,acres,effective_acres,land_acres,MicroAccessTime,remoteAVParking,refueling_stations
taz,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
13,2016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
13,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
13,2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
13,2023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
13,2025,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,2032,0,0,0,0,2,0,2,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
4996,2035,0,0,0,0,2,0,2,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
4996,2040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
4996,2045,0,0,0,0,-3,0,-3,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0,0,0
