# Data Manipulation Functions
This notebook houses all of the data manipulation functions. 

Data Manipulation Functions 
1. Diff Files 
2. Both Files

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import yaml
import os
import pyodbc
import glob
import copy
import traceback
import warnings
warnings.filterwarnings('ignore')

# Input Data

In [None]:
# Notebook configuration

# Datasource ids; the download cell loads, per id, the files in the folder
# below whose name contains that id.
DATASOURCE_IDS = [
    '39',
    '42',
    '99',
    'SQLDS99',
]

# Folder on the J: drive that the download cell lists and reads CSVs from.
j_drive_output_folder_path = (
    'J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_13_outputs_CSV_data/aggregated_data/'
)

# Presumably the values accepted as `level` by the functions below — TODO confirm.
geography_levels = [
    'mgra',
    'region',
]

# Download Data

In [None]:
# Load the aggregated CSVs for every datasource into memory.
# Result layout: all_data[ds_id][file_name] -> DataFrame.
all_data = {}

# List the folder once; sorting makes the load order deterministic.
aggregated_file_names = sorted(os.listdir(j_drive_output_folder_path))

for ds_id in DATASOURCE_IDS:
    # Match the whole "_DS<id>_" token rather than the bare id: a plain substring
    # test makes '99' also match every 'SQLDS99' file (and any other digits in the
    # name), so those files were read twice and stored under the wrong datasource.
    # The functions below look files up as f"{level}_DS{id}_ind_QA.csv", which
    # always contains this token.
    ds_token = f"_DS{ds_id}_"
    # TODO: Have two folders input and output and have this grab from the input folder
    all_data[ds_id] = {
        file_name: pd.read_csv(os.path.join(j_drive_output_folder_path, file_name))
        for file_name in aggregated_file_names
        # Skip sub-folders / non-CSV artefacts that read_csv cannot parse.
        if ds_token in file_name and file_name.lower().endswith('.csv')
    }

# Both Files

In [None]:
def create_both_file(DS1, DS2, level, send_to_j_drive):
    """Build the side-by-side ("both") table for two datasources.

    The `level` tables of DS1 and DS2 are left-merged on the geography/year
    keys, overlapping value columns get the suffixes ``_DS<DS1>`` / ``_DS<DS2>``,
    and the result is summed per key.

    Parameters
    ----------
    DS1, DS2 : str
        Datasource ids; keys of the module-level ``all_data`` dict.
    level : str
        Geography level used in the file name (``f"{level}_DS{id}_ind_QA.csv"``).
        For ``'region'`` the merge key is the year only; otherwise it is
        ``[level, 'year']``.
    send_to_j_drive : str
        When equal to ``'Y'`` the result is also written as CSV to the J: drive.

    Returns
    -------
    pandas.DataFrame
        Summed merge result, indexed by the merge keys.

    Raises
    ------
    KeyError
        If a requested table is not in ``all_data`` or has no year column.

    Notes
    -----
    Because the merge is a left join, keys that only exist in DS2 are dropped.
    QC findings are printed, not raised.
    """
    df1 = all_data[DS1][f"{level}_DS{DS1}_ind_QA.csv"]
    df2 = all_data[DS2][f"{level}_DS{DS2}_ind_QA.csv"]

    # The QC checks used to read 'yr_id' while the merge used 'year', so one of
    # the two always raised KeyError. Detect whichever name is present and work
    # on renamed copies so the frames cached in all_data are left untouched.
    year_aliases = ('year', 'yr_id', 'yr')
    normalized = []
    for label, frame in (('DF1', df1), ('DF2', df2)):
        year_col = next((c for c in year_aliases if c in frame.columns), None)
        if year_col is None:
            raise KeyError(f"{label} has no year column; expected one of {list(year_aliases)}")
        normalized.append(frame.rename(columns={year_col: 'year'}))
    df1, df2 = normalized

    # QC checks are independent (not an elif chain) so every discrepancy is reported.
    if df1.shape != df2.shape:
        print(f"Shapes are not identical. DF1 is shape {df1.shape} and DF2 is shape {df2.shape}")

    cols_only_in_df1 = [x for x in df1.columns if x not in df2.columns]
    cols_only_in_df2 = [x for x in df2.columns if x not in df1.columns]
    if cols_only_in_df1 or cols_only_in_df2:
        print(f"There is a difference in columns. The columns that exist in DF1 but not in DF2 are {cols_only_in_df1}. The columns that exist in DF2 but not in DF1 are {cols_only_in_df2}")

    years_df1, years_df2 = set(df1['year']), set(df2['year'])
    if years_df1 != years_df2:
        print(f"There is a difference in the years of these dataframes. The years that exist in DF1 but not in DF2 are {list(years_df1 - years_df2)}. The years that exist in DF2 but not in DF1 are {list(years_df2 - years_df1)}")

    # Region tables are keyed by year only; every other level also keys on its geography id.
    merge_keys = ['year'] if level == 'region' else [f'{level}', 'year']
    output = df1.merge(df2, on=merge_keys, how='left', suffixes=(f'_DS{DS1}', f'_DS{DS2}'))
    output = output.groupby(merge_keys).sum()

    if send_to_j_drive == 'Y':
        output.to_csv('J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_13_outputs_CSV_data/both_files/' + f"{level}_both_DS{DS1}_DS{DS2}_QA.csv", index=True)

    return output

# Diff Functions

In [2]:
# NOTE: the year column may be named 'yr_id', 'year' or 'yr'; it is detected inside the function and
#       kept under the name found in DS1 (it is NOT renamed to 'year', so existing outputs keep their header).
# TODO: Consider moving the small amount of QC done here into a separate function altogether.
def create_diff_file(DS1, DS2, level, send_to_j_drive):
    """Build the difference table ``DS2 - DS1`` for one geography level.

    Both tables are restricted to their shared columns and shared years,
    summed per geography/year key, and subtracted element-wise.

    Parameters
    ----------
    DS1, DS2 : str
        Datasource ids; keys of the module-level ``all_data`` dict.
    level : str
        Geography level used in the file name (``f"{level}_DS{id}_ind_QA.csv"``)
        and as the geography grouping column.
    send_to_j_drive : str
        When equal to ``'Y'`` the result is also written as CSV to the J: drive.

    Returns
    -------
    pandas.DataFrame
        ``DS2 - DS1``, indexed by the grouping keys.

    Raises
    ------
    KeyError
        If a requested table is not in ``all_data``, has no year column, or
        (for non-region levels) has no ``level`` column.

    Notes
    -----
    QC findings are printed, not raised. Geography ids present in only one
    datasource still produce NaN rows, because the subtraction aligns on index.
    """
    df1 = all_data[DS1][f"{level}_DS{DS1}_ind_QA.csv"]
    df2 = all_data[DS2][f"{level}_DS{DS2}_ind_QA.csv"]

    # Locate the year column; 'yr_id' is tried first so existing files behave as before.
    year_aliases = ('yr_id', 'year', 'yr')
    year_col = next((c for c in year_aliases if c in df1.columns), None)
    if year_col is None:
        raise KeyError(f"DF1 has no year column; expected one of {list(year_aliases)}")
    if year_col not in df2.columns:
        df2_year_col = next((c for c in year_aliases if c in df2.columns), None)
        if df2_year_col is None:
            raise KeyError(f"DF2 has no year column; expected one of {list(year_aliases)}")
        # rename() returns a copy, so the frame cached in all_data is not modified.
        df2 = df2.rename(columns={df2_year_col: year_col})

    # First check to see if shapes are identical
    if df1.shape != df2.shape:
        print(f"Shapes are not identical. DF1 is shape {df1.shape} and DF2 is shape {df2.shape}")

    # Keep only the shared columns. The check is symmetric: a one-sided
    # difference() missed columns that exist only in DF2, which then surfaced
    # as all-NaN columns in the result.
    cols_only_in_df1 = [x for x in df1.columns if x not in df2.columns]
    cols_only_in_df2 = [x for x in df2.columns if x not in df1.columns]
    if cols_only_in_df1 or cols_only_in_df2:
        print(f"There is a difference in columns. The columns that exist in DF1 but not in DF2 are {cols_only_in_df1}. The columns that exist in DF2 but not in DF1 are {cols_only_in_df2}")
        shared_columns = [x for x in df1.columns if x in df2.columns]
        df1 = df1[shared_columns]
        df2 = df2[shared_columns]

    # Keep only the shared years (symmetric for the same reason as above).
    years_df1, years_df2 = set(df1[year_col]), set(df2[year_col])
    if years_df1 != years_df2:
        print(f"There is a difference in the years of these dataframes. The years that exist in DF1 but not in DF2 are {list(years_df1 - years_df2)}. The years that exist in DF2 but not in DF1 are {list(years_df2 - years_df1)}")
        shared_years = years_df1 & years_df2
        df1 = df1[df1[year_col].isin(shared_years)]
        df2 = df2[df2[year_col].isin(shared_years)]

    # create_both_file keys region tables by year only; do the same here when
    # the region table carries no 'region' column instead of raising KeyError.
    if level == 'region' and level not in df1.columns:
        group_keys = [year_col]
    else:
        group_keys = [f'{level}', year_col]

    df1 = df1.groupby(group_keys).sum()
    df2 = df2.groupby(group_keys).sum()

    diff_df = df2 - df1

    if send_to_j_drive == 'Y':
        diff_df.to_csv('J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_13_outputs/' + f"{level}_diff_DS{DS2}_minus_DS{DS1}_QA.csv", index=True)

    return diff_df