# Part One

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
#Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from matplotlib.pyplot import figure
import pyodbc 
import glob
import copy
import PySimpleGUI as sg

# Comparison Functions

## MGRA Level Data

### Concatenate both DS dataframes

In [2]:
def concat_dfs(comparison_first_ID_processed_data, comparison_second_ID_processed_data):
    # Added geozone to merge keys to account for mgra's in multiple jurisdictions (or other geographical levels)
    first_second_ID_comparison = comparison_first_ID_processed_data.merge(
        comparison_second_ID_processed_data,
        how='left',
        left_on=[f'mgra_{first_ID}',
                 f'year_{first_ID}',
                 f'geozone_{first_ID}'],
        right_on=[f'mgra_{second_ID}',
                 f'year_{second_ID}',
                 f'geozone_{second_ID}'])
    
    # Clean green combined
    first_second_ID_comparison = first_second_ID_comparison.drop([f'mgra_{second_ID}', f'year_{second_ID}', f'geozone_{second_ID}'], axis=1)
    first_second_ID_comparison = first_second_ID_comparison.rename(columns={f'mgra_{first_ID}': 'mgra', f'year_{first_ID}': 'year', f'geozone_{first_ID}': 'geozone'})
    
    # Because we're summing, if using series 14 data, mgra's in multiple jurisdictions will be counted multiple times
    first_second_ID_comparison = first_second_ID_comparison.groupby(['mgra', 'year']).sum()
    
    #first_second_ID_comparison.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison")
    
    return first_second_ID_comparison

## CPA level Data

In [3]:
def cpa_aggregation(first_ID_df, second_ID_df, cpa_level):
    
    # Adding SQl Data (CPA) to first_id_df
    comparison_first_ID_processed_data_cpa = first_ID_df.merge(cpa_level, how='left', on='mgra')
    comparison_first_ID_processed_data_cpa = comparison_first_ID_processed_data_cpa[comparison_first_ID_processed_data_cpa['geozone'] != '*Not in a CPA*']

    # Adding SQl Data (CPA) to second_id_df
    comparison_second_ID_processed_data_cpa = second_ID_df.merge(cpa_level, how='left', on='mgra')
    comparison_second_ID_processed_data_cpa = comparison_second_ID_processed_data_cpa[comparison_second_ID_processed_data_cpa['geozone'] != '*Not in a CPA*']

    # Merge first_id_df and second_id_df together on mgra, year, and geozone
    comparison_processed_data_cpa = comparison_first_ID_processed_data_cpa.merge(comparison_second_ID_processed_data_cpa, how='inner', on=['mgra', 'year', 'geozone'], suffixes=[f'_{first_ID}', f'_{second_ID}'])

    # Drop the MGRA column because it isn't really a quantitative value
    comparison_processed_data_cpa = comparison_processed_data_cpa.drop('mgra', axis=1)

    # Aggregate the sum of features by geozone and year
    comparison_processed_data_cpa = comparison_processed_data_cpa.groupby(['geozone', 'year']).sum()

    # Rename index (geozone -> cpa)
    comparison_processed_data_cpa.index.names = ['cpa', 'year']
    
    #comparison_processed_data_cpa.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_cpa")

    return comparison_processed_data_cpa

## Jurisdiction level Data

In [4]:
def jur_aggregation(first_ID_df, second_ID_df, jur_level):
    # Adding SQl Data (Jurisdiction) to first_id_df
    comparison_first_ID_processed_data_jur = first_ID_df.merge(jur_level, how='left', on='mgra')
    
    # Adding SQl Data (Jurisdiction) to second_id_df
    comparison_second_ID_processed_data_jur = second_ID_df.merge(jur_level, how='left', on='mgra')
    
    # Merge first_id_df and second_id_df together on mgra, year, and geozone
    comparison_processed_data_jur = comparison_first_ID_processed_data_jur.merge(comparison_second_ID_processed_data_jur, how='inner', on=['mgra', 'year', 'geozone'], suffixes=[f'_{first_ID}', f'_{second_ID}'])
    
    # Drop the MGRA column because it isn't really a quantitative value
    comparison_processed_data_jur = comparison_processed_data_jur.drop('mgra', axis=1)
    
    # Aggregate the sum of features by geozone and year
    comparison_processed_data_jur = comparison_processed_data_jur.groupby(['geozone', 'year']).sum()
    
    # Rename index (geozone -> jurisdiction)
    comparison_processed_data_jur.index.names = ['jurisdiction', 'year']
    
    #comparison_processed_data_jur.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_jurisdiction")
    
    return comparison_processed_data_jur

## Creating Diff File for all Geo Levels

In [5]:
def non_shared_features(features_first_ID, features_second_ID):
    """
    (Comparison only)
    Identifies non-shared features between two different DS_ID's.
    """
    # In case we want to display non-shared features
    return list(first_ID_unique ^ set(features_second_ID))

In [6]:
def create_diff(features_first_ID, features_second_ID, first_second_ID_comparison):
    """
    (Comparison only)
    """
    # Finding features common to both DSID data frames
    first_ID_unique = set(features_first_ID)
    intersection = first_ID_unique.intersection(features_second_ID)
    shared_features = list(intersection)
    
    # Calculate diff values between the two DS_ID's
    diff_df = pd.DataFrame()

    # NOTE: Subtracts second DS ID from first DS ID. If negative, then second DS ID was greater than first DS ID.
    for column in shared_features:
        diff_df[f'{column}_diff'] = first_second_ID_comparison[f'{column}_{first_ID}'] - first_second_ID_comparison[f'{column}_{second_ID}']
    
    #diff_df.to_csv(f"{output_path}/{first_ID}_{second_ID}_complete_diff")
    
    return diff_df

## Region level Data

In [7]:
def region_aggregation(first_ID_df, second_ID_df):
    # Merge first_id_df and second_id_df together on mgra and year
    comparison_processed_data_reg = first_ID_df.merge(second_ID_df, how='inner', on=['mgra', 'year'], suffixes=[f'_{first_ID}', f'_{second_ID}'])
    
    # Aggregate the sum of features by year
    comparison_processed_data_reg = comparison_processed_data_reg.groupby('year').sum()
    
    # Drop the MGRA column because it isn't really a quantitative value
    comparison_processed_data_reg = comparison_processed_data_reg.drop('mgra', axis=1)
    
    #comparison_processed_data_reg.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_region")
    
    return comparison_processed_data_reg

# Individual Functions

In [8]:
# maybe config argument?
def download_DS_data(ds_ID, jur_level):
    datafiles = config[ds_ID].values()
    
    comparison_no_geozone_df = pd.DataFrame()
    for file_name in datafiles:
        working_df = pd.read_csv(file_name)
        working_df['year'] = f"{file_name[-11:-7]}"
        comparison_no_geozone_df = comparison_no_geozone_df.append(working_df)
        
    # Save the features_first_ID for future use (Used when creating the diff file)
    features = comparison_no_geozone_df.drop(['mgra', 'year'], axis=1).columns
    
    comparison_no_geozone = copy.deepcopy(comparison_no_geozone_df)
    
    # Adding SQl Data to first_id_df
    comparison_processed_data = comparison_no_geozone.merge(jur_level, how='left', on='mgra')
    
    # making it original
    comparison_processed_data.columns = [x + f'_{ds_ID}' for x in comparison_processed_data.columns]
    
    #comparison_first_ID_processed_data.to_csv(f"{output_path}/comparison_{first_ID}_processed_data.csv")
    
    return comparison_processed_data, comparison_no_geozone_df, features

## CPA Aggregation

In [9]:
def cpa_aggregation_ind(first_ID_df, cpa_level):
    
    # Adding SQl Data (CPA) to first_id_df
    comparison_first_ID_processed_data_cpa = first_ID_df.merge(cpa_level, how='left', on='mgra')
    comparison_first_ID_processed_data_cpa = comparison_first_ID_processed_data_cpa[comparison_first_ID_processed_data_cpa['geozone'] != '*Not in a CPA*']

    # Drop the MGRA column because it isn't really a quantitative value
    comparison_processed_data_cpa = comparison_first_ID_processed_data_cpa.drop('mgra', axis=1)

    # Aggregate the sum of features by geozone and year
    comparison_processed_data_cpa = comparison_processed_data_cpa.groupby(['geozone', 'year']).sum()

    # Rename index (geozone -> cpa)
    comparison_processed_data_cpa.index.names = ['cpa', 'year']
    
    #comparison_processed_data_cpa.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_cpa")

    return comparison_processed_data_cpa

## Jurisdiction level Data

In [10]:
def jur_aggregation_ind(first_ID_df, jur_level):
    # Adding SQl Data (Jurisdiction) to first_id_df
    comparison_first_ID_processed_data_jur = first_ID_df.merge(jur_level, how='left', on='mgra')
    
    # Drop the MGRA column because it isn't really a quantitative value
    comparison_processed_data_jur = comparison_first_ID_processed_data_jur.drop('mgra', axis=1)
    
    # Aggregate the sum of features by geozone and year
    comparison_processed_data_jur = comparison_processed_data_jur.groupby(['geozone', 'year']).sum()
    
    # Rename index (geozone -> jurisdiction)
    comparison_processed_data_jur.index.names = ['jurisdiction', 'year']
    
    #comparison_processed_data_jur.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_jurisdiction")
    
    return comparison_processed_data_jur

## Region level Data

In [11]:
def region_aggregation_ind(first_ID_df):
    
    # Aggregate the sum of features by year
    comparison_processed_data_reg = first_ID_df.groupby('year').sum()
    
    # Drop the MGRA column because it isn't really a quantitative value
    comparison_processed_data_reg = comparison_processed_data_reg.drop('mgra', axis=1)
    
    #comparison_processed_data_reg.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_region")
    
    return comparison_processed_data_reg

# Environment Setup

## Pulling Info From YML File

In [12]:
# Localise with . files 
# config_filename = 'C:/Users/cra/OneDrive - San Diego Association of Governments/DS41_42/ds41_42_config.yml'
config_filename = './ds_config.yml'

In [13]:
with open(config_filename, "r") as yml_file:
    config = yaml.safe_load(yml_file)

## Downloading SQL Data

In [14]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

In [15]:
# NOTE: Format to make it select correct MGRA series based on input DS
query_all = "SELECT mgra, geotype, geozone FROM demographic_warehouse.dim.mgra WHERE series = 14 AND (geotype='cpa' OR geotype='jurisdiction')" #Remove the last and part when I do this for real 

In [16]:
sql_query = pd.read_sql_query(query_all,conn)

sql_df_all = pd.DataFrame(sql_query)

In [17]:
# SQl Data at different levels
jur_level = sql_df_all[sql_df_all['geotype']=='jurisdiction'].drop('geotype', axis=1).drop_duplicates()
cpa_level = sql_df_all[sql_df_all['geotype']=='cpa'].drop('geotype', axis=1).drop_duplicates()

# GUI Implementation

## Base window

In [73]:
def base_window():
    # Very basic window.
    # Return event
    layout_first = [
        [sg.Text('Please Designate An Output Path (. for local) And Select A Comparison Option')],
        [sg.Text('Output Path', size =(15, 1)), sg.FolderBrowse(key='output-path')],
        [sg.Listbox(values=(list(config.keys())[1:-1]), select_mode='multiple', size=(30, 6))],
        [sg.Button(button_text='comparison', key='comparison-select'),
         sg.Button(button_text='individual', key='individual-select'),
         sg.Cancel()]
    ]
    
    # Add inputs for output_path, desired features, and desired outputs (maybe a dropdown if possible?)

    window = sg.Window('Base window', layout_first)
    event, values = window.read()
    window.close()

    return event, values

In [74]:
base_window()

('comparison-select', {'output-path': '', 0: []})

## Comparison window

In [43]:
def comparison_window():
    # Very basic window.
    # Return values using
    # automatic-numbered keys
    layout_comparison = [
        [sg.Text('Please Input DS_ID Numbers (Format as DSXX)')],
        [sg.Text('First DS_ID', size =(15, 1)), sg.InputText()],
        [sg.Text('Second DS_ID', size =(15, 1)), sg.InputText()],
        [sg.Submit(key='comparison'), sg.Cancel()]
    ]
    
    window = sg.Window('Comparison window', layout_comparison)
    event, values = window.read()
    window.close()
    
    return event, values

In [44]:
def comparison_window():
    # Very basic window.
    # Return values using
    # automatic-numbered keys
    layout_comparison = [
        [sg.Text('Please Input DS_ID Numbers (Format as DSXX)')],
        [sg.Text('First DS_ID', size =(15, 1)), sg.InputText()],
        [sg.Text('Second DS_ID', size =(15, 1)), sg.InputText()],
        [sg.Submit(key='comparison'), sg.Cancel()]
    ]
    
    window = sg.Window('Comparison window', layout_comparison)
    event, values = window.read()
    window.close()
    
    return event, values

## Individual window

In [45]:
def individual_window():
    # Very basic window.
    # Return values using
    # automatic-numbered keys
    layout_comparison = [
        [sg.Text('Please Input DS_ID Number (Format as DSXX)')],
        [sg.Text('Individual DS_ID', size =(15, 1)), sg.InputText()],
        [sg.Submit(key='individual'), sg.Cancel()]
    ]
    
    window = sg.Window('Individual window', layout_comparison)
    event, values = window.read()
    window.close()
    
    return event, values

## Initialize GUI

In [46]:
# Add some color
# to the window
sg.theme('SandyBeach')

'SandyBeach'

In [47]:
def initiate_window(event):
    event, values = base_window()
    output_path = values['output-path']
    while True:
        if event in [None, 'Cancel']:
            return
        if event == 'comparison-select':
            event, values = comparison_window()
            return event, values
        if event == 'individual-select':
            event, values = individual_window()
            return event, values

In [48]:
event, values = initiate_window(None)

In [50]:
event, values, output_path

NameError: name 'output_path' is not defined

In [24]:
# invalid check_type input
if event not in ['comparison', 'individual']:
    raise AssertionError("Invalid check_type in yml file. Please use either 'comparison' or 'individual'")
    
if event == 'comparison':
    
    first_ID = values[0]
    second_ID = values[1]
    
    # identical ds_id's
    if first_ID == second_ID:
        raise AssertionError('first_ID and second_ID inputs are equivalent. Please make sure they are different values')
    
    # first_id actually exists in list
    if first_ID not in config.keys():
        raise AssertionError('first_ID not found in specified DS_ID list. Please reference yml file for selectable DS_IDs')
    
    # second_id actually exists in list
    if second_ID not in config.keys():
        raise AssertionError('second_ID not found in specified DS_ID list. Please reference yml file for selectable DS_IDs')
        
elif event == 'individual':
    
    individual_ID = values[0]
    
    # individual_id actually exists in list
    if individual_ID not in config.keys():
        raise AssertionError(f'individual_ID not found in specified DS_ID list. Valid DS_IDs include: {", ".join(list(config.keys())[1:-1])}.')

In [25]:
# if event == 'comparison':
    
#     input_list = ['mgra_diff', 'first_ID_processed']
    
#     comparison_function_dict = {}
    
#     # download data for each ds_id
#     first_ID_processed, first_ID_unprocessed, first_ID_features = download_DS_data(first_ID, jur_level)
#     second_ID_processed, second_ID_unprocessed, second_ID_features = download_DS_data(second_ID, jur_level)
    
#     # concatenate both processed ds_id's together to get mgra level data by mgra and year
#     mgra_both = concat_dfs(first_ID_processed, second_ID_processed)
    
#     # aggregate both ds_id's together by cpa level and year
#     cpa_both = cpa_aggregation(first_ID_unprocessed, second_ID_unprocessed, cpa_level)
    
#     # aggregate both ds_id's together by jurisdiction level and year
#     jur_both = jur_aggregation(first_ID_unprocessed, second_ID_unprocessed, jur_level)
    
#     # aggregate both ds_id's together by region level (sum total)
#     region_both = region_aggregation(first_ID_unprocessed, second_ID_unprocessed)
    
#     # Create diff at mgra level
#     mgra_diff = create_diff(first_ID_features, second_ID_features, mgra_both)

#     # Create diff at cpa level
#     cpa_diff = create_diff(first_ID_features, second_ID_features, cpa_both)

#     # Create diff at jurisdiction level
#     jur_diff = create_diff(first_ID_features, second_ID_features, jur_both)
    
#     # Create diff at region level
#     reg_diff = create_diff(first_ID_features, second_ID_features, region_both)
    
#     # output func here

In [27]:
if event == 'comparison':
    
    input_list = ['jur_diff', 'mgra_both', 'region_diff']
    
    # download data for each ds_id    
    first_ID_processed, first_ID_unprocessed, first_ID_features = download_DS_data(first_ID, jur_level)
    second_ID_processed, second_ID_unprocessed, second_ID_features = download_DS_data(second_ID, jur_level)
    
    if 'mgra_both' in input_list:
        mgra_both = concat_dfs(first_ID_processed, second_ID_processed)
    if 'cpa_both' in input_list:
        cpa_both = cpa_aggregation(first_ID_unprocessed, second_ID_unprocessed, cpa_level)
    if 'jur_both' in input_list: 
        jur_both = jur_aggregation(first_ID_unprocessed, second_ID_unprocessed, jur_level)
    if 'region_both' in input_list:
        region_both = region_aggregation(first_ID_unprocessed, second_ID_unprocessed)
    if 'mgra_diff' in input_list:
        if 'mgra_both' not in input_list:
            mgra_both = concat_dfs(first_ID_processed, second_ID_processed)
        mgra_diff = create_diff(first_ID_features, second_ID_features, mgra_both)
    if 'cpa_diff' in input_list: 
        if 'cpa_both' not in input_list:
            cpa_both = cpa_aggregation(first_ID_unprocessed, second_ID_unprocessed, cpa_level)
        cpa_diff = create_diff(first_ID_features, second_ID_features, cpa_both)
    if 'jur_diff' in input_list:
        if 'jur_both' not in input_list:
            jur_both = jur_aggregation(first_ID_unprocessed, second_ID_unprocessed, jur_level)
        jur_diff = create_diff(first_ID_features, second_ID_features, jur_both)
    if 'region_diff' in input_list:
        if 'region_both' not in input_list:
            region_both = region_aggregation(first_ID_unprocessed, second_ID_unprocessed)
        region_diff = create_diff(first_ID_features, second_ID_features, region_both)

        # output func here

In [None]:
# for df in input_list: #May have to do the created thing if strings can't be used as variable names
#      globals()[df].to_csv(f"{output_file}/{df}.csv")

To do: 
- Add the input_list feature in the GUI 
    - Set that value equal to an input_list
- Create the output based on the data frames created 
- Add default folder for unspecified output_path

In [None]:
if event == 'individual':
    
    # download data for the ds_id
    individual_ID_processed, individual_ID_unprocessed, individual_ID_features = download_DS_data(individual_ID, jur_level)
    
    if 'cpa_ind' in input_list:
        cpa_ind = cpa_aggregation_ind(individual_ID_unprocessed, cpa_level)
    if 'jur_ind' in input_list:
        jur_ind = jur_aggregation_ind(individual_ID_unprocessed, jur_level)
    if 'region_ind' in input_list:
        region_ind = region_aggregation_ind(individual_ID_unprocessed)
    
    # aggregate both ds_id's together by cpa level and year
    cpa_ind = cpa_aggregation_ind(individual_ID_unprocessed, cpa_level)
    
    # aggregate both ds_id's together by jurisdiction level and year
    jur_ind = jur_aggregation_ind(individual_ID_unprocessed, jur_level)

    # aggregate both ds_id's together by region level (sum total)
    region_ind = region_aggregation_ind(individual_ID_unprocessed)
    
    # output func

# Output csv files

In [None]:
def comparison_output(input_lst, df_list):
    if 'all' in lst.lower():
        mgra_both.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison.csv")
        cpa_both.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_cpa.csv")
        jur_both.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_jurisdiction.csv")
        region_both.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_region.csv")

        mgra_diff.to_csv(f"{output_path}/{first_ID}_{second_ID}_mgra_diff.csv")
        cpa_diff.to_csv(f"{output_path}/{first_ID}_{second_ID}_cpa_diff.csv")
        jur_diff.to_csv(f"{output_path}/{first_ID}_{second_ID}_jurisdiction_diff.csv")
        reg_diff.to_csv(f"{output_path}/{first_ID}_{second_ID}_region_diff.csv")
    elif 'all':
        

In [None]:
# Ind outputs

#individual_ID_processed.to_csv(f"{output_path}/{individual_ID}_processed_data.csv")
#cpa_ind.to_csv(f"{output_path}/{individual_ID}_cpa.csv")
#jur_ind.to_csv(f"{output_path}/{individual_ID}_jurisdiction.csv")
#region_ind.to_csv(f"{output_path}/{individual_ID}_region.csv")

In [None]:
# Comparison outputs

#mgra_both.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison.csv")
#cpa_both.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_cpa.csv")
#jur_both.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_jurisdiction.csv")
#region_both.to_csv(f"{output_path}/{first_ID}_{second_ID}_comparison_region.csv")

#mgra_diff.to_csv(f"{output_path}/{first_ID}_{second_ID}_mgra_diff.csv")
#cpa_diff.to_csv(f"{output_path}/{first_ID}_{second_ID}_cpa_diff.csv")
#jur_diff.to_csv(f"{output_path}/{first_ID}_{second_ID}_jurisdiction_diff.csv")
#reg_diff.to_csv(f"{output_path}/{first_ID}_{second_ID}_region_diff.csv")

In [None]:
def foo(x,y):
    return(x+y)

In [None]:
a = {'function1': foo}

In [None]:
a['function1'](1,2)

In [None]:
def mult(x,y):
    return (x*y)

In [None]:
mult(4,a['function1'](1,2))

# Brainstorming for Part 2

In [None]:
config['dof']['dof_data'] # figure out what this is for and what to do with it

In [None]:
mgra_both.head()

In [None]:
# MGRA level stats
mgra_both.describe(percentiles=[0.5])

### Outlier Detection

In [None]:
# Good article: https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/

In [None]:
diff_stats = mgra_diff.describe()

In [None]:
diff_stats

In [None]:
from scipy import stats

In [None]:
mgra_both

In [None]:
q = mgra_both["taz_DS35"].quantile(0.99)
mgra_both[mgra_both["taz_DS35"] > q]

In [None]:
mgra_both[mgra_both["taz_DS35"] > q]

In [None]:
mgra_both['taz_DS35']

In [None]:
mgra_both[(np.abs(stats.zscore(mgra_both)) < 3).all(axis=1)]

In [None]:
diff_stats.loc['max']

In [None]:
diff_stats.loc['max']