# Combine & Upload All State Files into a Single Upload Project
Purpose:  To combine the ProcessedInputData csv files of each individual organization, by state, into a single WaDE 2.0 upload project.

Notes: 
- asdf

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library
import geoplot as gplt # for plotting maps
import geoplot.crs as gcrs #used to pull in webdata

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Notebook display settings: show up to 999 columns of a DataFrame, and
# render floats with five fixed decimals instead of scientific notation.
pd.options.display.max_columns = 999
pd.options.display.float_format = lambda value: '%.5f' % value

In [None]:
# ---- working directory ----
# The relative csv paths used by the other cells resolve against this folder.
workingDirString = "G:/Shared drives/WaDE Data/Washington/Regulatory" # set working directory folder string here
os.chdir(workingDirString)
# Interpolate the path in the f-string itself (the old literal carried an
# f-prefix but no placeholder); the printed text is unchanged.
print(f'The working Directory is: {workingDirString}')

## Input Data

In [None]:
# organization #1 - Regulatory_WDNR
# ----------------------------------------------
# Read this organization's ProcessedInputData tables in a single pass; the
# targets on the left line up one-to-one with the paths below, in order.
dfd1, dfo1, dfro1, dfru1, dfrru1, dfs1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Regulatory_WDNR/ProcessedInputData/date.csv",
        "Regulatory_WDNR/ProcessedInputData/organizations.csv",
        "Regulatory_WDNR/ProcessedInputData/regulatoryoverlays.csv",
        "Regulatory_WDNR/ProcessedInputData/reportingunits.csv",
        "Regulatory_WDNR/ProcessedInputData/regulatoryreportingunits.csv",
        "Regulatory_WDNR/ProcessedInputData/sites.csv",
    )
)

In [None]:
# organization #2 - Regulatory_WDOE
# ----------------------------------------------
# Read this organization's ProcessedInputData tables; the targets on the left
# line up one-to-one with the paths below, in order.
dfd2, dfo2, dfro2, dfru2, dfrru2, dfs2 = map(
    pd.read_csv,
    (
        "Regulatory_WDOE/ProcessedInputData/date.csv",
        "Regulatory_WDOE/ProcessedInputData/organizations.csv",
        "Regulatory_WDOE/ProcessedInputData/regulatoryoverlays.csv",
        "Regulatory_WDOE/ProcessedInputData/reportingunits.csv",
        "Regulatory_WDOE/ProcessedInputData/regulatoryreportingunits.csv",
        "Regulatory_WDOE/ProcessedInputData/sites.csv",
    ),
)

## Concatenate Like Files Together

In [None]:
# Concatenate date.csv files
frames = [dfd1, dfd2]  # add each organization's date dataframe to this list
dfd = pd.concat(frames)
dfd = dfd.drop_duplicates()       # keep one copy of identical rows
dfd = dfd.reset_index(drop=True)  # renumber the stacked rows from 0
dfd = dfd.replace(np.nan, "")     # missing values become empty strings
print(len(dfd))
dfd.head(1)

In [None]:
# Concatenate organizations.csv files
frames = [dfo1, dfo2]  # add each organization's organizations dataframe to this list
dfo = (
    pd.concat(frames)
    .drop_duplicates()        # keep one copy of identical rows
    .reset_index(drop=True)   # renumber the stacked rows from 0
    .replace(np.nan, "")      # missing values become empty strings
)
print(len(dfo))
dfo.head()

In [None]:
# Concatenate organizations.csv files
# NOTE(review): this cell repeats the organizations concatenation from the
# cell directly above; running it again rebuilds dfo from the same inputs.
frames = [dfo1, dfo2]  # add each organization's organizations dataframe to this list
dfo = pd.concat(frames)
dfo = dfo.drop_duplicates()       # keep one copy of identical rows
dfo = dfo.reset_index(drop=True)  # renumber the stacked rows from 0
dfo = dfo.replace(np.nan, "")     # missing values become empty strings
print(len(dfo))
dfo.head()

In [None]:
# Concatenate regulatoryoverlays.csv files
frames = [dfro1, dfro2]  # add each organization's regulatoryoverlays dataframe to this list
dfro = (
    pd.concat(frames)
    .drop_duplicates()        # keep one copy of identical rows
    .reset_index(drop=True)   # renumber the stacked rows from 0
    .replace(np.nan, "")      # missing values become empty strings
)
print(len(dfro))
dfro.head()

In [None]:
# Concatenate reportingunits.csv files
frames = [dfru1, dfru2]  # add each organization's reportingunits dataframe to this list
dfru = pd.concat(frames)
dfru = dfru.drop_duplicates()       # keep one copy of identical rows
dfru = dfru.reset_index(drop=True)  # renumber the stacked rows from 0
dfru = dfru.replace(np.nan, "")     # missing values become empty strings
print(len(dfru))
dfru.head()

In [None]:
# Concatenate regulatoryreportingunits.csv files
frames = [dfrru1, dfrru2]  # add each organization's regulatoryreportingunits dataframe to this list
dfrru = (
    pd.concat(frames)
    .drop_duplicates()        # keep one copy of identical rows
    .reset_index(drop=True)   # renumber the stacked rows from 0
    .replace(np.nan, "")      # missing values become empty strings
)
print(len(dfrru))
dfrru.head()

In [None]:
# Concatenate sites.csv files
# Rows that share a SiteUUID are collapsed into one row with groupby(); where a
# column holds more than one distinct value for that SiteUUID, the values are
# joined into a single comma-separated string.
def _join_unique_values(values):
    """Return the distinct non-empty entries of *values* as one comma-separated string.

    Entries keep the order in which they first appear. De-duplicating with
    dict.fromkeys() instead of set() keeps that order stable, so the joined
    text (and the exported csv) is the same on every run.
    """
    return ','.join(str(elem) for elem in dict.fromkeys(values) if elem != "")


frames = [dfs1, dfs2]  # list all of our dataframes here
dfs = pd.concat(frames)
dfs = dfs.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
dfs = dfs.groupby('SiteUUID').agg(_join_unique_values).replace(np.nan, "").reset_index()
print(len(dfs))
dfs.head()

# Inspect Output Data & Export

In [None]:
# Export out to CSV
# Each combined dataframe is paired with its output file; all are written
# without the index column.
exports = (
    (dfd, 'ProcessedInputData/date.csv'),
    (dfo, 'ProcessedInputData/organizations.csv'),
    (dfro, 'ProcessedInputData/regulatoryoverlays.csv'),
    (dfru, 'ProcessedInputData/reportingunits.csv'),
    (dfrru, 'ProcessedInputData/regulatoryreportingunits.csv'),
    (dfs, 'ProcessedInputData/sites.csv'),
)
for combined, out_path in exports:
    combined.to_csv(out_path, index=False)