# Merge with Python (so bad)

In [None]:
# Pure-Python merge of the CSV rows onto the GeoJSON feature properties.
import json
import csv

# Open up both of our files. newline="" is what the csv module asks for, so
# that quoted fields containing line breaks are parsed correctly.
with open("practice.json", "r") as jsonfile, open(r"visual_date.csv", "r", newline="") as csv_file:
    json_load = json.load(jsonfile)

    # Where we're storing our merged dictionaries
    list_of_dicts = []
    # Raw fips values we have already merged, in the order they were added
    added_fips = []
    # The same values as a set, so the duplicate check is a constant-time lookup
    seen_fips = set()

    # Index the features once, keyed by the last 5 characters of GEO_ID, so each
    # CSV row becomes a single dict lookup instead of a scan over every feature.
    # setdefault() keeps the first feature seen for a key, which is the one a
    # front-to-back scan of the features would match.
    properties_by_fips = {}
    for feature in json_load['features']:
        # Pull out the data we want, this is a dictionary
        properties = feature['properties']
        # Adding the polygon data to the properties dictionary
        properties['coordinates'] = feature['geometry']['coordinates']
        properties_by_fips.setdefault(properties['GEO_ID'][-5:], properties)

    # Loop through csv dicts
    for elem in csv.DictReader(csv_file):
        vis_dict = dict(elem)

        # Get the fips value from our csv dict, we use this to check duplicates
        new_fips = vis_dict['fips']
        if new_fips in seen_fips:
            continue

        # The csv value needs to be brought up to 5 digits to line up with the
        # 5-character GEO_ID suffix used as the index key.
        properties = properties_by_fips.get(new_fips.rjust(5, '0'))
        if properties is None:
            # No feature for this fips value
            continue

        # Remember the fips value so later duplicate rows are skipped
        seen_fips.add(new_fips)
        added_fips.append(new_fips)
        # Use "update" to merge the csv columns into the feature's properties
        properties.update(vis_dict)
        # Append the merged dict to our final list
        list_of_dicts.append(properties)

# Number of merged records
print(len(list_of_dicts))

# Merge with Pandas

In [None]:
# The same merge, this time letting pandas do the join.
import pandas as pd

# Formatter handed to "converters": left-pads a fips string with 0's up to 5 characters.
pad_to_five = '{:0>5}'.format
visualization = pd.read_csv("visual_date.csv", sep=',', header='infer', converters={'fips': pad_to_five})

# The json contains latin characters, hence the explicit encoding
geojson = pd.read_json("practice.json", encoding="latin-1")

# Visual dataframe
visual_df = pd.DataFrame(visualization)

# "json_normalize" flattens nested json; nested keys become dotted column names such as "properties.GEO_ID".
# "max_level" limits how many levels deep the flattening goes.
geo_df_pandas = pd.json_normalize(geojson['features'], max_level=1)

# Join key for the geo side: the last 5 characters of GEO_ID, which line up with the padded "fips" column.
geo_fips = geo_df_pandas['properties.GEO_ID'].str[-5:]
joined = pd.merge(geo_df_pandas, visual_df, left_on=geo_fips, right_on=['fips'], how='inner')

# The visual data contains duplicate fips rows; keep only the first of each.
merged = joined.drop_duplicates(subset='fips')

# Runs in well under 10 seconds here.
# pandas does the join in compiled code (large parts of the library are written in Cython/C),
# which is why it is typically much faster than looping over the rows in Python.
display(merged)

# Merge with SQL (loading the GeoJSON into PostgreSQL)

In [None]:
# Temporarily doing this to load the json into postgres database, annoying issue with geojson.
import json
from sqlalchemy import create_engine, Column, Integer, String, JSON
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker


def load_json_file(file_path, table_name, db_url='postgresql://postgres:postgres@localhost:5432/psql_playground'):
    """Store the contents of a JSON file as a single row in a database table.

    The table has an integer primary key ``id`` and a JSON column ``data``;
    it is created if it does not exist yet. The whole parsed file is inserted
    as one record.

    Args:
        file_path: Path of the JSON file to read.
        table_name: Name of the table to create (if needed) and insert into.
        db_url: SQLAlchemy connection URL. Defaults to the local
            ``psql_playground`` database.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        Exception: Database errors are re-raised after the transaction is
            rolled back.
    """
    # Read the JSON file first, so a bad path or bad JSON fails before any
    # database objects are created.
    with open(file_path, 'r') as f:
        json_data = json.load(f)

    # Create a connection to the database
    engine = create_engine(db_url)
    try:
        # A fresh Base per call, so the table can take the caller's name
        Base = declarative_base()

        class MyTable(Base):
            __tablename__ = table_name
            id = Column(Integer, primary_key=True)
            data = Column(JSON)

        Base.metadata.create_all(engine)

        # Create a session to add the data to the table
        Session = sessionmaker(bind=engine)
        session = Session()
        try:
            session.add(MyTable(data=json_data))
            session.commit()
        except Exception:
            # Leave the connection in a clean state before propagating
            session.rollback()
            raise
        finally:
            session.close()
    finally:
        # Release the pooled connections held by this one-off engine
        engine.dispose()

# Load practice.json into the "json_table" table
load_json_file(file_path="practice.json", table_name="json_table")

# Rebuilding GeoJSON

In [3]:
import pandas as pd
import json

# Load map_data.csv into a DataFrame
data = pd.read_csv("map_data.csv", sep=',')

In [4]:
# to_json() serialises the DataFrame to a string; "records" gives one object per row
to_json_options = {
    'orient': 'records',
    'double_precision': 12,
    'date_format': 'iso',
}
json_result_string = data.to_json(**to_json_options)

# json.loads() parses that string back into Python objects (a list of dicts)
json_result = json.loads(json_result_string)

In [15]:
# Print the first feature's geometry from the original file, as a reference
# for the structure to reproduce when reconstructing the geojson
with open("practice.json", "r") as jsonfile:
    json_load = json.load(jsonfile)

first_geometry = json_load['features'][0]['geometry']
print(first_geometry)

{'type': 'Polygon', 'coordinates': [[[-85.388717, 33.913044], [-85.380885, 33.873508], [-85.379455, 33.866291], [-85.377426, 33.856047], [-85.376403, 33.850656], [-85.364595, 33.788446], [-85.361844, 33.773951], [-85.360491, 33.767958], [-85.357402, 33.750104], [-85.355252, 33.739245], [-85.344054, 33.682684], [-85.342722, 33.675953], [-85.323792, 33.580339], [-85.31534, 33.537646], [-85.314994, 33.535898], [-85.314843, 33.534951], [-85.314091, 33.530218], [-85.313999, 33.529807], [-85.304439, 33.482884], [-85.308211, 33.481579], [-85.30925, 33.483137], [-85.314852, 33.487603], [-85.316028, 33.488267], [-85.320893, 33.488359], [-85.324856, 33.489161], [-85.331061, 33.491014], [-85.33828, 33.4947], [-85.342544, 33.495961], [-85.344923, 33.497608], [-85.346705, 33.501148], [-85.349958, 33.501216], [-85.351594, 33.4996], [-85.354491, 33.498866], [-85.352576, 33.494538], [-85.352573, 33.492438], [-85.355315, 33.49248], [-85.497455, 33.494624], [-85.501645, 33.494456], [-85.51731, 33.494524

In [18]:
# Rebuilding a geojson file
import ast

# Output key in each feature's "properties" -> column name in the flat records
property_columns = {
    'GEO_ID': 'properties.GEO_ID',
    'STATE': 'properties.STATE',
    'COUNTY': 'properties.COUNTY',
    'NAME': 'properties.NAME',
    'LSAD': 'properties.LSAD',
    'CENSUSAREA': 'properties.CENSUSAREA',
    'FIPS': 'FIPS',
    'recent_trend': 'recent_trend',
    'prediction_trend': 'prediction_trend',
    'O3_max_pred': 'O3_max_pred',
    'PM25_max_pred': 'PM25_max_pred',
    'future_prediction_trend': 'future_prediction_trend',
}

# Setup the container for our data
geojson = {
    'type': 'FeatureCollection',
    'features': []
}

# The records are flat (no nesting), so each one is re-nested into the
# properties/geometry layout of a geojson feature.
# Tip: print a single record and break out of the loop to see the flat column names.
for record in json_result:
    feature_properties = {
        key: record[column] for key, column in property_columns.items()
    }
    feature_geometry = {
        'type': record['geometry.type'],
        # The coordinates are stored as the text of a list; ast.literal_eval()
        # safely parses that text back into a real nested list.
        'coordinates': ast.literal_eval(record['geometry.coordinates']),
    }
    geojson['features'].append({
        'type': 'Feature',
        'properties': feature_properties,
        'geometry': feature_geometry,
    })

In [19]:
# Finally serialise the rebuilt dict and write it out as a json file.
with open("map_data.json", "w") as outfile:
    outfile.write(json.dumps(geojson))