# DSE 203 - Assignment 3
## Group 5 - Christopher Vanhook, Vaaruni Desai, Zufeshan Imran
### <font color='red'>Query - List all the unincorporated places where there is a cafe</font>

#### Import the required libraries

In [49]:
import getpass
import json
import time

import pandas as pd
import psycopg2
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from jsonpath_ng.ext import parse

#### We're looking at two files: a JSON file containing information about places in San Diego, and a CSV file containing information about businesses in and around San Diego.

In [2]:
# Input data sets used throughout the notebook.
# CSV of businesses (loaded into PostgreSQL further down).
csv_file = "nourish_public_ca_business.csv"
# JSON document describing places (parsed with the json module).
json_file = "statisticalAtlas.json"

#### Load in JSON file using json library

In [3]:
# Parse the whole JSON document into memory as `json_data`.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding (which differs between machines).
with open(json_file, "r", encoding="utf-8") as f:
    json_data = json.load(f)

#### Connect to the PostgreSQL server (the one managed through pgAdmin) - this cell takes user input

In [4]:
# Collect the connection settings interactively so that nothing is
# hard-coded in the notebook.
database = input("Enter database name:")
host = input("Enter host name:")
user = input("Enter user name:")
# getpass does not echo the typed value, so the password is not written
# into the notebook's saved cell output the way input() would.
password = getpass.getpass("Enter password:")
port = input("Enter port number:")

# Open the connection used by every SQL cell below.
conn = psycopg2.connect(database=database,
                        host=host,
                        user=user,
                        password=password,
                        port=port)

# Cursor shared by the table-creation, import and query cells.
cur = conn.cursor()

Enter database name: postgres
Enter host name: localhost
Enter user name: postgres
Enter password: ········
Enter port number: 5432


#### Import CSV data into PostgreSQL
1. Create a table 
2. Copy/import csv data into respective columns in the created table 

In [5]:
# DDL for the table that receives the business CSV. IF NOT EXISTS makes
# this cell safe to re-run against a database that already has the table.
# The statement is assembled line by line so each column can be annotated;
# the resulting SQL text is unchanged.
CREATE_DB = (
    "CREATE table IF NOT EXISTS nourish_public\n"
    "(id bigserial primary key,\n"   # auto-generated key, not present in the CSV import column list
    " name varchar,\n"
    " address varchar,\n"
    " avg_rating varchar,\n"         # kept as text, exactly as declared here
    " zip varchar(5),\n"
    " categories varchar,\n"         # searched with LIKE in the cafe query
    " city varchar\n"                # used as the join key against the JSON data
    ");"
)

cur.execute(CREATE_DB)
# Persist the DDL; psycopg2 runs statements inside a transaction until commit.
conn.commit()

In [6]:
# Bulk-load the business CSV into nourish_public.
# COPY ... FROM '<path>' is executed by the PostgreSQL server, so the path
# below must exist on the database host.
# NOTE(review): the path is hard-coded and does not use `csv_file` - confirm
# that '/tmp/' is where the file is staged for the server.
IMPORT_CSV = """COPY nourish_public(name,address,avg_rating,zip,categories,city) FROM '/tmp/nourish_public_ca_business.csv' DELIMITER ',' CSV HEADER;"""

# The table is created with IF NOT EXISTS, so it survives between runs.
# Importing unconditionally would append the whole CSV again on every
# re-run and duplicate each business; only load when the table is empty.
cur.execute("SELECT EXISTS (SELECT 1 FROM nourish_public);")
table_already_loaded = cur.fetchone()[0]
if not table_already_loaded:
    cur.execute(IMPORT_CSV)
# Commit in both cases so the transaction opened by the check is closed.
conn.commit()

#### SQL Query to extract places where there is a cafe

In [7]:
# Fetch every business whose category text contains "Cafe" (LIKE is
# case-sensitive in PostgreSQL, so only this exact casing matches).
CAFE_QUERY = "SELECT * FROM nourish_public WHERE categories LIKE '%Cafe%';"
cur.execute(CAFE_QUERY)

# Column names come from the cursor metadata; each description item starts
# with the column name.
col_names = [column[0] for column in cur.description]
# Rows are tuples in table column order (SELECT *).
records = cur.fetchall()

# The cursor is not used again after this point.
cur.close()

#### JSON Query to extract Unincorporated Places

In [8]:
# Select the value stored under the top-level 'Unincorporated Places' key.
jsonpath_expr = parse("$.['Unincorporated Places']")
found = jsonpath_expr.find(json_data)
# Each result wraps the matched node; keep only the underlying values.
matches = [datum.value for datum in found]

#### Creating a Hash Table for JSON data - key being the city name

In [9]:
# Index the unincorporated-place records by their 'City Name' value:
# json_hash maps city name -> list of records carrying that name.
start_time = time.time()
json_hash = {}
for entry in matches[0]:
    city_name = entry.get('City Name')
    if city_name is None:
        # Records without a city name cannot be joined; leave them out.
        continue
    json_hash.setdefault(city_name, []).append(entry)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.0001533031463623047 seconds ---


#### Creating a Hash Table for SQL data - key being the city

In [10]:
# Index the cafe rows by city: sql_hash maps city -> list of row tuples.
start_time = time.time()
sql_hash = {}
for entry in records:
    # Position 6 is `city` in the nourish_public column order
    # (id, name, address, avg_rating, zip, categories, city).
    city_name = entry[6]
    if city_name is None:
        # Rows with a NULL city have no key to join on.
        continue
    sql_hash.setdefault(city_name, []).append(entry)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.00011491775512695312 seconds ---


#### Joining both datasets by fuzzy-matching the hash table keys (city names)

In [56]:
# Fuzzy-join the two hash tables on city name.
# For every city that has a cafe (sql_hash key), find the closest
# unincorporated place name (json_hash key) and accept it when the
# token-sort similarity is at least `threshold`.
start_time = time.time()
threshold = 85
# JSON city names that matched at least one cafe city, in first-seen order.
matching_keys = []

json_keys = list(json_hash)
for sql_key in sql_hash:
    # extractOne returns the single best (choice, score) pair, or None when
    # nothing reaches score_cutoff (including an empty choice list), so no
    # max() over a possibly empty candidate list is needed.
    # The result is kept in its own name: the earlier cell that builds
    # json_hash reads the global `matches`, which must not be overwritten.
    best_match = process.extractOne(
        sql_key,
        json_keys,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=threshold,
    )
    if best_match is None:
        continue
    json_key = best_match[0]
    if json_key in matching_keys:
        # Several cafe-city spellings can map to the same place; list it once.
        continue
    matching_keys.append(json_key)
    print(json_hash[json_key])
time_taken = (time.time() - start_time)
# time.time() differences are in seconds; one second is 1,000,000 microseconds.
print(f"{time_taken * 1_000_000} microseconds")

[{'State': 'California', 'County': 'San Diego', 'ZIP Code': '92067', 'City Name': 'Rancho Santa Fe', 'Metro Area': 'San Diego Area', 'Nearby Cities': 'Carlsbad, Del Mar, Encinitas, Escondido, Poway, San Marcos, Solana Beach, Vista', 'Assembly District': 'CA-77', 'State Senate District': 'CA-38', 'Congressional District': 'CA-49', 'Secondary School District': 'San Dieguito', 'Elementary School Districts': 'Rancho Santa Fe, Solana Beach', 'Nearby Unincorporated Places': 'Hidden Meadows, Lake San Marcos', 'Neighboring Unincorporated Places': 'Fairbanks Ranch'}]
[{'State': 'California', 'County': 'San Diego', 'City Name': 'Pala', 'ZIP Codes': '92059, 92082', 'Metro Area': 'San Diego Area', 'Nearby Cities': 'Escondido, Temecula, Vista', 'Assembly District': 'CA-75', 'State Senate District': 'CA-38', 'Congressional District': 'CA-50', 'Unified School District': 'Bonsall', 'Nearby Unincorporated Places': 'Aguanga, Bonsall, Camp Pendleton North, Fallbrook, Hidden Meadows, Rainbow, Valley Cente