In [1]:
from bs4 import BeautifulSoup

import tldextract

import requests

from urllib.parse import urljoin

import pandas as pd

from joblib import Parallel, delayed


DOMAINS = ['coopercity.gov', 'coopercityfl.org']


def get_all_links(base_url):

    print(f"Fetching {base_url}...")

    try:

        page = requests.get(base_url, timeout=2)

        doc = BeautifulSoup(page.text, "html.parser")

        tags = doc.find_all("a")

        links = []


        for link in tags:

            try:

                href = link['href'].lstrip()

                full_url = urljoin(base_url, href)

                urlDomain = f"{tldextract.extract(full_url).domain}.{tldextract.extract(full_url).suffix}"

                if (urlDomain in DOMAINS) and (full_url not in links):

                    links.append(full_url)


            except KeyError:

                pass


        return links

    except requests.RequestException as e:

        print(f"Error fetching {base_url}: {e}")

        return []


    except KeyboardInterrupt:

        print("Interrupted")

        return []

        # Occasionally this program will probably get hung up parsing something

        # You should make a daemon to kill it after a certain amount of time (like 1-2 seconds),

        # or we can be lazy and just pass this problem with a keyboard interrupt.

        # generally you want to have some kind of high level logger, error handler, and interrupt handler

        # so that you can turn off your computer and start it again without losing any data.


def get_recursive_links(base_url, depth=2):

    visited = set()

    to_visit = [base_url]


    for depthIndex in range(depth):

        print(f"\n\nDepth {depthIndex + 1}...")

        new_to_visit = []

        for url in to_visit:

            if url not in visited:

                visited.add(url)

                try:

                    new_links = get_all_links(url)

                    new_to_visit.extend(new_links)

                    # print(f"Found {len(new_links)} links at {url}")

                    to_visit = new_to_visit

                except:

                    pass

        


    return visited


if __name__ == "__main__":

    # links = get_all_links('https://coopercity.gov/')

    # links = get_recursive_links('https://coopercity.gov/?SEC=%7BAD7C348E-C110-425A-B91C-2CA5769BF937%7D', depth=2)
    links_nested = Parallel(n_jobs=-1, prefer="threads")(delayed(get_recursive_links)(s) for s in ['https://coopercity.gov/?SEC=%7BAD7C348E-C110-425A-B91C-2CA5769BF937%7D'])
    # links = Parallel(n_jobs=-1, prefer="threads")(delayed(get_recursive_links)(s) for s in urls)
    print(links_nested)
    links_flat = [link for sublist in links_nested for link in sublist]
    df = pd.DataFrame(links_flat, columns=['URL'])

    print(df.to_string())





Depth 1...
Fetching https://coopercity.gov/?SEC=%7BAD7C348E-C110-425A-B91C-2CA5769BF937%7D...


Depth 2...
Fetching https://coopercity.gov/?SEC=%7BAD7C348E-C110-425A-B91C-2CA5769BF937%7D#mainContent...
Fetching https://coopercity.gov/...
Fetching https://coopercity.gov/government...
Fetching https://coopercity.gov/cityofficials...
Fetching https://coopercity.gov/city-commission-agendas-minutes...
Fetching https://coopercity.gov/viewmeeting...
Fetching https://coopercity.gov/index.asp?SEC=D045C1C8-6282-421F-A8E2-186C1F0BFAE9...
Fetching https://coopercity.gov/commission-initiatives...
Fetching https://coopercity.gov/intermediate.asp?link=http://www.halleyweb.com/c058010/hh/index.php...
Error fetching https://coopercity.gov/intermediate.asp?link=http://www.halleyweb.com/c058010/hh/index.php: HTTPSConnectionPool(host='coopercity.gov', port=443): Read timed out. (read timeout=2)
Fetching https://coopercity.gov/contact-us...
Fetching https://coopercity.gov/strategicplan...
Fetching https:

In [2]:


import os
import fitz  
import json

## This is just a 1 time thing to get the jsons in my folders 

def pdf_to_json(pdf_path):
    print(f"Opening PDF: {pdf_path}")
    try:
        document = fitz.open(pdf_path)
    except Exception as e:
        print(f"Failed to open PDF: {e}")
        return None
    
    pdf_content = {}
    
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text = page.get_text("text")
        
        pdf_content[f'page_{page_num + 1}'] = text
    
    pdf_json = json.dumps(pdf_content, indent=4)
    return pdf_json

def process_pdfs_in_folder(folder_path):
    for root, dirs, files in os.walk(folder_path):
        print(f"Checking directory: {root}")
        for filename in files:
            if filename.endswith('.pdf'):
                pdf_path = os.path.join(root, filename)
                absolute_pdf_path = os.path.abspath(pdf_path)
                print(f"Processing {absolute_pdf_path}")
                
                # Check if the file exists
                if not os.path.exists(absolute_pdf_path):
                    print(f"File not found: {absolute_pdf_path}")
                    continue
                
                pdf_json = pdf_to_json(absolute_pdf_path)
                if pdf_json is None:
                    print(f"Failed to convert {absolute_pdf_path} to JSON.")
                    continue
                
                # Create output directory within the same city directory
                output_folder = os.path.dirname(absolute_pdf_path)
                
                json_filename = os.path.splitext(filename)[0] + '.json'
                json_path = os.path.join(output_folder, json_filename)
                
                try:
                    with open(json_path, 'w') as json_file:
                        json_file.write(pdf_json)
                    print(f"Saved JSON to {json_path}")
                except Exception as e:
                    print(f"Failed to save JSON to {json_path}: {e}")


folder_path = 'put in your absolute path' # will be changed to relative after this thing works

process_pdfs_in_folder(folder_path)

In [3]:
import os
import re
import importlib.util

# get relative path to the Florida folder
base_path = "/storage/Florida/children/"

results_dict = {}
links = {}

# Check if the base path exists
for county in os.listdir(os.getcwd() + base_path):
    if os.path.isdir(os.getcwd() + base_path + county) and county != "__pycache__":
        for municipality in os.listdir(os.getcwd() + base_path + county + "/children/"):
            if os.path.isdir(os.getcwd() + base_path + county + "/children/" + municipality) and municipality != "__pycache__":
                module = importlib.import_module(f"storage.Florida.children.{county}.children.{municipality}")
                my_class = getattr(module, re.sub(r'\W+','', municipality))
                results = my_class.getMunicipalityInfo() #dict

                # if county not in links:
                #     links[county] = {}
                #     links[county][municipality] = results

                # print(municipality, 2)
# print(links['MiamiDade'])
                if county not in results_dict:
                    results_dict[county] = {}

                try:
                    results_dict[county][municipality] = results['website'][0]['url']
                except:
                    pass



# print(results_dict['Broward']['COOPER CITY'])

Broward_links = list(results_dict['Broward'].values())
MiamiDade_links = list(results_dict['MiamiDade'].values())
# print(MiamiDade_links)
all_links = MiamiDade_links + Broward_links
# print(all_links)

Indian Creek Info
Golden Beach Info
Miami Beach Info
Homestead Info
Bal Harbour Info
North Bay Village Info
Doral Info
West Miami Info
Key Biscayne Info
Miami Gardens Info
Bay Harbor Islands Info
Cutler Bay Info
Unincorporated MiamiDade Info
Biscayne Park Info
Palmetto Bay Info
Miami Lakes Info
Miami Springs Info
Medley Info
Hialeah Gardens Info
Pinecrest Info
El Portal Info
Sweetwater Info
Miami Shores Info
Virginia Gardens Info
South Miami Info
North Miami Info
Miami Info
Hialeah Info
Opa-locka Info
Florida City Info
Sunny Isles Beach Info
Coral Gables Info
Aventura Info
North Miami Beach Info
Surfside Info
Plantation Info
Tamarac Info
Dania Beach Info
West Park Info
Lauderdale By The Sea Info
Weston Info
Coconut Creek Info
Lauderdale Lakes Info
County Regional Facility Info
Fort Lauderdale Info
Sunrise Info
Lighthouse Point Info
BMSD Info
Tribal Land Info
Parkland Info
Davie Info
Lazy Lake Info
Sea Ranch Lakes Info
Oakland Park Info
Hollywood Info
Hillsboro Beach Info
Miramar Info
S

In [4]:
import os
import fitz  # PyMuPDF
import json

## Creating functions to load in all jsons. Similar to the section above but not the same.


def load_json(json_file):
    try:
        with open(json_file, 'r') as file:
            data = json.load(file)
        return data
    
    except FileNotFoundError:
        print(f"Error: File '{json_file}' not found.")
        return None
    
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from '{json_file}': {e}")
        return None


def process_jsons(folder_path):

    json_list = []

    for root, dirs, files in os.walk(folder_path):
        print(f"Checking directory: {root}")
        for filename in files:
            if filename.endswith('.json') and filename != 'base.json':
                folder_path = os.path.join(root, filename)
                absolute_json_path = os.path.abspath(folder_path)
                print(f"Processing {absolute_json_path}")
                
                # # Check if the file exists
                # if not os.path.exists(absolute_json_path):
                #     print(f"File not found: {absolute_json_path}")
                #     continue
                
                # if json is None:
                #     print(f"Failed to convert {absolute_json_path} to JSON.")
                #     continue

                json_file = load_json(absolute_json_path)


                json_list.append(str(json_file))
                
    return json_list
                


folder_path = '/Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children'


process_jsons(folder_path)


Checking directory: /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children
Checking directory: /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade
Checking directory: /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/__pycache__
Checking directory: /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/standard
Processing /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/standard/OwnersNotification.json
Processing /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/standard/NOC.json
Checking directory: /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/standard/affidavit
Processing /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/standard/affidavit/Sheathing.json
Processing /User

["{'page_1': 'MIAMI-DADE COUNTY\\nREQUIRED OWNERS NOTIFICATION FOR \\n ROOFING CONSIDERATIONS\\nIt is the responsibility of the roofing contractor to provide the owner with the required roofing permit, and to ex-\\nplain to the owner the content of this form. The owner’s initials in the designated space indicates that the item has \\nbeen explained.\\n❏ \\x07\\n1.\\t\\x07\\nAesthetics-workmanship: The workmanship provisions of Chapter 15 (High Velocity Hurricane Zone) are \\nfor the purpose of providing that the roofing system meets the wind resistance and water intrusion perfor-\\nmance standards. Aesthetics (appearance) are not a consideration with respect to workmanship provisions. \\nAesthetic issues such as color or architectural appearance, that are not part of a zoning code, should be \\naddressed as part of the agreement between the owner and the contractor.\\n\\t\\n\\t\\n❏ \\x07\\n2.\\t \\x07\\nRenailing wood decks: When replacing roofing, the existing wood roof deck may have 

In [5]:
# Creating and clearing the target_dict

target_dict = results_dict.copy()

for county, cities in results_dict.items():

    for city, url in cities.items():

        target_dict[county][city] = None


In [6]:
## Setting up the target_dict


for county, cities in results_dict.items():

        for city, url in cities.items():

            path = os.path.join(folder_path, county, 'children', city) # replace with relative path eventually

            list_of_jsons = process_jsons(path)

            # print(list_of_jsons)

            newdict = {}   # I'm running out of names lol

            for i in list_of_jsons:
                 
                 newdict['json'] = i

            for json_file in list_of_jsons:

                target_dict[county][city] = newdict

    # for city, url in cities.items():



        



Checking directory: /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/children/INDIAN CREEK VILLAGE
Processing /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/children/INDIAN CREEK VILLAGE/PermitApplication.json
Checking directory: /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/children/INDIAN CREEK VILLAGE/__pycache__
Checking directory: /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/children/GOLDEN BEACH
Processing /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/children/GOLDEN BEACH/HVHZ.json
Processing /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/children/GOLDEN BEACH/PermitApplication.json
Processing /Users/willandrews/Desktop/Coding/Python/Roofing_Project/storage/Florida/children/MiamiDade/children/GOLDEN BEACH/Owners

In [7]:
from bs4 import BeautifulSoup

import tldextract

import requests

from urllib.parse import urljoin

import pandas as pd

from joblib import Parallel, delayed

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity


new_dict = {}

    # for county, cities in results_dict.items():

for county, cities in results_dict.items():

    for city, url in cities.items():

    # for city, url in results_dict['Broward']['COOPER CITY']:

        # new_dict[county] = {}

        # target_dict = {county:{city:{'url1':target_1.json, 'url2':target_2.json, ... 'urln':target_n.json}}}

        url = results_dict[county][city]

        pdfs_nested = Parallel(n_jobs=-1, prefer="threads")(delayed(get_recursive_links)(s) for s in [url])

        pdfs_flat = [link for sublist in pdfs_nested for link in sublist]

        local_municipality_dict = {county:{city:{}}}

        # cosine_similarity_scores_dict = {county:{city:{}}}   # I think I will delete this

        for pdf_link in pdfs_flat:

            the_pdf = check_pdfs(pdf_link) # returns the json for the pdf, and if the link is not a pdf it returns none

            if the_pdf: # if that link is not None

                local_municipality_dict[county][city][pdf_link] = the_pdf

            else:

                pass

            # cosine_similarity_scores_dict[county][city][pdf_link] = None   # I think I will delete this

        for feature_link in local_municipality_dict[county][city]:

            for target_link in target_dict[county][city]:
                
                if local_municipality_dict[county][city][feature_link] == target_dict[county][city][target_link]:

                    local_municipality_dict[county][city].pop(feature_link)

                    target_dict[county][city].pop(target_link)

        if not bool(target_dict):

            cos_similarity_scores = {}
            
            for link_feature in local_municipality_dict[county][city]:

                for link_target in target_dict[county][city]:

                    compare_list = [local_municipality_dict[county][city][link_feature], target_dict[county][city][link_target]]

                    tfidf = TfidfVectorizer()

                    tfidf_matrix = tfidf.fit_transform(compare_list)

                    cosine_sim = cosine_similarity(tfidf_matrix)
                        
                    # cosine_similarity_scores_dict[county][city][link_feature] = local_municipality_dict[county][city][link_feature]   # I think I will delete this

                    cos_similarity_scores[feature_link] = cosine_sim

            max_key = max(cos_similarity_scores, key=cos_similarity_scores.get)

            if cos_similarity_scores[max_key] >= 0.95:

                # go into files and replace the old json with the new json

                # pop that file out of the feature dict and the target dict

                print('eureka!') # will replace this with the popping

            else:

                final_guess = local_municipality_dict[county][city][max_key] # this is what the model's best guess on what the replacement file should be if no easy fix could be made

                print(f'The file {max_key} has the highest similarity score of {cos_similarity_scores[max_key]}') # or perform whatever action needed






Depth 1...


TypeError: unhashable type: 'dict'