In [14]:
import pandas as pd
import numpy as np
import feedparser
from bs4 import BeautifulSoup
import requests
import PyPDF2 as pdf
import io
import os
import re
from gsheet import create
from gsheet import update
import weblist
import scrape_pdf

## Step 1: Scrape the list of webpages

In [15]:
galveston_url = "https://www.swg.usace.army.mil/Media/Public-Notices/"
web_list = weblist.get_web_list(galveston_url)
web_list

Unnamed: 0,web_link,web_title,published_date,web_expire_date,pdf_url
0,http://www.swg.usace.army.mil/Media/Public-Not...,SWG-1992-02709 - Cedar Marine Terminals (subsi...,9/5/2023,10/5/2023,
1,http://www.swg.usace.army.mil/Media/Public-Not...,SWG-2003-02165 - TEXAS DEPARTMENT OF TRANSPORT...,9/5/2023,10/5/2023,
2,http://www.swg.usace.army.mil/Media/Public-Not...,"SWG-2021-00499 - Venture Global CP2 LNG, LLC -...",8/24/2023,8/25/2023,
3,http://www.swg.usace.army.mil/Media/Public-Not...,"SWG-1993-00525 - J.W. Kelso Company, Inc. - Ga...",8/17/2023,9/18/2023,
4,http://www.swg.usace.army.mil/Media/Public-Not...,SWG-2008-00319 - Kinder Morgan Liquids Termina...,8/15/2023,9/15/2023,
...,...,...,...,...,...
1086,http://www.swg.usace.army.mil/Media/Public-Not...,SWG-2011-01123: TGS Development: Barge Docking...,5/2/2012,,
1087,http://www.swg.usace.army.mil/Media/Public-Not...,"SWG-2011-00740: Fairport North Industrial, LLC...",5/2/2012,,
1088,http://www.swg.usace.army.mil/Media/Public-Not...,Public Scoping Meeting and Preparation of EIS ...,3/14/2012,,
1089,http://www.swg.usace.army.mil/Media/Public-Not...,Public Notice For Federal Register Notice Anno...,2/21/2012,,


## Step2: Scrap webpage for pdf urls

In [20]:
def galveston_pdf_url(web_url):
    try:
        content = requests.get(web_url).text
        soup = BeautifulSoup(content, 'html.parser')
        body = soup.find("div", {"itemprop":"articleBody"})
        pdf_end = body.a.get("href").replace(" ", "%20")
        pdf_url = "https://www.swg.usace.army.mil" + pdf_end
    except Exception as e:
        pdf_url = "ERROR: " + str(e)
    finally:
        return(pdf_url)

web_list["pdf_url"] = [galveston_pdf_url(x) for x in web_list["web_link"]]

web_list

Unnamed: 0,web_link,web_title,published_date,web_expire_date,pdf_url
0,http://www.swg.usace.army.mil/Media/Public-Not...,SWG-1992-02709 - Cedar Marine Terminals (subsi...,9/5/2023,10/5/2023,https://www.swg.usace.army.mil/Portals/26/docs...
1,http://www.swg.usace.army.mil/Media/Public-Not...,SWG-2003-02165 - TEXAS DEPARTMENT OF TRANSPORT...,9/5/2023,10/5/2023,https://www.swg.usace.army.mil/Portals/26/docs...
2,http://www.swg.usace.army.mil/Media/Public-Not...,"SWG-2021-00499 - Venture Global CP2 LNG, LLC -...",8/24/2023,8/25/2023,https://www.swg.usace.army.mil/Portals/26/docs...
3,http://www.swg.usace.army.mil/Media/Public-Not...,"SWG-1993-00525 - J.W. Kelso Company, Inc. - Ga...",8/17/2023,9/18/2023,https://www.swg.usace.army.mil/Portals/26/docs...
4,http://www.swg.usace.army.mil/Media/Public-Not...,SWG-2008-00319 - Kinder Morgan Liquids Termina...,8/15/2023,9/15/2023,https://www.swg.usace.army.mil/Portals/26/docs...
...,...,...,...,...,...
1086,http://www.swg.usace.army.mil/Media/Public-Not...,SWG-2011-01123: TGS Development: Barge Docking...,5/2/2012,,https://www.swg.usace.army.mil/Portals/26/docs...
1087,http://www.swg.usace.army.mil/Media/Public-Not...,"SWG-2011-00740: Fairport North Industrial, LLC...",5/2/2012,,https://www.swg.usace.army.milhttp://ww3.swg.u...
1088,http://www.swg.usace.army.mil/Media/Public-Not...,Public Scoping Meeting and Preparation of EIS ...,3/14/2012,,https://www.swg.usace.army.milhttp://ww3.swg.u...
1089,http://www.swg.usace.army.mil/Media/Public-Not...,Public Notice For Federal Register Notice Anno...,2/21/2012,,https://www.swg.usace.army.milhttp://www.swg.u...


In [21]:
web_list.to_csv("galveston_weblist.csv")

## Step3: Scrap PDFs

In [25]:
district_dic = {"MVN": "New Orleans District",
               "SWG": "Galveston District",
               "SAM": "Mobile District",
               "SAJ": "Jacksonville District"}

# Seperate pdf texts into big chuncks:

## Permit application # + district code + district Name
def get_pdf_app_num_dist(pdf_text):
    try:
        permit_application_number = re.search(r'(?<=Application).*(?=PUBLIC)', pdf_text).group().replace("#", "").replace(":", "").strip()
        district_code = permit_application_number[0:3]
        district_name = district_dic[district_code]
    except Exception as e:
        permit_application_number = "ERROR: " + str(e)
        district_code = "ERROR: cannot get permit application number"
        district_name = "ERROR: cannot get permit application number"
    finally:
        return permit_application_number, district_code, district_name

## Manager name + phone + email
def get_pdf_manager(pdf_text):
    try:
        manager_name = re.search(r'(?<=Project Manager)[\:a-zA-Z\s\.]*', pdf_text).group().replace(":", "").strip()
    except Exception as e:
        manager_name = "ERROR: " + str(e)
    try:
        manager_phone = re.search(r'\(\d{3}\)\s{1,3}\d{3}\s?-?\s?\d{4}', pdf_text).group().strip()
    except Exception as e:
        manager_phone = "ERROR: " + str(e)
    try:
        manager_email = re.search(r'[\w\.-]+@us\s?a\s?c\s?e\s?\.army\.m\s?i\s?l', pdf_text).group().strip()
    except Exception as e:
        manager_email = "ERROR: " + str(e)
    return manager_name, manager_phone, manager_email

    
## Location of work
def get_pdf_location(pdf_text):
    if pdf_text.find("LOCATION OF WORK") != -1:
        try:
            pdf_location = re.search(r'((?<=LOCATION OF WORK).*(?=CHARACTER OF WORK))', pdf_text).group().replace(":", "").strip()
        except Exception as e:
            pdf_location = "ERROR: " + str(e)
    else:
        pdf_location = None
    return pdf_location

## Character of work 
def get_pdf_character(pdf_text):
    if pdf_text.find("CHARACTER OF WORK") != -1:
        try:
            pdf_character = re.search(r'((?<=CHARACTER OF WORK).*(?=(MITIGATION|The comment period)))', pdf_text).group().replace(":", "").strip()
        except Exception as e:
            pdf_character = "ERROR: " + str(e)
    else:
        pdf_character = None
    return pdf_character

# Extract fields from paragraphs

## From location of work
def get_pdf_county_parish(pdf_text):
    if pdf_text.find("County") != -1:
        try:
            county = re.search(r'((?<=, ).*County)', pdf_text).group().strip()
        except Exception as e:
            county = "ERROR: " + str(e)
    else:
        county = None
    if pdf_text.find("Parish") != -1:
        try:
            parish = re.search(r'((?<=in ).{1,100}(?= Parish))', pdf_text).group().strip()
        except Exception as e:
            parish = "ERROR: " + str(e)
    else:
        parish = None
    return county, parish


def get_pdf_hydrologic(pdf_text):
    if pdf_text.find(r'Hydrologic Unit Code') != -1:
        try:
            hydrologic_unit_code = re.search(r'(?<=Hydrologic Unit Code(:|\s))[\s|\d]*', pdf_text).group().strip()
        except Exception as e:
            hydrologic_unit_code = "ERROR: " + str(e)
    else:
        hydrologic_unit_code = None
    return hydrologic_unit_code

## From character of work
def get_pdf_acreage(pdf_text):
    if all(w in pdf_text for w in ["acre", "impact"]):
        try:
            acreage = re.findall(r'(\d*\.\d*-?\s?(?=acres of))', pdf_text)
            acreage = [a.strip() for a in acreage]
            # wetland_type = re.search(r'(?<=acres of).+? wetlands', pdf_text).group().strip()
        except Exception as e:
            acreage = "ERROR: " + str(e)
            # wetland_type = "ERROR: " + str(e)
    else:
        acreage = None
        # wetland_type = None
    return acreage


def get_wqc(pdf_text):
    if pdf_text.find("WQC") != -1:
        try:
            wqc = re.search(r'(?<=WQC)[\d\s\:]*-[\s\d]*', pdf_text).group().strip().replace(" ", "")
        except Exception as e:
            wqc = "ERROR: " + str(e)
    else:
        wqc = None
    return wqc


def get_coastal_use_permit(pdf_text):
    if pdf_text.find("Natural Resource’s Coastal Resources Program") != -1:
        try:
            coastal_use_permit_list = re.findall(r'P\d{8}', pdf_text)
            coastal_use_permit = ", ".join(coastal_use_permit_list)
        except Exception as e:
            coastal_use_permit = "ERROR: " + str(e)
    else:
        coastal_use_permit = None
    return coastal_use_permit


def pdf_read(pdf_url):
    try:
        # Download the PDF content as a bytes object
        pdf_bytes = requests.get(pdf_url).content

        # Create a PyPDF2 PdfFileReader object from the PDF content
        pdf_reader = pdf.PdfReader(io.BytesIO(pdf_bytes))

        # Extract text from all pages in the PDF file
        pdf_text_list = [pdf_reader.pages[page_num].extract_text() for page_num in range(len(pdf_reader.pages))]
        pdf_text = "".join(pdf_text_list).replace("\n", "")
        
    except Exception as e:
        pdf_text = "ERROR: " + str(e)
    finally:
        return pdf_text

# Read in PDF as texts and extract fields
def pdf_extraction(pdf_url):
    
    pdf_text = pdf_read(pdf_url)
    
    # standardized public notice
    if  pdf_text.find("ERROR") == -1:

        pdf_app_num_dist = get_pdf_app_num_dist(pdf_text)

        pdf_manager = get_pdf_manager(pdf_text)

        pdf_location = get_pdf_location(pdf_text)

        if pdf_location is None:
            county = parish = hydrologic_unit_code = None
        elif "ERROR" in pdf_location:
            county = parish = hydrologic_unit_code = "ERROR: cannot extract location of work"
        else:
            county = get_pdf_county_parish(pdf_location)[0]
            parish = get_pdf_county_parish(pdf_location)[1]
            hydrologic_unit_code = get_pdf_hydrologic(pdf_location)

        pdf_character = get_pdf_character(pdf_text)

        if pdf_character is None:
            acreage = None
        elif "ERROR" in pdf_character:
            acreage = "ERROR: cannot extract character of work"
        else:
            acreage = get_pdf_acreage(pdf_character)
    
    # Special public notice
    else:
        pdf_app_num_dist = ["ERROR: fail to read pdf " + pdf_text]*3
        pdf_manager = ["ERROR: fail to read pdf " + pdf_text]*3
        pdf_location = pdf_character = county = parish = hydrologic_unit_code = acreage = "ERROR: fail to read pdf " + pdf_text
        
    return [pdf_app_num_dist[0], pdf_app_num_dist[1], pdf_app_num_dist[2], pdf_manager[0], pdf_manager[1], \
            pdf_manager[2], pdf_location, pdf_character, county, parish, hydrologic_unit_code, acreage]


In [23]:
# test

pdf_url = "https://www.mvn.usace.army.mil/Portals/56/docs/regulatory/publicnotices/2020-00168-WRR%20PNALL.pdf?ver=TKjkvnv5Zg4fAplngIrBTQ%3d%3d"
pdf_text = pdf_read(pdf_url)

pdf_text

'JOINT PUBLIC NOTICE   December 21, 2020   United States Army  State of Louisiana Corps of Engineers  Department of Environmental Quality  New Orleans District  ATTN: Water Quality Certifications  Regulatory Branch, ODR -W 7400 Leake Avenue                                                                             Post Office Box 4313  New Orleans, Louisiana  70118  Baton Rouge, Louisiana  70821- 4313     (504) 862- 1445  (225) 219- 3225  Pierre.P.Castaing@usace.army.mil   Project Manager  Project Manager  Pierre  Castaing    Elizabeth Hill Permit Application Number  WQC Application Number  MVN -2020- 00168 -WRR                                                                 WQC 201 210-01    Interested parties are hereby notified that a permit application has been received by the New Orleans District of the U.S. Army Corps of Engineers pursuant to: [ X] Section 10 of the Rivers and Harbors Act of March 3, 1899 (30 Stat. 1151; 33 USC 403); and/or [X] Section 404 of the Clean Water Act

In [26]:
get_wqc(pdf_text)

'201210-01'

In [3]:
# Loop
error_collect = pd.DataFrame(columns = ["error_title", "error_link", "error_message"])

# %%time
for row in rss_feed.index:
    try:
        main_url = rss_feed.iloc[row, 1]
        req = requests.get(main_url)
        content = req.text # json structure
        soup = BeautifulSoup(content)
        
        # Get the pdf links
        
        def pdf_url():
            try:
                pdf_end = soup.findAll('a', {"class": "link"})[2]['href']
                pdf_url = "https://www.mvn.usace.army.mil" + pdf_end
            except Exception as pdf_error:
                print("ERROR when getting pdf links", pdf_error)
                
        # Check if the public notice is "Special Public Notice" or "Joint Public Notice"

        public_notice = soup.find_all("a", {"class":"link"})[2].get_text()

        if public_notice != "Public Notice and Drawings":
            expire_date = None
            applicant = None
            contractor = None
            location = None
            work_character = None
            mitigation = None
        else: 

            # Get expiration date

            expire_date = re.search(r'(?<=:\s).+', soup.find_all("div", "expire")[0].get_text()).group()

            # Extract webpage body: two types of structures

            body = soup.find_all("div", {"itemprop":"articleBody"})[0]
            if body.find("p") is None:
                body_text = body.get_text()  
            else:
                body_text = body.get_text().replace(u'\xa0', u'').replace("\n", "")

            applicant_contents = re.search(r'(?<=APPLICANT:)\s*.+(?=LOCATION)', body_text).group().strip()
            location = re.search(r'(?<=LOCATION OF WORK:)\s*.+(?=CHARACTER OF WORK)', body_text).group().strip()

            if "MITIGATION" in body_text:
                work_character = re.search(r'(?<=CHARACTER OF WORK:)\s*.+(?=MITIGATION)', body_text).group().strip()
                mitigation = re.search(r'(?<=MITIGATION:)\s*.+', body_text).group().strip()
            else:
                work_character = re.search(r'(?<=CHARACTER OF WORK:)\s*.+', body_text).group().strip()
                mitigation = None

            # Get applicant name and permitting process contractor

            if applicant_contents.find("c/o") != -1:
                applicant = re.search(r'.+?(?=\,* c/o)', applicant_contents).group().strip()
                # if any(w in applicant_contents for w in ["Attention", "Attn"]):
                contractor = re.search(r'(?<=c/o( |:)).+?(?=(, Post|, PO|, P\.O\.|, \d|,? [Aa][tT]))', applicant_contents).group().strip()
                # else:
                #     contractor = re.search(r'(?<=c/o( |:))\D+(?=(, |, P.O.))', applicant_contents).group()
            else:
                contractor = None
                # if any(w in applicant_contents for w in ["Attention", "Attn"]):
                applicant = re.search(r'.+?(?=(, Post|, PO|, P\.O\.|, \d|,? [Aa][tT]))', applicant_contents).group().strip()
                # else:
                #     applicant = re.search(r'\D+(?=, )', applicant_contents).group()

            # Get location
            
            
            #lon = re.findall(r'(?<=[-W])\s*\d{2}\.\d+', location)
            #lat = re.findall(r"(?<=[^-W][^-\d])\d{2}\.\d+", location)
            
        rss_feed.loc[row, empty_columns] = np.array([pdf_url, expire_date, applicant_contents, applicant, contractor, location, lon, lat, work_character, mitigation], dtype = "object")
    
    except Exception as error_message:
        error_row = [rss_feed.iloc[row, 0], rss_feed.iloc[row, 1], repr(error_message)]
        error_collect.loc[len(error_collect.index)] = error_row
        # print("ERROR: ", error_link, error)

rss_feed

Unnamed: 0,title,link,desc,date,pdf,expiration_date,applicant_full_info,applicant,contractor,location,lon,lat,work_character,mitigation
0,MVN-1999-02360-EPP,https://www.mvn.usace.army.mil/Missions/Regula...,TIME EXTENSION FOR THE ENGLISH TURN DEVELOPMEN...,"Mon, 14 Aug 2023 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,9/4/2023,"English Turn Limited Partnership, Attention: J...",English Turn Limited Partnership,,"At approximately Latitude: 29.90826, Longitude...",[89.952013],[29.90826],Continual operations to install and maintain f...,
1,MVN-2020-00234-WPP,https://www.mvn.usace.army.mil/Missions/Regula...,MODIFICATION FOR MAINTENANCE DREDGING OF NAVIG...,"Mon, 14 Aug 2023 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,9/4/2023,"Port of Iberia, c/o: GIS Engineering, LLC, 450...",Port of Iberia,"GIS Engineering, LLC","Located in Iberia Parish, Section 34, Township...","[91.8412, 92.3127306]","[29.958239, 29.5109862]",A Department of Army permit was issued on Apri...,
2,MVN-2021-00560-WQQ,https://www.mvn.usace.army.mil/Missions/Regula...,PROPOSED CP2 LIQUEFIED NATURAL GAS FACILITY AN...,"Mon, 14 Aug 2023 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,9/4/2023,"Venture Global CP2 LNG, LLC and Venture Global...","Venture Global CP2 LNG, LLC and Venture Global...",Environmental Resources Management,The proposed facility and pipeline are located...,[],"[85.4, 58.1]",To construct and operate (1) natural gas lique...,
3,MVN-2023-00058-WII,https://www.mvn.usace.army.mil/Missions/Regula...,TEST WELLS FOR CARBON DIOXIDE STORAGE IN VERMI...,"Mon, 14 Aug 2023 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,9/4/2023,ExxonMobil Low Carbon Solutions Onshore Storag...,ExxonMobil Low Carbon Solutions Onshore Storag...,Duplantis Design Group,Test Well #1 @ Lat. 29-46-57.31 N / Long. -92-...,"[57.31, 56.39, 44.82, 37.40]",[],Proposed geological investigations via install...,
4,MVN-2008-01186-EPP,https://www.mvn.usace.army.mil/Missions/Regula...,AFTER-THE-FACT WEIGH STATION AND TRUCK LOADING...,"Mon, 07 Aug 2023 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,8/28/2023,"KV Enterprises, LLC, c/o Professional Engineer...","KV Enterprises, LLC",Professional Engineering & Environmental Consu...,"At approximately Latitude: 30.00584, Longitude...",[90.51464],[30.00584],"Clear, grade, excavate, and fill for a weigh s...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,MVN-2020-00422-WS,https://www.mvn.usace.army.mil/Missions/Regula...,"BULKHEAD REPLACEMENT IN TERREBONNE PARISH, LOU...","Mon, 04 May 2020 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,5/24/2020,"Savage Inland Marine, LLC, c/o: Mr. David Boud...","Savage Inland Marine, LLC",Mr. David Boudreaux,"Located in Terrebonne Parish, Section 14, Town...",[91.018125],[29.665289],The applicant proposes to install approximatel...,
496,MVN-2019-00377-WI,https://www.mvn.usace.army.mil/Missions/Regula...,CONSTRUCT ONE BRINE PIPELINE AND ONE ETHYLENE ...,"Mon, 04 May 2020 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,5/23/2020,"Boardwalk Louisiana Midstream, LLC, c/o Provid...","Boardwalk Louisiana Midstream, LLC",Providence Engineering and Environmental Group...,"In Iberville Parish, North 30.316461, West -91...",[91.304178],[30.316461],"Clear, grade, trench and temporarily backfill ...",
497,MVN 2006-01716-CM,https://www.mvn.usace.army.mil/Missions/Regula...,PROPOSED CONSTRUCTION OF A RESIDNETIAL SUBDIVI...,"Mon, 04 May 2020 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,5/24/2020,"J. Breaux Enterprises, c/o Hydrik Wetlands Con...",J. Breaux Enterprises,Hydrik Wetlands Consultants,"Between West Colyell Creek and Beaver Branch, ...",[90.881303],[30.57454],The applicant has requested Department of the ...,
498,MVN-2020-00393-CE,https://www.mvn.usace.army.mil/Missions/Regula...,MODIFICATION OF AN EXISTING BARGE FLEETING OPE...,"Mon, 27 Apr 2020 04:00:00 GMT",https://www.mvn.usace.army.mil/Portals/56/docs...,5/26/2020,"Cooper Consolidated, LLC., c/o Lanier & Associ...","Cooper Consolidated, LLC.",Lanier & Associates Consulting Engineers Inc.,"In the Mississippi River, right descending ban...",[91.026944],[30.129722],The applicant has requested Department of the ...,


In [3]:
web_list_sub = web_list.loc[:99]

In [5]:
pdf_df = pd.DataFrame([scrape_pdf.pdf_extraction(x) for x in web_list_sub["pdf_url"]],
                            columns = ['pdf_permit_app_num',
                                       'pdf_dist_code',
                                       'pdf_dist_name',  
                                       'pdf_manager_name', 
                                       'pdf_manager_phone', 
                                       'pdf_manager_email',
                                       'pdf_applicant_contents',
                                       'pdf_applicant',
                                       'pdf_contract',
                                       'pdf_location', 
                                       'pdf_character', 
                                       'pdf_county',
                                       'pdf_parish',
                                       'pdf_hydrologic_unit_code', 
                                       'pdf_longitude',
                                       'pdf_latitude',
                                       'pdf_acreage',
                                       'pdf_wqc',
                                       'pdf_cup'])

# # Merge with RSS feed table
final_df = web_list_sub.join(pdf_df)

# Add a new column to track the number of errors for each webpage
final_df["error"] = final_df.apply(lambda row: row.str.contains("ERROR").sum(), axis = 1)

final_df

Unnamed: 0,web_link,web_title,published_date,web_expire_date,pdf_url,pdf_permit_app_num,pdf_dist_code,pdf_dist_name,pdf_manager_name,pdf_manager_phone,...,pdf_character,pdf_county,pdf_parish,pdf_hydrologic_unit_code,pdf_longitude,pdf_latitude,pdf_acreage,pdf_wqc,pdf_cup,error
0,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2023-00455-AMR PROPOSED DISCHARGE OF FILL...,9/6/2023,10/6/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,ERROR: 'NoneType' object has no attribute 'group',(251) 455-6785,...,,,,,,,,,,4
1,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2023-00077-MJF PROPOSED WETLAND FILL FOR ...,8/30/2023,9/29/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,ERROR: 'NoneType' object has no attribute 'group',ERROR: 'NoneType' object has no attribute 'group',...,,,,,,,,,,5
2,http://www.sam.usace.army.mil/Missions/Regulat...,"SAM-2022-00542-CDJ PROPOSED NEW DREDGING, SHO...",8/30/2023,9/29/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,ERROR: 'NoneType' object has no attribute 'group',(251) 690-2349,...,,,,,,,,,,4
3,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2022-01041-JEB PROPOSED SHORELINE DEVELOP...,8/25/2023,9/25/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,Date,(251) 386-4037,...,,,,,,,,,,3
4,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2023-00405-STB PROPOSED RESIDENTIAL DEVEL...,8/24/2023,9/25/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,Date,(251) 386-4037,...,,,,,,,,,,3
5,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2023-00415-CMS PROPOSED DISCHARGE OF FILL...,8/24/2023,9/25/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,ERROR: 'NoneType' object has no attribute 'group',(205) 381-8108,...,,,,,,,,,,4
6,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2023-00464-MJF PROPOSED WETLAND FILL FOR ...,8/22/2023,9/21/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,ERROR: 'NoneType' object has no attribute 'group',ERROR: 'NoneType' object has no attribute 'group',...,,,,,,,,,,5
7,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2017-00487-LET REQUEST TO MODIFY PERMIT A...,8/14/2023,9/13/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,ERROR: 'NoneType' object has no attribute 'group',(251) 508-4266,...,,,,,,,,,,4
8,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2023-00362-MJF PROPOSED WETLAND FILL FOR ...,8/3/2023,9/4/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,ERROR: 'NoneType' object has no attribute 'group',ERROR: 'NoneType' object has no attribute 'group',...,,,,,,,,,,5
9,http://www.sam.usace.army.mil/Missions/Regulat...,SAM-2023-00617-AMR PROPOSED DISCHARGE OF FILL...,8/2/2023,9/1/2023,https://www.sam.usace.army.mil//Portals/46/doc...,ERROR: 'NoneType' object has no attribute 'group',ERROR: cannot get permit application number,ERROR: cannot get permit application number,ERROR: 'NoneType' object has no attribute 'group',(251) 455-6785,...,,,,,,,,,,4


In [7]:
final_df.to_csv("galveston_subset.csv")