# Acquire

- https://www.latimes.com/california/story/2020-03-16/los-angeles-parking-ticket-street-sweeping-coronavirus-covid19
- https://www.latimes.com/california/story/2020-10-15/street-sweeping-parking-enforcement-resumes-today
- https://abc7.com/society/las-resumed-parking-enforcement-prompts-outcry/7079278/

- https://www.theeastsiderla.com/site/about_the_eastsider/
- https://xtown.la/2020/10/15/parking-ticket-los-angeles/

In [1]:
import numpy as np
import pandas as pd
import os
import spacy

from pdfminer.high_level import extract_text

In [2]:
# Quick look at the raw text pdfminer returns for a single PDF
# (first 500 characters only).
sample_pdf = 'city-documents/city-council/LADOT-transition-plan.pdf'
extract_text(sample_pdf)[:500]

'CITY OF LOS ANGELES\nINTER-DEPARTMENTAL MEMORANDUM\n\nDate:\n\nSeptember 17, 2020\n\nTo:\n\nHonorable City Council\nc/o City Clerk, Room 395, City Hall\nAttention: Honorable Mike Bonin, Chair, Transportation Committee\n\nFrom:\n\nSeleta J. Reynolds, General Manager ^ \nDepartment of Transportation\n\nSubject:\n\nTRANSITION PLAN TO RESUME PARKING ENFORCEMENT FOR PREVIOUSLY SUSPENDED \n\nPARKING INFRACTIONS AND PROPOSED ECONOMIC RELIEF MEASURES FOR PARKING \n\nFINES\n\nSUMMARY\n\nThe Los Angeles Department of Transportation '

In [3]:
# Build a list of dictionaries, one per PDF found anywhere under the
# city-documents folder, recording its file name and relative path.
# NOTE: entries are appended in whatever order os.walk yields them.
documents = []

for folder, _subfolders, filenames in os.walk("city-documents/"):
    for filename in filenames:
        # Skip anything that is not a PDF
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder, filename)
        # Show each path as it is discovered
        print(pdf_path)
        documents.append({'pdf_name': filename, 'path': pdf_path})

city-documents/city-council/LADOT-transition-plan.pdf
city-documents/city-council/public-outreach-period.pdf
city-documents/city-council/relief-report-motion.pdf
city-documents/city-council/relief-program-report-121720.pdf
city-documents/public-comments/public-comments-parking-enforcement.pdf
city-documents/LADOT/enforcement.pdf
city-documents/LADOT/citation-pay-program.pdf


In [4]:
# Turn the collected records into a table with one row per PDF
# (columns: pdf_name, path), then display it.
pdfs = pd.DataFrame(data=documents)
pdfs

Unnamed: 0,pdf_name,path
0,LADOT-transition-plan.pdf,city-documents/city-council/LADOT-transition-p...
1,public-outreach-period.pdf,city-documents/city-council/public-outreach-pe...
2,relief-report-motion.pdf,city-documents/city-council/relief-report-moti...
3,relief-program-report-121720.pdf,city-documents/city-council/relief-program-rep...
4,public-comments-parking-enforcement.pdf,city-documents/public-comments/public-comments...
5,enforcement.pdf,city-documents/LADOT/enforcement.pdf
6,citation-pay-program.pdf,city-documents/LADOT/citation-pay-program.pdf


In [5]:
# Collect the relative path of every PDF as a plain Python list.
# The `path` column is read directly instead of walking the frame row by row
# with iterrows(), which builds a Series per row just to read one value.
# An empty frame (no PDFs found) has no `path` column, so fall back to [].
pdfs_to_scrape = pdfs['path'].tolist() if 'path' in pdfs else []

# Echo each path, one per line
for pdf_path in pdfs_to_scrape:
    print(pdf_path)

city-documents/city-council/LADOT-transition-plan.pdf
city-documents/city-council/public-outreach-period.pdf
city-documents/city-council/relief-report-motion.pdf
city-documents/city-council/relief-program-report-121720.pdf
city-documents/public-comments/public-comments-parking-enforcement.pdf
city-documents/LADOT/enforcement.pdf
city-documents/LADOT/citation-pay-program.pdf


In [6]:
# Display all collected file paths as a single list, in the order they
# will be handed to the text extractor.
pdfs_to_scrape

['city-documents/city-council/LADOT-transition-plan.pdf',
 'city-documents/city-council/public-outreach-period.pdf',
 'city-documents/city-council/relief-report-motion.pdf',
 'city-documents/city-council/relief-program-report-121720.pdf',
 'city-documents/public-comments/public-comments-parking-enforcement.pdf',
 'city-documents/LADOT/enforcement.pdf',
 'city-documents/LADOT/citation-pay-program.pdf']

In [7]:
# Extract the full text of every PDF. `text` ends up as a list with one
# entry per path in `pdfs_to_scrape`, in the same order.
text = [extract_text(pdf_path) for pdf_path in pdfs_to_scrape]

In [8]:
# Display the number of documents stored in the variable `text`.
# One entry is added per path, so this should equal len(pdfs_to_scrape).
len(text)

7

In [9]:
# Peek at the opening 200 characters of the first extracted document
first_document = text[0]
print(first_document[:200])

CITY OF LOS ANGELES
INTER-DEPARTMENTAL MEMORANDUM

Date:

September 17, 2020

To:

Honorable City Council
c/o City Clerk, Room 395, City Hall
Attention: Honorable Mike Bonin, Chair, Transportation Com


In [10]:
# Display the first 200 characters of each document in the variable `text`,
# under a numbered heading (numbering starts at 1).
for i, document in enumerate(text):
    print(f"\nDOCUMENT #{i+1}")
    print("-----------------")
    print(document[:200].strip())


DOCUMENT #1
-----------------
CITY OF LOS ANGELES
INTER-DEPARTMENTAL MEMORANDUM

Date:

September 17, 2020

To:

Honorable City Council
c/o City Clerk, Room 395, City Hall
Attention: Honorable Mike Bonin, Chair, Transportation Com

DOCUMENT #2
-----------------
MOTION

3 0 A

I MOVE that the report from the Transportation Committee, Item #30 on today’s Council 
agenda (C.F. 20-0147-S7), relative to the transition plan to resume parking enforcement for 
previ

DOCUMENT #3
-----------------
TRANSPORTATION

MOTION

On October 15th, the Los Angeles Department of Transportation (LADOT) resumed parking 

enforcement for most violations that had been suspended during the initial months of the

DOCUMENT #4
-----------------
File No. 20-1365

TRANSPORTATION  COMMITTEE  REPORT  relative  to  expanding  and  broadening  the
economic hardship relief programs for persons who have received parking fines.
 
Recommendation for C

DOCUMENT #5
-----------------
Communication from Public

 
 
Name:
Date

In [19]:
# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

# The text we want to examine is the first extracted document.
# Parse it with spaCy. This runs the entire pipeline.
doc = nlp(text[0])

# 'doc' now contains a parsed version of the text.
# Print every named entity that was detected, next to its label.
for entity in doc.ents:
    # The PDF-extracted text contains hard line breaks, and an entity span can
    # include them. Collapse every run of whitespace to a single space so each
    # entity stays on one line and the label column lines up.
    entity_text = " ".join(entity.text.split())
    print(f"{entity_text:<25} ({entity.label_})")

CITY OF                   (ORG)
LOS ANGELES               (GPE)
September 17, 2020        (DATE)
City Council              (ORG)
395                       (CARDINAL)
City Hall
                (FAC)
Mike Bonin                (PERSON)
Transportation Committee

 (ORG)
Seleta J. Reynolds        (PERSON)
Department of Transportation

 (ORG)
The Los Angeles Department of Transportation (ORG)
LADOT                     (ORG)
CF 20-0147-S7             (ORG)
the City Council          (ORG)
1                         (CARDINAL)
LADOT                     (ORG)
October 1, 2020           (DATE)
overnight                 (TIME)
2                         (CARDINAL)
LADOT                     (ORG)
October 15, 2020          (DATE)
3                         (CARDINAL)
LADOT                     (ORG)
October 22, 2020          (DATE)
4                         (CARDINAL)
LADOT                     (ORG)
January 1, 2021           (DATE)
March 4, 2020             (DATE)
the City Council          (ORG)
the Decla

In [12]:
import textacy.extract

In [20]:
# Reuse the `nlp` pipeline that is already loaded in this notebook; loading
# the large model a second time is slow and yields the same pipeline.

# Parse the document with spaCy
doc = nlp(text[0])

# Extract semi-structured statements about this entity
entity_name = "public"
statements = textacy.extract.semistructured_statements(doc, entity_name)

# Print the results. The heading names the entity actually queried
# (it previously said "London", a leftover from a copied example).
print(f"Here are the things I know about {entity_name}:")

# Each statement is a (subject, verb, fact) triple; only the fact is shown.
for _subject, _verb, fact in statements:
    print(f" - {fact}")

Here are the things I know about London:


In [21]:
from spacy import displacy

# NOTE: despite its name, `sentence` holds the entire first extracted
# document, not a single sentence.
sentence = text[0]
# Run the spaCy pipeline (`nlp`, loaded in an earlier cell) over that text
sentence_nlp = nlp(sentence)

# visualize named entities (style='ent'); jupyter=True asks displaCy to
# render for notebook output
displacy.render(sentence_nlp, style='ent', jupyter=True)