# Acquire

- https://www.latimes.com/california/story/2020-03-16/los-angeles-parking-ticket-street-sweeping-coronavirus-covid19
- https://www.latimes.com/california/story/2020-10-15/street-sweeping-parking-enforcement-resumes-today
- https://abc7.com/society/las-resumed-parking-enforcement-prompts-outcry/7079278/

- https://www.theeastsiderla.com/site/about_the_eastsider/
- https://xtown.la/2020/10/15/parking-ticket-los-angeles/

In [1]:
import numpy as np
import pandas as pd
import os
import spacy

from pdfminer.high_level import extract_text
from transformers import pipeline

In [2]:
# Load in the text from a pdf using pdfminer
extract_text('city-documents/city-council/LADOT-transition-plan.pdf')[:500]

'CITY OF LOS ANGELES\nINTER-DEPARTMENTAL MEMORANDUM\n\nDate:\n\nSeptember 17, 2020\n\nTo:\n\nHonorable City Council\nc/o City Clerk, Room 395, City Hall\nAttention: Honorable Mike Bonin, Chair, Transportation Committee\n\nFrom:\n\nSeleta J. Reynolds, General Manager ^ \nDepartment of Transportation\n\nSubject:\n\nTRANSITION PLAN TO RESUME PARKING ENFORCEMENT FOR PREVIOUSLY SUSPENDED \n\nPARKING INFRACTIONS AND PROPOSED ECONOMIC RELIEF MEASURES FOR PARKING \n\nFINES\n\nSUMMARY\n\nThe Los Angeles Department of Transportation '

In [3]:
# Create an empty dictionary to scrape text from all the pdfs
# stored in the city-documents folder.
documents = []

for root, dirs, files in os.walk("city-documents/"):
    for file in files:
        # If the ends with .pdf, display the path
        if file.endswith(".pdf"):
            print(os.path.join(root, file))
            # Add the filename and relative path as a dictionary to documents
            documents.append({'pdf_name': file,
                              'path': os.path.join(root, file)})

city-documents/city-council/LADOT-transition-plan.pdf
city-documents/city-council/public-outreach-period.pdf
city-documents/city-council/relief-report-motion.pdf
city-documents/city-council/relief-program-report-121720.pdf
city-documents/public-comments/public-comments-parking-enforcement.pdf
city-documents/LADOT/enforcement.pdf
city-documents/LADOT/citation-pay-program.pdf


In [96]:
# Convert the list of dictionaries into a dataframe
pdfs = pd.DataFrame(documents)
pdfs

Unnamed: 0,pdf_name,path
0,LADOT-transition-plan.pdf,city-documents/city-council/LADOT-transition-p...
1,public-outreach-period.pdf,city-documents/city-council/public-outreach-pe...
2,relief-report-motion.pdf,city-documents/city-council/relief-report-moti...
3,relief-program-report-121720.pdf,city-documents/city-council/relief-program-rep...
4,public-comments-parking-enforcement.pdf,city-documents/public-comments/public-comments...
5,enforcement.pdf,city-documents/LADOT/enforcement.pdf
6,citation-pay-program.pdf,city-documents/LADOT/citation-pay-program.pdf


In [97]:
# Create an empty list to store text extracted from each pdf.
text = []

# Scrape the text from each pdf and store the result in text
for index, file in pdfs.iterrows():
    text.append(extract_text(file['pat']))

TypeError: Unsupported input type: <class 'pandas.core.series.Series'>

In [8]:
# Display the number of documents stored in the variable `text`.
len(text)

7

# Prepare

In [72]:
# Display the first 20 characters of each docuemnt in the variable `text`.
for i in range(0, len(text)):
    print(f"\nDOCUMENT #{i+1}")
    print("-----------------")
    print(text[i][:20].strip())


DOCUMENT #1
-----------------
CITY OF LOS ANGELES

DOCUMENT #2
-----------------
MOTION

3 0 A

I MOV

DOCUMENT #3
-----------------
TRANSPORTATION

MOTI

DOCUMENT #4
-----------------
File No. 20-1365

TR

DOCUMENT #5
-----------------
Communication from P

DOCUMENT #6
-----------------
FOR IMMEDI

DOCUMENT #7
-----------------
FOR IMMEDI


In [52]:
# text[4] contains the pdf named 'city-documents/public-comments/public-comments-parking-enforcement.pdf'
# It is a collection of citizens comments on parking reinforcement during town hall meetings.
public_comments = text[4].replace("\xa0", ' ').replace('\n', ' ').strip()

# Explore

In [53]:
# Instantiate a model to analyze sentiment of public comments.
classifier = pipeline('sentiment-analysis')

# Pass the public comments into spacy's nlp model
doc = nlp(public_comments)

filtered_words = [token for token in doc if not token.is_stop]

In [76]:
df_words_sentiment = pd.DataFrame()

for word in filtered_words:
    # For each filtered token, analyze the sentiment
    if word.is_alpha:
        sentiment_data = classifier(word.text)[0]

        # Extract the sentiment and score from the dict
        sentiment = sentiment_data['label']
        score = sentiment_data['score']

        # Append the word, sentiment, and score as a row in a dataframe
        df_words_sentiment = df_words_sentiment.append([{'word': word.text.lower(),
                                                         'sentiment': sentiment,
                                                         'score': score}])

In [77]:
df_words_sentiment

Unnamed: 0,word,sentiment,score
0,communication,POSITIVE,0.997782
0,public,POSITIVE,0.998105
0,date,POSITIVE,0.998968
0,submitted,POSITIVE,0.994183
0,council,POSITIVE,0.990301
...,...,...,...
0,los,POSITIVE,0.904931
0,angeles,POSITIVE,0.997110
0,beg,NEGATIVE,0.997654
0,council,POSITIVE,0.990301


In [80]:
df_words_sentiment.word.value_counts()

parking        29
city           21
people         20
enforcement    17
public         16
               ..
fall            1
turned          1
access          1
care            1
urge            1
Name: word, Length: 467, dtype: int64

In [79]:
df_words_sentiment.sentiment.value_counts()

POSITIVE    766
NEGATIVE    199
Name: sentiment, dtype: int64

In [92]:
df_words_sentiment.groupby(by=['sentiment', 'word']).agg('count').sort_values('score', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,score
sentiment,word,Unnamed: 2_level_1
POSITIVE,parking,29
POSITIVE,city,21
POSITIVE,people,20
POSITIVE,enforcement,17
POSITIVE,public,16
POSITIVE,...,...
POSITIVE,concerned,1
POSITIVE,company,1
POSITIVE,community,1
POSITIVE,comical,1
