In [1]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import ssl
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import fitz
import string
import sys
import json
import plotly.io as pio

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [2]:
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /Users/stan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/stan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
file_path = "../esg_reports/2022_Apple_ESG_Report.pdf"

In [4]:
def cleanup_text(text):
    # remove non-breaking spaces
    text = text.replace(u'\xa0', u' ')
    # remove bullet points
    text = text.replace(u'•', u'')
    # remove any non-alphanumeric, non-hyphen characters
    text = re.sub(r'[^A-Za-z0-9- ]', '', text)
    # remove words with hyphens, as they could be compound words
    text = re.sub(r'\w+-\w+', '', text)
    return text

In [5]:
def text_process(text_str):
    # handle punctuation and special characters
    text_str = re.sub(r'[^\w\s]', '', text_str)
    if '-' in text_str:
        text_str = text_str.replace(
            '- ', '').replace(' -', '').replace(' ,', ',').replace(' .', '.')
    text_str = text_str.strip()
    return text_str

In [19]:
all_text = []
stop_words = set(stopwords.words('english'))

In [7]:
with fitz.open(file_path) as doc:
    for page in doc:
        text = page.get_text()
        # text = cleanup_text(text)  # Apply cleanup_text here
        text = text.replace('•', ' ')
        text = text_process(text)
        text = text.replace('\n', ' ')
        text = text.replace('\u2003', ' ')
        text = text.strip()
        all_text.append(text)

In [8]:
lemmatizer = WordNetLemmatizer()
processed_text = []

for doc in all_text:
    tokens = word_tokenize(doc.lower())
    lemmatized = [lemmatizer.lemmatize(
        token) for token in tokens if token not in string.punctuation]
    no_stops = [token for token in lemmatized if token not in stop_words]
    cleaned_doc = " ".join(no_stops)
    # remove digits from each document
    cleaned_doc = re.sub(r'\d+', '', cleaned_doc)
    # remove extra spaces from each document
    cleaned_doc = re.sub(' +', ' ', cleaned_doc)
    processed_text.append(cleaned_doc)

doc_list = [doc.split() for doc in processed_text]

In [9]:
# Initialize VADER
sid = SentimentIntensityAnalyzer()

# Apply VADER to each sentence and store the scores
sentiment_scores = []
for sentence in processed_text:
    scores = sid.polarity_scores(sentence)
    sentiment_scores.append(scores)

# Get individual sentiment scores over the course of the report
overall_sentiment = {
    'neg': [score['neg'] for score in sentiment_scores],
    'neu': [score['neu'] for score in sentiment_scores],
    'pos': [score['pos'] for score in sentiment_scores],
    'compound': [score['compound'] for score in sentiment_scores]
}

In [18]:
print(json.dumps(overall_sentiment))

{"neg": [0.0, 0.0, 0.005, 0.035, 0.039, 0.015, 0.02, 0.06, 0.064, 0.053, 0.039, 0.0, 0.022, 0.038, 0.127, 0.006, 0.032, 0.037, 0.013, 0.12, 0.028, 0.0, 0.024, 0.011, 0.019, 0.005, 0.036, 0.051, 0.046, 0.072, 0.035, 0.019, 0.101, 0.156, 0.01, 0.0, 0.026, 0.027, 0.031, 0.028, 0.114, 0.085, 0.007, 0.045, 0.008, 0.023, 0.0, 0.031, 0.034, 0.032, 0.052, 0.022, 0.005, 0.004, 0.007, 0.022, 0.038, 0.0, 0.026, 0.059, 0.005, 0.004, 0.007, 0.011, 0.066, 0.009, 0.099, 0.005, 0.0, 0.076, 0.029, 0.049, 0.0, 0.037, 0.047, 0.066, 0.017, 0.049, 0.008, 0.122, 0.0, 0.0, 0.014, 0.053, 0.05], "neu": [1.0, 0.697, 0.619, 0.78, 0.761, 0.74, 0.659, 0.831, 0.691, 0.742, 0.696, 0.875, 0.68, 0.705, 0.668, 0.808, 0.762, 0.75, 0.822, 0.688, 0.685, 0.678, 0.622, 0.83, 0.786, 0.69, 0.649, 0.683, 0.686, 0.73, 0.755, 0.769, 0.71, 0.641, 0.802, 0.729, 0.766, 0.79, 0.764, 0.73, 0.723, 0.759, 0.69, 0.872, 0.764, 0.762, 0.878, 0.718, 0.762, 0.747, 0.73, 0.82, 0.681, 0.688, 0.725, 0.742, 0.713, 0.707, 0.666, 0.606, 0.766, 0.