In [1]:
import fitz
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import ssl
from transformers import BartForConditionalGeneration, BartTokenizer

In [2]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/stan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
file_path = "ESG Reports/2022_Apple_ESG_Report.pdf"

In [4]:
def cleanup_text(text):
    # remove non-breaking spaces
    text = text.replace(u'\xa0', u' ')
    # remove bullet points
    text = text.replace(u'•', u'')
    # remove any non-alphanumeric, non-hyphen characters
    text = re.sub(r'[^A-Za-z0-9- ]', '', text)
    # remove words with hyphens, as they could be compound words
    text = re.sub(r'\w+-\w+', '', text)
    return text

In [5]:
def text_process(text_str):
    text_str = re.sub(r'[^\w\s]', '', text_str)  # remove punctuation
    text_str = re.sub(r'\b\d+\b', '', text_str)  # remove standalone digits
    text_str = ' '.join(text_str.split())  # remove extra spaces
    return text_str

In [6]:
all_text = []

with fitz.open(file_path) as doc:
    for page in doc:
        text = page.get_text()
        # text = cleanup_text(text)  # Apply cleanup_text here
        text = text.replace('•', ' ')
        text = text_process(text)
        text = text.replace('\n', ' ')
        text = text.replace('\u2003', ' ')
        text = text.strip()
        all_text.append(text)


processed_text = ' '.join(all_text)

In [7]:
all_text

['Apples ESG Report Environmental Social Governance Report',
 'Introduction Letter from Tim Cook Report highlights Our approach Our commitment to transparency Advocating for change Our commitment to human rights Environment Our approach Climate change Resources Smarter chemistry Our People Our approach Inclusion and diversity Growth and development Benefits Compensation Engagement Workplace practices and policies Health and safety at Apple Suppliers Our approach Labor and human rights in the supply chain Health safety and wellness Responsible materials sourcing Education and professional development Environment Customers Our approach Privacy Accessibility Inclusive design Education Health Caring for customers Communities Our approach Racial Equity and Justice Initiative Education Affordable housing initiative Corporate donations Employee giving Job creation Governance Corporate governance Ethics and compliance Tax payments Appendix Awards and recognition United Nations Sustainable Deve

In [8]:
processed_text



In [31]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Tokenize the text
inputs = tokenizer(processed_text, truncation=True, max_length=1024, return_tensors='pt')

# Generate a summary
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=1500, early_stopping=True)

# Decode the summary and print it
summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
print(summary)

['Apples ESG Report Environmental Social Governance Report Introduction Letter from Tim Cook Report highlights Our approach Our commitment to transparency Advocating for change Our commitment for human rights Environment Our approach Climate change Resources Smarter chemistry Our People Our approach Inclusion and diversity Growth and development Benefits Compensation Engagement Workplace practices and policies.']


In [15]:
def chunk_text(text, chunk_size):
    """Breaks the text into chunks of a specified size."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# Break the text into chunks of 500 words each
chunks = chunk_text(processed_text, 500)

summaries = []
for chunk in chunks:
    inputs = tokenizer(chunk, truncation=True, max_length=1024, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=200, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    summaries.append(summary)

# Concatenate the summaries to form a final summary
final_summary = ' '.join(summaries)
print(final_summary)

KeyboardInterrupt: 

In [17]:
from transformers import LEDForConditionalGeneration, LEDTokenizer

model = LEDForConditionalGeneration.from_pretrained('allenai/led-base-16384')
tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')

inputs = tokenizer(processed_text, truncation=True, max_length=8192, return_tensors='pt')

summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=2048, early_stopping=True)

summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
print(summary)


IndexError: index out of range in self

In [19]:
from transformers import LEDForConditionalGeneration, LEDTokenizer

def chunk_text(text, chunk_size):
    """Breaks the text into chunks of a specified size."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

model = LEDForConditionalGeneration.from_pretrained('allenai/led-base-16384')
tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')

# Break the text into chunks of 8000 tokens each
chunks = chunk_text(processed_text, 8000)

summaries = []
for chunk in chunks:
    inputs = tokenizer(chunk, truncation=True, max_length=2048, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=1024, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    summaries.append(summary)

# Concatenate the summaries to form a final summary
final_summary = ' '.join(summaries)
print(final_summary)


KeyboardInterrupt: 

In [21]:
from summarizer import Summarizer

model = Summarizer()

# Break the text into chunks of 8000 tokens each
chunks = chunk_text(processed_text, 8000)

summaries = []
for chunk in chunks:
    summary = model(chunk, min_length=60, max_length=600)
    summaries.append("".join(summary))

# Concatenate the summaries to form a final summary
final_summary = ' '.join(summaries)
print(final_summary)

TypeError: 'Summarizer' object is not callable

In [22]:
from transformers import pipeline

In [23]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [24]:
processed_text



In [28]:
print(summarizer(processed_text, max_length=1000, min_length=30, do_sample=False))

IndexError: index out of range in self

In [30]:
from transformers import pipeline

def chunk_text(text, chunk_size):
    """Breaks the text into chunks of a specified size."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Break the text into chunks of 1024 tokens each
chunks = chunk_text(processed_text, 1024)

summaries = []
for chunk in chunks:
    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
    summaries.append(summary[0]['summary_text'])

# Concatenate the summaries to form a final summary
final_summary = ' '.join(summaries)
print(final_summary)

KeyboardInterrupt: 

In [32]:
from transformers import pipeline

def chunk_text(text, chunk_size):
    """Breaks the text into chunks of a specified size."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Break the text into chunks of 1024 tokens each
chunks = chunk_text(processed_text, 1024)

summaries = []
for chunk in chunks:
    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
    summaries.append(summary[0]['summary_text'])

# Concatenate the summaries to form a final summary
final_summary = ' '.join(summaries)
print(final_summary)


KeyboardInterrupt: 

In [33]:
from transformers import pipeline

def chunk_text(text, chunk_size):
    """Breaks the text into chunks of a specified size."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Break the text into smaller chunks
chunks = chunk_text(processed_text, 512)

summaries = []
for chunk in chunks:
    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
    summaries.append(summary[0]['summary_text'])

# Concatenate the summaries to form a final summary
final_summary = ' '.join(summaries)
print(final_summary)
from transformers import pipeline

def chunk_text(text, chunk_size):
    """Breaks the text into chunks of a specified size."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Break the text into smaller chunks
chunks = chunk_text(processed_text, 512)

summaries = []
for chunk in chunks:
    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
    summaries.append(summary[0]['summary_text'])

# Concatenate the summaries to form a final summary
final_summary = ' '.join(summaries)
print(final_summary)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Your max_length is set to 130, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 130, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 130, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 130, but your input_length is only 94. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
Your

 Apples ESG Report Environmental Social Governance Report . Report highlights Our approach Our commitment to transparency Advocating for change . Smarter chemistry Our People Our approach Inclusion and diversity Growth and Development Benefits Compensation Engagement Workplace practices and policies Health and safety at Apple Suppliers Our approach Labor and human rights in the supply chain .  Corporate governance, ethics, ethics and compliance are key issues for the project . The project is based on the United Nations Sustainable Development Goals . The company is committed to creating a sustainable development community . The team will focus on developing a sustainable community of people .  This report contains forwardlooking statements and actual results may differ . Numbers and percentages in this report include estimates or approximations and may be based on assumptions .  As a company we know that our impact on peoples lives reaches beyond the technology we create And we apply t

Your max_length is set to 130, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 130, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 130, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 130, but your input_length is only 94. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
Your

KeyboardInterrupt: 

In [34]:
from summarizer import Summarizer

model = Summarizer()

result = model(processed_text, min_length=60)
summary = ''.join(result)
print(summary)


TypeError: 'Summarizer' object is not callable

In [37]:
from summarizer import Summarizer

model = Summarizer()

# Concatenate all processed text into one single string
complete_text = ' '.join(processed_text)

result = model(complete_text, min_length=60, max_length=200)
summary = "".join(result)
print(summary)

TypeError: 'Summarizer' object is not callable

In [39]:
from summarizer import Summarizer

model = Summarizer()

# Concatenate all processed text into one single string
complete_text = ' '.join(processed_text)

result = model(complete_text, min_length=60, max_length=200)
summary = "".join(result)
print(summary)

TypeError: 'Summarizer' object is not callable