In [1]:
!pip install pdfminer3

Collecting pdfminer3
  Using cached pdfminer3-2018.12.3.0-py3-none-any.whl
Collecting pycryptodome (from pdfminer3)
  Using cached pycryptodome-3.20.0-cp35-abi3-macosx_10_9_universal2.whl.metadata (3.4 kB)
Using cached pycryptodome-3.20.0-cp35-abi3-macosx_10_9_universal2.whl (2.4 MB)
Installing collected packages: pycryptodome, pdfminer3
Successfully installed pdfminer3-2018.12.3.0 pycryptodome-3.20.0


In [None]:
!pip install spacy

In [1]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import pandas as pd
import spacy
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
import time
import io
import string
# PDF text extraction
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter


from importlib import reload
import text_cleaner  # Import the module first
text_cleaner = reload(text_cleaner)  # Reload the module
from text_cleaner import TextCleaner 

import sentence_extractor  # Import the module first
sentence_extractor = reload(sentence_extractor)  # Reload the module
from sentence_extractor import SentenceExtractor

## Web Scraping
### Get the company reports from www.responsibilityreports.co.uk

In [2]:
def get_company_link(ticker, sector='Technology', timeout=30):
    """Find a company's page on responsibilityreports.co.uk by ticker.

    Runs the site's company search and returns the link of the first result
    whose sector label matches ``sector``.

    Args:
        ticker: Search term sent to the site's company search (e.g. 'MSFT').
        sector: Sector label a result must carry to be accepted. Pass ``None``
            to accept the first result that has a link, whatever its sector.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The absolute URL of the matching company page, or ``None`` when no
        search result matches.
    """
    base_url = "https://www.responsibilityreports.co.uk"
    # Pass the ticker via `params` so requests URL-encodes it; interpolating it
    # into the query string breaks for tickers containing '&', '#', spaces, etc.
    response = requests.get(
        f"{base_url}/Companies", params={"search": ticker}, timeout=timeout
    )
    soup = BeautifulSoup(response.content, "html.parser")

    # Each search result is expected to be an <li> directly under a <ul>.
    for item in soup.select('ul > li'):
        if sector is not None:
            sector_name = item.find('span', {'class': 'sectorName'})
            if sector_name is None or sector_name.text.strip() != sector:
                continue

        link = item.find('a', href=True)
        if link:
            # urljoin copes with both site-relative and absolute hrefs.
            return urljoin(base_url, link['href'])

    return None
company_link = get_company_link('MSFT')
# get_company_link returns None when no search result matches; stop here with a
# clear message instead of handing None to the next cell as a URL.
if company_link is None:
    raise ValueError("No matching company page found for ticker 'MSFT'")

In [3]:
def get_pdf_download_links(report_url, timeout=30):
    """Collect the report PDF URLs linked from a company page.

    The most recent report is looked up first: via the ``onclick`` handler of
    the 'btn_form_10k' button, or, if that yields no URL, via the anchor whose
    text matches "View Environmental Sustainability Report". Archive links
    (hrefs containing 'Report' and ending in '.pdf') are appended afterwards.

    Args:
        report_url: URL of the company page; also the base for relative links.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of absolute URLs without duplicates: the most recent report
        first (when found), then the archive links in page order.
    """
    response = requests.get(report_url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    pdf_urls = []

    # Primary route: the URL is embedded in the button's onclick JavaScript.
    recent_report = soup.find('a', class_='btn_form_10k')
    if recent_report and 'onclick' in recent_report.attrs:
        url_match = re.search(r"window\.open\('([^']*)", recent_report['onclick'])
        if url_match:
            pdf_urls.append(urljoin(report_url, url_match.group(1)))

    # Fallback whenever the onclick route produced nothing. This also covers a
    # button whose onclick holds no window.open(...) URL.
    if not pdf_urls:
        recent_report_link = soup.find(
            'a', string=re.compile("View Environmental Sustainability Report")
        )
        # .get avoids a KeyError for an anchor that has no href attribute.
        if recent_report_link and recent_report_link.get('href'):
            pdf_urls.append(urljoin(report_url, recent_report_link['href']))

    def _is_report_pdf(href):
        # Archive reports: href mentions 'Report' and points at a .pdf file.
        return bool(href) and 'Report' in href and href.endswith('.pdf')

    seen = set(pdf_urls)
    for link in soup.find_all('a', href=_is_report_pdf):
        pdf_url = urljoin(report_url, link['href'])
        if pdf_url not in seen:  # Avoid duplicates while keeping page order
            seen.add(pdf_url)
            pdf_urls.append(pdf_url)

    return pdf_urls
# Collect the report URLs for the company page found above: the most recent
# report first (when one is found), followed by the archive PDFs.
pdf_urls = get_pdf_download_links(company_link)
pdf_urls

['https://www.responsibilityreports.co.uk/Click/2398',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2021.pdf',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2020.pdf',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2019.pdf',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2018.pdf',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2017.pdf',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2016.pdf',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2015.pdf',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2014.pdf',
 'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_20

In [4]:
# Map the ticker to its list of report URLs; process_pdf_reports iterates this
# mapping as {company: [urls]}.
reports_dict = {'MSFT': pdf_urls}
reports_dict

{'MSFT': ['https://www.responsibilityreports.co.uk/Click/2398',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2021.pdf',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2020.pdf',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2019.pdf',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2018.pdf',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2017.pdf',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2016.pdf',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2015.pdf',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2014.pdf',
  'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchiv

## Report Content Extraction

In [5]:
def extract_pdf(file, verbose=False):
    """Extract the text of a PDF, page by page.

    Args:
        file: Binary file-like object containing the PDF (passed straight to
            ``PDFPage.get_pages``).
        verbose: When true, print which file is being processed.

    Returns:
        The text of all pages joined with the marker '##PAGE_BREAK##', or an
        empty string if anything goes wrong (the error is printed).
    """
    if verbose:
        print('Processing {}'.format(file))

    # Bind both resources before the try block so the cleanup in `finally`
    # never touches an unbound name if setup itself fails.
    converter = None
    fake_file_handle = io.StringIO()

    try:
        resource_manager = PDFResourceManager()
        converter = TextConverter(
            resource_manager, fake_file_handle, codec='utf-8', laparams=LAParams()
        )
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        content = []
        for page in PDFPage.get_pages(file, caching=True, check_extractable=True):
            page_interpreter.process_page(page)
            content.append(fake_file_handle.getvalue())
            # Reset the buffer so the next page's text is captured on its own.
            fake_file_handle.truncate(0)
            fake_file_handle.seek(0)

        return '##PAGE_BREAK##'.join(content)

    except Exception as e:
        # Best effort: report the problem and return an empty document.
        print(e)
        return ""

    finally:
        if converter is not None:
            converter.close()
        fake_file_handle.close()

def extract_content(url, timeout=60):
    """Download a report and extract its text.

    Args:
        url: Address of the report; redirects are followed.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A tuple ``(text, final_url)`` where ``final_url`` is the address after
        redirection. On any download or parsing error the error is printed and
        ``("", url)`` is returned with the original URL.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(
            url, headers=headers, allow_redirects=True, timeout=timeout
        )
        # Do not feed an HTTP error page to the PDF parser; treat it as a
        # failed download, handled by the except branch below.
        response.raise_for_status()
        text = extract_pdf(io.BytesIO(response.content))
        return text, response.url  # response.url is the address after redirection
    except Exception as e:
        print("Error retrieving or processing PDF:", e)
        return "", url

def process_pdf_reports(report_urls_dict):
    """Download every report and group the extracted text by company and year.

    Args:
        report_urls_dict: Mapping of company identifier to a list of report URLs.

    Returns:
        Nested dict ``{company: {year: [content, ...]}}``. The year is the
        four-digit number matched by ``_(\\d{4})\\.pdf`` in the URL reached after
        redirection; reports whose final URL has no such match are reported
        on stdout and left out.
    """
    year_pattern = re.compile(r'_(\d{4})\.pdf')
    reports_by_company = {}

    for company, urls in report_urls_dict.items():
        # A company gets an entry even when none of its URLs yields a year.
        yearly_reports = reports_by_company.setdefault(company, {})

        for url in urls:
            print(f"Processing report for {company} from {url}")
            content, final_url = extract_content(url)

            found = year_pattern.search(final_url)
            if not found:
                print(f"Year not found in URL: {final_url}")
                continue

            yearly_reports.setdefault(int(found.group(1)), []).append(content)

    return reports_by_company

# Illustrative example of the input shape: {ticker: [report URLs]}.
# NOTE(review): `report_urls_dict` is not passed to anything in this cell; the
# call below uses `reports_dict` instead — confirm that is intended.
report_urls_dict = {
    'MSFT': [
        'https://www.responsibilityreports.co.uk/Click/2398',  # This URL needs to follow redirection
        'https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2021.pdf',
        # Additional URLs as needed...
    ]
}

# Download and parse every report in `reports_dict`, organized as {company: {year: [content]}}
company_reports = process_pdf_reports(reports_dict)

Processing report for MSFT from https://www.responsibilityreports.co.uk/Click/2398
Processing report for MSFT from https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2021.pdf
Processing report for MSFT from https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2020.pdf
Processing report for MSFT from https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2019.pdf
Processing report for MSFT from https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2018.pdf
Processing report for MSFT from https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2017.pdf
Processing report for MSFT from https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/NASDAQ_MSFT_2016.pdf
Processing report for MSFT from https://www.responsibilityreports.co.uk/HostedData/ResponsibilityReportArchive/m/

## Sentence Extraction

In [6]:
# Module-level extractor instance; process_reports_to_dataframe reads it as a global.
processor = SentenceExtractor()
def process_reports_to_dataframe(company_reports):
    """Flatten extracted reports into a DataFrame with one row per sentence.

    Sentences are obtained by calling the module-level
    ``processor.extract_sentences`` on each report's content.

    Args:
        company_reports: Nested dict structured as
            ``{company: {year: [content, ...]}}``.

    Returns:
        A DataFrame with the columns 'Company', 'Year' and 'Sentence'. The
        columns are present even when no sentence was extracted, so later
        column access does not fail on an empty result.
    """
    data = [
        {'Company': company, 'Year': year, 'Sentence': sentence}
        for company, years_contents in company_reports.items()
        for year, contents in years_contents.items()
        for content in contents
        for sentence in processor.extract_sentences(content)
    ]

    # Naming the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(data, columns=['Company', 'Year', 'Sentence'])

# Build the sentence table (one row per company, year and sentence) from the
# reports extracted above.
df_sentences = process_reports_to_dataframe(company_reports)
df_sentences

Unnamed: 0,Company,Year,Sentence
0,MSFT,2022,Foreword Enabling sustainability for our compa...
1,MSFT,2022,"Extreme weather caused devastating droughts, w..."
2,MSFT,2022,We felt the effects of climate change like nev...
3,MSFT,2022,The most recent report from the Intergovernmen...
4,MSFT,2022,Meaningful climate action requires an enduring...
...,...,...,...
9614,MSFT,2003,Microsofts diversity education pro- grams prov...
9615,MSFT,2003,BusinessesMicrosoft sponsors a variety of prog...
9616,MSFT,2003,"Since its inception, the national award-winnin..."
9617,MSFT,2003,"At Microsoft, our commitment to diversity goes..."


## Text Cleaning

In [7]:
cleaner = TextCleaner()

# First pass: strip special characters; the result replaces the 'Sentence' column.
df_sentences['Sentence'] = df_sentences['Sentence'].apply(cleaner.remove_special_characters)

# Remaining passes build 'cleaned_text' from the stripped sentences, in this order.
cleaned_column = df_sentences['Sentence']
for cleaning_step in (
    cleaner.lowercase_text,
    cleaner.remove_entities,
    cleaner.remove_punctuation,
    cleaner.remove_stopwords,
):
    cleaned_column = cleaned_column.apply(cleaning_step)

df_sentences['cleaned_text'] = cleaned_column
df_sentences

Unnamed: 0,Company,Year,Sentence,cleaned_text
0,MSFT,2022,Foreword Enabling sustainability for our compa...,foreword enabling sustainability company custo...
1,MSFT,2022,"Extreme weather caused devastating droughts, w...",extreme weather caused devastating droughts wi...
2,MSFT,2022,We felt the effects of climate change like nev...,felt effects climate change like never planet ...
3,MSFT,2022,The most recent report from the Intergovernmen...,recent report intergovernmental panel climate ...
4,MSFT,2022,Meaningful climate action requires an enduring...,meaningful climate action requires enduring co...
...,...,...,...,...
9614,MSFT,2003,Microsofts diversity education programs provid...,microsofts diversity education programs provid...
9615,MSFT,2003,Businesses Microsoft sponsors a variety of pro...,businesses sponsors variety programs designed ...
9616,MSFT,2003,"Since its inception, the national awardwinning...",since inception national awardwinning build bu...
9617,MSFT,2003,"At Microsoft, our commitment to diversity goes...",commitment diversity goes beyond recruiting em...


In [8]:
# Select the columns to export, in a fixed order.
cleaned_reports = df_sentences[['Sentence','cleaned_text','Year','Company']]
# to_csv does not create missing directories, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)
cleaned_reports.to_csv('data/cleaned_reports.csv')