# This notebook extracts keywords from the SASB sustainability accounting standard for commercial banks


In [1]:
#import needed libraries
from pypdf import PdfReader
from keybert import KeyBERT

  from .autonotebook import tqdm as notebook_tqdm
2024-03-19 03:15:56.782504: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Folder that holds the standards PDFs. This is an absolute path on the
# author's machine; change it before running the notebook anywhere else.
reading_directory = '/Users/pelumioluwaabiola/Downloads/Researchwork/Standards'

In [3]:
#read pdf file
# document_name keeps its leading '/' because it is appended directly to
# reading_directory, which has no trailing separator.
document_name = '/Dec23-SASB-CommercialBanks.pdf'
reader = PdfReader(reading_directory + document_name)
number_of_pages = len(reader.pages)

# One extracted-text string per page, in document order.
# (The former stand-alone read of page index 2 was unused and failed with
# IndexError on documents shorter than three pages, so it is gone.)
sasb_standards = [pdf_page.extract_text() for pdf_page in reader.pages]

print(sasb_standards)

['INDUSTRY STANDARD | VERSION 2023-12sasb.org\nSustainability Accounting Standard \n© 2023 The IFRS Foundation. All Rights Reserved.FINANCIALS SECTOR\nSustainable Industry Classification System® (SICS®)\nUnder Stewardship of the International Sustainability Standards BoardFN-CBCommercial Banks ', 'ABOUT THE SASB STANDARDS\nAs of August 2022, the International Sustainability Standards Board (ISSB) of the IFRS Foundation assumed\nresponsibility for the SASB Standards. The ISSB has committed to maintain, enhance and evolve the SASB\nStandards and encourages preparers and investors to continue to use the SASB Standards.\nIFRS S1 General Requirements for Disclosure of Sustainability-related Financial Information  (IFRS S1) requires\nentities to refer to and consider the applicability of disclosure topics in the SASB Standards when identifying\nsustainability-related risks and opportunities that could reasonably be expected to affect an entity ’s prospects.\nSimilarly, IFRS S1 requires entit

In [4]:
# Characters replaced by a space before whitespace is collapsed: form feed,
# non-breaking space, NUL, and glyphs from the Unicode private-use area.
_CLEAN_PDF_JUNK = (
    '\x0c\xa0\x00'
    '\uf08c\uf099\uf09a\uf232\uf0e0\uf0e1\uf095'
    '\ue816\uf00d\uf002\uf107\uf078\uf0b7'
)
_CLEAN_PDF_TABLE = str.maketrans(dict.fromkeys(_CLEAN_PDF_JUNK, ' '))


def clean_pdf(text: str) -> str:
    """Normalise the raw text extracted from a single PDF page.

    Steps, in order:

    1. Remove up to four leading digit characters (presumably a page number
       that the extractor puts at the very start of the page text).
    2. Replace form feeds, non-breaking spaces, NUL characters and the
       private-use glyphs listed in ``_CLEAN_PDF_JUNK`` with a space.
    3. Collapse every run of whitespace, newlines included, into one space
       and trim both ends.

    Args:
        text: Text of one page; may be empty.

    Returns:
        The cleaned text as a single line.
    """
    # Count the leading digits, but never strip more than four of them.
    leading_digits = 0
    while (
        leading_digits < 4
        and leading_digits < len(text)
        and text[leading_digits].isdigit()
    ):
        leading_digits += 1
    text = text[leading_digits:]

    # Single pass over the text. '\uf09a' is now removed wherever it occurs;
    # previously it was only matched when followed by a space, which left
    # stray glyphs in the output.
    text = text.translate(_CLEAN_PDF_TABLE)

    #remove all unnecessary spaces (split() also swallows the newlines)
    return ' '.join(text.split())

In [8]:
#clean pdf
# Clean every extracted page; the result is index-aligned with sasb_standards.
clean_sasb_standards = [clean_pdf(page_text) for page_text in sasb_standards]

clean_sasb_standards

['INDUSTRY STANDARD | VERSION 2023-12sasb.org Sustainability Accounting Standard © 2023 The IFRS Foundation. All Rights Reserved.FINANCIALS SECTOR Sustainable Industry Classification System® (SICS®) Under Stewardship of the International Sustainability Standards BoardFN-CBCommercial Banks',
 'ABOUT THE SASB STANDARDS As of August 2022, the International Sustainability Standards Board (ISSB) of the IFRS Foundation assumed responsibility for the SASB Standards. The ISSB has committed to maintain, enhance and evolve the SASB Standards and encourages preparers and investors to continue to use the SASB Standards. IFRS S1 General Requirements for Disclosure of Sustainability-related Financial Information (IFRS S1) requires entities to refer to and consider the applicability of disclosure topics in the SASB Standards when identifying sustainability-related risks and opportunities that could reasonably be expected to affect an entity ’s prospects. Similarly, IFRS S1 requires entities to refer 

In [9]:
#delete unnecessary pages
# NOTE: every pop shifts the indices of the remaining pages. The three calls
# therefore remove the ORIGINAL pages at index 0, index 2 and the last index;
# the original index-1 page is kept and becomes the new first element.
# NOTE(review): confirm that keeping original index 1 is intended.
# This cell mutates the list in place, so re-running it removes three more pages.
clean_sasb_standards.pop(0)
clean_sasb_standards.pop(1)  # index 1 AFTER the pop above, i.e. original index 2
clean_sasb_standards.pop(-1)
clean_sasb_standards

['ABOUT THE SASB STANDARDS As of August 2022, the International Sustainability Standards Board (ISSB) of the IFRS Foundation assumed responsibility for the SASB Standards. The ISSB has committed to maintain, enhance and evolve the SASB Standards and encourages preparers and investors to continue to use the SASB Standards. IFRS S1 General Requirements for Disclosure of Sustainability-related Financial Information (IFRS S1) requires entities to refer to and consider the applicability of disclosure topics in the SASB Standards when identifying sustainability-related risks and opportunities that could reasonably be expected to affect an entity ’s prospects. Similarly, IFRS S1 requires entities to refer to and consider the applicability of metrics in the SASB Standards when determining what information to disclose regarding sustainability-related risks and opportunities. In June 2023, the ISSB amended climate-related topics and metrics in the SASB Standards to align them with the industry-b

In [10]:
# Build the KeyBERT extractor once so it can be reused for every page.
# The string is the name of the embedding model handed to KeyBERT.
keyextractor = KeyBERT('distilbert-base-nli-mean-tokens')

In [11]:
#retrieve the top 5 keywords/keyphrases (1 or 2 words long) for each page in the document
# Each entry of `keywords` is the result list for one page, in page order.
keywords = [
    keyextractor.extract_keywords(
        page_text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=5,
    )
    for page_text in clean_sasb_standards
]

In [13]:
# Show the extracted (keyword, score) list of each page on its own line.
for page_keywords in keywords:
    print(page_keywords)

[('august 2022', 0.6153), ('2023 issb', 0.6082), ('june 2023', 0.6072), ('2022 international', 0.5843), ('january 2025', 0.5771)]
[('industry metrics', 0.5522), ('industry standards', 0.5399), ('industry guidance', 0.5381), ('industry disclosure', 0.5175), ('industry descriptions', 0.5167)]
[('investors responsibility', 0.5899), ('regulatory trends', 0.5635), ('brokerage services', 0.5623), ('business associated', 0.5554), ('regulatory uncertainty', 0.5522)]
[('community developmentquantitativenumber', 0.5607), ('sustainability accounting', 0.5558), ('developmentquantitativenumber presentation', 0.534), ('financed emissionsdiscussion', 0.5137), ('asset classquantitativepresentation', 0.5133)]
[('trading antitrust', 0.5806), ('businessquantitativenumber presentation', 0.5707), ('businessquantitativenumber', 0.5688), ('corporate6 quantitativenumber', 0.5556), ('business ethicstotal', 0.5545)]
[('cybersecurity threats', 0.5428), ('compromising customers', 0.5218), ('growing cybersecurity'

In [14]:
# Discard the first entry of `keywords`, i.e. the keyword list of the first
# page that was processed.
# This mutates the list in place: re-running the cell drops another page's keywords.
keywords.pop(0)
keywords

[[('industry metrics', 0.5522),
  ('industry standards', 0.5399),
  ('industry guidance', 0.5381),
  ('industry disclosure', 0.5175),
  ('industry descriptions', 0.5167)],
 [('investors responsibility', 0.5899),
  ('regulatory trends', 0.5635),
  ('brokerage services', 0.5623),
  ('business associated', 0.5554),
  ('regulatory uncertainty', 0.5522)],
 [('community developmentquantitativenumber', 0.5607),
  ('sustainability accounting', 0.5558),
  ('developmentquantitativenumber presentation', 0.534),
  ('financed emissionsdiscussion', 0.5137),
  ('asset classquantitativepresentation', 0.5133)],
 [('trading antitrust', 0.5806),
  ('businessquantitativenumber presentation', 0.5707),
  ('businessquantitativenumber', 0.5688),
  ('corporate6 quantitativenumber', 0.5556),
  ('business ethicstotal', 0.5545)],
 [('cybersecurity threats', 0.5428),
  ('compromising customers', 0.5218),
  ('growing cybersecurity', 0.5157),
  ('evolving cybersecurity', 0.4955),
  ('banks protecting', 0.4826)],
 [(

In [15]:
#retrieve all the first items in each tuple in each list for keywords
# Flattens the per-page results into one list of keyword strings; element 0 of
# every tuple is the keyword, the score that follows it is dropped.
first_keywords = [
    pair[0]
    for page_keywords in keywords
    for pair in page_keywords
]

first_keywords

['industry metrics',
 'industry standards',
 'industry guidance',
 'industry disclosure',
 'industry descriptions',
 'investors responsibility',
 'regulatory trends',
 'brokerage services',
 'business associated',
 'regulatory uncertainty',
 'community developmentquantitativenumber',
 'sustainability accounting',
 'developmentquantitativenumber presentation',
 'financed emissionsdiscussion',
 'asset classquantitativepresentation',
 'trading antitrust',
 'businessquantitativenumber presentation',
 'businessquantitativenumber',
 'corporate6 quantitativenumber',
 'business ethicstotal',
 'cybersecurity threats',
 'compromising customers',
 'growing cybersecurity',
 'evolving cybersecurity',
 'banks protecting',
 'encryption key',
 'acquired encryption',
 'data breaches',
 'ciphertext entity',
 'incidents encrypted',
 'cyber threats',
 'cybersecurity requirements',
 'include cyber',
 'infrastructure cybersecurity',
 'emerging cyber',
 'business loans',
 'improve financial',
 'assessing ban

In [16]:
#create a dictionary of keywords with their frequency in the list
from collections import Counter

# Counter does the tallying and keeps keys in first-seen order; converting
# back to a plain dict keeps the type (and notebook display) unchanged.
keywords_frequency = dict(Counter(first_keywords))

keywords_frequency

{'industry metrics': 1,
 'industry standards': 1,
 'industry guidance': 1,
 'industry disclosure': 1,
 'industry descriptions': 1,
 'investors responsibility': 1,
 'regulatory trends': 1,
 'brokerage services': 1,
 'business associated': 1,
 'regulatory uncertainty': 1,
 'community developmentquantitativenumber': 1,
 'sustainability accounting': 3,
 'developmentquantitativenumber presentation': 1,
 'financed emissionsdiscussion': 1,
 'asset classquantitativepresentation': 1,
 'trading antitrust': 1,
 'businessquantitativenumber presentation': 1,
 'businessquantitativenumber': 1,
 'corporate6 quantitativenumber': 1,
 'business ethicstotal': 1,
 'cybersecurity threats': 1,
 'compromising customers': 1,
 'growing cybersecurity': 1,
 'evolving cybersecurity': 1,
 'banks protecting': 1,
 'encryption key': 1,
 'acquired encryption': 1,
 'data breaches': 1,
 'ciphertext entity': 1,
 'incidents encrypted': 1,
 'cyber threats': 1,
 'cybersecurity requirements': 1,
 'include cyber': 1,
 'infrast

In [17]:
# Rank the (keyword, count) pairs from most to least frequent.
# The sort is stable, so keywords with equal counts keep the order in which
# they appear in keywords_frequency.
sorted_keywords = list(keywords_frequency.items())
sorted_keywords.sort(key=lambda pair: pair[1], reverse=True)
sorted_keywords

[('sustainability accounting', 3),
 ('business loans', 2),
 ('payday loans', 2),
 ('accounting standard', 2),
 ('industry metrics', 1),
 ('industry standards', 1),
 ('industry guidance', 1),
 ('industry disclosure', 1),
 ('industry descriptions', 1),
 ('investors responsibility', 1),
 ('regulatory trends', 1),
 ('brokerage services', 1),
 ('business associated', 1),
 ('regulatory uncertainty', 1),
 ('community developmentquantitativenumber', 1),
 ('developmentquantitativenumber presentation', 1),
 ('financed emissionsdiscussion', 1),
 ('asset classquantitativepresentation', 1),
 ('trading antitrust', 1),
 ('businessquantitativenumber presentation', 1),
 ('businessquantitativenumber', 1),
 ('corporate6 quantitativenumber', 1),
 ('business ethicstotal', 1),
 ('cybersecurity threats', 1),
 ('compromising customers', 1),
 ('growing cybersecurity', 1),
 ('evolving cybersecurity', 1),
 ('banks protecting', 1),
 ('encryption key', 1),
 ('acquired encryption', 1),
 ('data breaches', 1),
 ('cip

In [18]:
#take first 30 elements of the sorted list of (keyword, count) tuples
# The name is rebound to the truncated list, so the full ranking is no longer
# reachable through sorted_keywords after this cell has run.
sorted_keywords = sorted_keywords[:30]
sorted_keywords

[('sustainability accounting', 3),
 ('business loans', 2),
 ('payday loans', 2),
 ('accounting standard', 2),
 ('industry metrics', 1),
 ('industry standards', 1),
 ('industry guidance', 1),
 ('industry disclosure', 1),
 ('industry descriptions', 1),
 ('investors responsibility', 1),
 ('regulatory trends', 1),
 ('brokerage services', 1),
 ('business associated', 1),
 ('regulatory uncertainty', 1),
 ('community developmentquantitativenumber', 1),
 ('developmentquantitativenumber presentation', 1),
 ('financed emissionsdiscussion', 1),
 ('asset classquantitativepresentation', 1),
 ('trading antitrust', 1),
 ('businessquantitativenumber presentation', 1),
 ('businessquantitativenumber', 1),
 ('corporate6 quantitativenumber', 1),
 ('business ethicstotal', 1),
 ('cybersecurity threats', 1),
 ('compromising customers', 1),
 ('growing cybersecurity', 1),
 ('evolving cybersecurity', 1),
 ('banks protecting', 1),
 ('encryption key', 1),
 ('acquired encryption', 1)]