In [None]:
# pip install transformers tika

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from tika import parser
import re
import pandas as pd

# Checkpoint identifier shared by the tokenizer and the classification model
MODEL_NAME = "nbroad/ESG-BERT"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Bundle model + tokenizer into a single text-classification callable
classifier = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.67k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Create a Class to parse PDF
class PDFParser:
    """Parse a document with Tika and expose its text raw, cleaned, or chunked.

    Attributes:
        file_path: The path or URL handed to ``parser.from_file``.
        raw: The object returned by ``parser.from_file``.
        text: ``raw['content']`` as returned by the parser. It may be ``None``
            when the parser yields no content; the cleaning methods treat
            that case as an empty document.
    """

    # Loose URL / domain matcher. It also matches plain ``word.word`` tokens,
    # so a full stop that is not followed by whitespace can be swallowed.
    _URL_RE = re.compile(
        r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\."
        r"([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
    )

    def __init__(self, file_path):
        """Parse ``file_path`` immediately and cache the result on the instance."""
        self.file_path = file_path
        self.raw = parser.from_file(self.file_path)
        self.text = self.raw['content']

    def get_text(self):
        """Return the extracted text exactly as stored in ``self.text``."""
        return self.text

    def get_text_clean(self):
        """Return the text flattened to a single line with noise removed.

        Returns:
            str: The cleaned text, or ``""`` when ``self.text`` is ``None``
            or empty.
        """
        # Guard against a missing content payload: re.sub() would raise
        # TypeError on None.
        text = self.text or ""

        # Collapse every whitespace run (newlines included) to one space.
        text = re.sub(r"\s+", " ", text)
        text = self._URL_RE.sub(" ", text)  # URLs
        # NOTE(review): whitespace was collapsed above, so there are no line
        # breaks left and this only strips a number at the very start of the
        # document - confirm whether per-line header removal was intended.
        text = re.sub(r"^\s?\d+(.*)$", r"\1", text)  # headers
        text = re.sub(r"\d{5,}", " ", text)  # figures (runs of 5+ digits)
        text = re.sub(r"\.+", ".", text)  # multiple periods

        text = text.strip()  # leading & trailing spaces
        text = re.sub(r"\s+", " ", text)  # multiple spaces
        text = re.sub(r"\s?([,:;\.])", r"\1", text)  # punctuation spaces
        text = re.sub(r"\s?-\s?", "-", text)  # split-line words
        return text

    def get_text_clean_list(self, max_sentence_word_count=400):
        """Split the cleaned text on ``.`` and cap each piece by word count.

        Args:
            max_sentence_word_count: Maximum number of whitespace-separated
                words per returned chunk. Longer sentences are cut into
                consecutive chunks of at most this many words.

        Returns:
            list[str]: Non-empty chunks in document order, words joined by a
            single space. Empty when the document has no text.

        Raises:
            ValueError: If ``max_sentence_word_count`` is smaller than 1.
        """
        if max_sentence_word_count < 1:
            raise ValueError("max_sentence_word_count must be a positive integer")

        text_list = []
        for sentence in self.get_text_clean().split('.'):
            words = sentence.split()
            # Stepping through the word list yields full-size chunks followed
            # by one shorter remainder; sentences without words add nothing.
            for start in range(0, len(words), max_sentence_word_count):
                chunk = words[start:start + max_sentence_word_count]
                text_list.append(" ".join(chunk))

        return text_list

In [None]:
def run_classifier(url, max_tokens=512):
    """Parse a report, classify each text chunk, and return the predictions.

    Args:
        url: Path or URL of the report; passed straight to ``PDFParser``.
        max_tokens: Upper bound on tokens per chunk handed to the model.
            Chunks are capped by word count upstream, so a chunk can still
            tokenize to more tokens than the model accepts; longer inputs
            are truncated by the tokenizer.
            NOTE(review): 512 assumes a BERT-base sized model - confirm
            against the loaded checkpoint.

    Returns:
        pandas.DataFrame: One row per chunk, built from the pipeline output
        (``label`` and ``score`` columns). Empty, with those two columns,
        when the document yields no text.
    """
    pp = PDFParser(url)
    sentences = pp.get_text_clean_list()
    print(f"The CSR report has {len(sentences):,d} sentences")

    if not sentences:
        # Skip the model call and return a correctly-shaped empty frame so
        # downstream code that groups by 'label' keeps working.
        return pd.DataFrame({
            "label": pd.Series(dtype="object"),
            "score": pd.Series(dtype="float64"),
        })

    result = classifier(sentences, truncation=True, max_length=max_tokens)
    return pd.DataFrame(result)

In [None]:
# First worked example: classify Amazon's report
amzn_report_url = "https://www.responsibilityreports.com/Click/2534"
amzn = run_classifier(amzn_report_url)

2023-09-26 05:59:08,455 [MainThread  ] [INFO ]  Retrieving https://www.responsibilityreports.com/Click/2534 to /tmp/click-2534.
INFO:tika.tika:Retrieving https://www.responsibilityreports.com/Click/2534 to /tmp/click-2534.
2023-09-26 05:59:08,631 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
2023-09-26 05:59:08,890 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
2023-09

The CSR report has 409 sentences


In [None]:
# Mean classifier score per predicted label.
# (Chain .sort_values('score', ascending=False) to rank labels by score.)
ans = amzn.groupby(['label']).agg('mean')
ans

Unnamed: 0_level_0,score
label,Unnamed: 1_level_1
Access_And_Affordability,0.51299
Business_Ethics,0.460758
Business_Model_Resilience,0.5635
Competitive_Behavior,0.256933
Critical_Incident_Risk_Management,0.915521
Customer_Privacy,0.890653
Customer_Welfare,0.426341
Data_Security,0.098439
Director_Removal,0.102589
Ecological_Impacts,0.784869


In [None]:
# `ans` is already indexed by label, so its score column maps label -> mean score
ans_dict = ans['score'].to_dict()
ans_dict

{'Access_And_Affordability': 0.5129898999418531,
 'Business_Ethics': 0.46075784663359326,
 'Business_Model_Resilience': 0.563500332335631,
 'Competitive_Behavior': 0.2569328298171361,
 'Critical_Incident_Risk_Management': 0.9155208468437195,
 'Customer_Privacy': 0.8906526863574982,
 'Customer_Welfare': 0.4263412803411484,
 'Data_Security': 0.09843863546848297,
 'Director_Removal': 0.10258904844522476,
 'Ecological_Impacts': 0.784868635237217,
 'Employee_Engagement_Inclusion_And_Diversity': 0.7266522701829672,
 'Employee_Health_And_Safety': 0.7056979631835764,
 'Energy_Management': 0.6705785915255547,
 'GHG_Emissions': 0.8084544738133749,
 'Human_Rights_And_Community_Relations': 0.7366857584565878,
 'Labor_Practices': 0.69296253153256,
 'Management_Of_Legal_And_Regulatory_Framework': 0.3810105363595976,
 'Physical_Impacts_Of_Climate_Change': 0.6137672662734985,
 'Product_Design_And_Lifecycle_Management': 0.7170253269947492,
 'Product_Quality_And_Safety': 0.7358072532547845,
 'Selling_Pr

In [None]:
# Label vocabulary, kept in the order it was originally written.
# NOTE(review): presumably the classifier's label set - confirm whether the
# order is meant to mirror the model's id-to-label mapping.
label_names = [
    "Business_Ethics",
    "Data_Security",
    "Access_And_Affordability",
    "Business_Model_Resilience",
    "Competitive_Behavior",
    "Critical_Incident_Risk_Management",
    "Customer_Welfare",
    "Director_Removal",
    "Employee_Engagement_Inclusion_And_Diversity",
    "Employee_Health_And_Safety",
    "Human_Rights_And_Community_Relations",
    "Labor_Practices",
    "Management_Of_Legal_And_Regulatory_Framework",
    "Physical_Impacts_Of_Climate_Change",
    "Product_Quality_And_Safety",
    "Product_Design_And_Lifecycle_Management",
    "Selling_Practices_And_Product_Labeling",
    "Supply_Chain_Management",
    "Systemic_Risk_Management",
    "Waste_And_Hazardous_Materials_Management",
    "Water_And_Wastewater_Management",
    "Air_Quality",
    "Customer_Privacy",
    "Ecological_Impacts",
    "Energy_Management",
    "GHG_Emissions",
]

label_names

['Business_Ethics',
 'Data_Security',
 'Access_And_Affordability',
 'Business_Model_Resilience',
 'Competitive_Behavior',
 'Critical_Incident_Risk_Management',
 'Customer_Welfare',
 'Director_Removal',
 'Employee_Engagement_Inclusion_And_Diversity',
 'Employee_Health_And_Safety',
 'Human_Rights_And_Community_Relations',
 'Labor_Practices',
 'Management_Of_Legal_And_Regulatory_Framework',
 'Physical_Impacts_Of_Climate_Change',
 'Product_Quality_And_Safety',
 'Product_Design_And_Lifecycle_Management',
 'Selling_Practices_And_Product_Labeling',
 'Supply_Chain_Management',
 'Systemic_Risk_Management',
 'Waste_And_Hazardous_Materials_Management',
 'Water_And_Wastewater_Management',
 'Air_Quality',
 'Customer_Privacy',
 'Ecological_Impacts',
 'Energy_Management',
 'GHG_Emissions']

In [None]:
# Category -> classifier labels that feed it.
# Some labels appear under more than one category; labels not listed here
# do not contribute to any category.
categories = {
    "Emission": [
        "GHG_Emissions",
        "Ecological_Impacts",
        "Energy_Management",
        "Physical_Impacts_Of_Climate_Change",
    ],
    "Resource Use": [
        "Waste_And_Hazardous_Materials_Management",
        "Water_And_Wastewater_Management",
    ],
    "Innovation": [
        "Product_Design_And_Lifecycle_Management",
    ],
    "Workforce": [
        "Employee_Engagement_Inclusion_And_Diversity",
        "Employee_Health_And_Safety",
    ],
    "Community": [
        "Customer_Privacy",
        "Customer_Welfare",
    ],
    "Human Rights": [
        "Human_Rights_And_Community_Relations",
    ],
    "Product Responsibility": [
        "Business_Model_Resilience",
        "Product_Quality_And_Safety",
        "Supply_Chain_Management",
    ],
    "Management": [
        "Director_Removal",
        "Management_Of_Legal_And_Regulatory_Framework",
    ],
    "Shareholders": [
        "Business_Model_Resilience",
        "Supply_Chain_Management",
        "Systemic_Risk_Management",
    ],
}

# Class (E / S / G) -> categories that belong to it
class_distribution = {
    "E": [
        "Emission",
        "Resource Use",
        "Innovation",
    ],
    "S": [
        "Workforce",
        "Community",
        "Human Rights",
        "Product Responsibility",
    ],
    "G": [
        "Management",
        "Shareholders",
    ],
}

In [None]:
# Running score total and number of contributing labels for each category
category_sums = dict.fromkeys(categories, 0)
category_counts = dict.fromkeys(categories, 0)

# Walk the per-label mean scores in ans_dict and credit every category that
# lists the label (a label may belong to more than one category)
for label, score in ans_dict.items():
    for category, labels_in_category in categories.items():
        if label in labels_in_category:
            category_sums[category] += score
            category_counts[category] += 1

# Average per category. A category none of whose labels occur in ans_dict has
# a count of zero; report NaN for it instead of raising ZeroDivisionError.
category_averages = {
    category: (
        category_sums[category] / category_counts[category]
        if category_counts[category]
        else float("nan")
    )
    for category in categories
}

print(category_averages)

{'Emission': 0.7194172417124113, 'Resource Use': 0.9482856350285667, 'Innovation': 0.7170253269947492, 'Workforce': 0.7161751166832717, 'Community': 0.6584969833493233, 'Human Rights': 0.7366857584565878, 'Product Responsibility': 0.7058813269570391, 'Management': 0.24179979240241117, 'Shareholders': 0.6480402136369356}


In [None]:
# Roll the category averages up into one unweighted mean per class (E / S / G)
class_averages = {
    class_label: sum(category_averages[name] for name in member_categories) / len(member_categories)
    for class_label, member_categories in class_distribution.items()
}

print(class_averages)

{'E': 0.7949094012452423, 'S': 0.7043097963615554, 'G': 0.4449200030196734}
