In [75]:
import fitz  # PyMuPDF
import pandas as pd
import os
import re
import spacy
import torch
from transformers import pipeline
from tqdm import tqdm

In [76]:
"""
Initializes NLP models and configures the computational device (GPU or CPU).
Sets up a spaCy model and two Hugging Face pipelines for classification and
sentiment analysis.
"""

# Enable tqdm progress bars for pandas operations.
tqdm.pandas(desc="Processing")

# Load the small English spaCy model for various NLP tasks.
nlp = spacy.load("en_core_web_sm")

# Set device to GPU (0) if available, otherwise set to CPU (-1).
if torch.cuda.is_available():
    device = 0
    print(f"Using device: GPU")
else:
    device = -1
    print(f"Using device: CPU")

# Initialize the zero-shot classification pipeline on the selected device.
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Initialize the sentiment analysis pipeline (FinBERT) on the selected device.
sentiment_analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert", device=device)

Using device: GPU


Device set to use cuda:0
Device set to use cuda:0


In [77]:
def parse_pdf_to_paragraphs(pdf_path, company_name):
    """Extracts and cleans text paragraphs from a PDF file.

    This function opens a PDF, iterates through each page, and extracts
    text blocks. It cleans up the text and filters for blocks longer than
    50 characters to identify them as paragraphs.

    Args:
        pdf_path (str): The full path to the PDF file.
        company_name (str): The name of the company associated with the document.

    Returns:
        list: A list of dictionaries, where each dictionary represents a
              parsed paragraph and contains its metadata. Returns an empty
              list if the PDF cannot be opened.
    """
    # Attempt to open the PDF file, handling potential errors.
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        # If the file can't be opened, print an error and exit the function.
        print(f"Error opening {pdf_path}: {e}")
        return []

    # Extract the filename from the full path to use as the document name.
    document_name = os.path.basename(pdf_path)
    
    # Initialize a list to store the results.
    parsed_paragraphs = []

    # Iterate through each page of the PDF, keeping track of the page number.
    for page_num, page in enumerate(doc, start=1):
        # Extract all text blocks from the current page.
        blocks = page.get_text("blocks")
        
        # Process each individual text block.
        for block in blocks:
            # The actual text content is at index 4 of the block tuple.
            text_segment = block[4]
            
            # Clean the text by replacing multiple whitespaces with a single space.
            clean_text = re.sub(r'\s+', ' ', text_segment).strip()
            
            # Filter out short text segments to keep only meaningful paragraphs.
            if len(clean_text) > 50:
                # Append the paragraph and its metadata as a dictionary to our list.
                parsed_paragraphs.append({
                    'company': company_name,
                    'document_name': document_name,
                    'page_number': page_num,
                    'paragraph_text': clean_text
                })
                
    # Return the final list of all parsed paragraphs.
    return parsed_paragraphs

In [78]:
ACTION_VERBS = ['reduce', 'increase', 'achieve', 'commit', 'invest', 'launch', 'develop', 'implement', 'strive', 'aim', 'plan', 'set', 'target']
ESG_KEYWORDS = ['emission', 'carbon', 'diversity', 'safety', 'water', 'waste', 'governance', 'ethical', 'supply chain', 'sustainability', 'environmental', 'social', 'inclusion']

In [79]:
def is_claim(sentence_text):
    """Determines if a sentence is a potential claim based on a set of rules.

    A sentence is considered a claim if it:
    1. Is at least 8 words long.
    2. Contains a keyword from the global ESG_KEYWORDS list.
    3. Contains either an action verb from ACTION_VERBS or a number.

    Args:
        sentence_text (str): The input sentence to evaluate.

    Returns:
        bool: True if the sentence meets the claim criteria, otherwise False.
    """
    # Convert the sentence to lowercase for case-insensitive matching.
    text = sentence_text.lower()

    # Rule 1: A claim must have at least 8 words.
    if len(text.split()) < 8:
        return False

    # Rule 2: A claim must contain at least one ESG-related keyword.
    # (Assumes ESG_KEYWORDS is a predefined list of strings).
    if not any(keyword in text for keyword in ESG_KEYWORDS):
        return False

    # Check for the presence of a recognized action verb.
    # (Assumes ACTION_VERBS is a predefined list of strings).
    has_action = any(verb in text for verb in ACTION_VERBS)

    # Check for the presence of any numerical digit.
    has_number = any(char.isdigit() for char in text)

    # Rule 3: Return True if the sentence has an action verb OR a number.
    return has_action or has_number

In [80]:
def extract_claims_from_paragraphs(paragraph_data):
    """Processes paragraphs to identify and extract sentences that are claims.

    This function iterates through a list of paragraph data, uses a spaCy
    model to split each paragraph into sentences, and then evaluates each
    sentence using the `is_claim()` helper function.

    Args:
        paragraph_data (list): A list of dictionaries, where each dictionary
                               contains a 'paragraph_text' and its metadata.

    Returns:
        list: A new list of dictionaries, where each dictionary represents
              a single extracted claim and its original metadata.
    """
    # Initialize a list to store the results.
    extracted_claims = []

    # Iterate through each dictionary entry in the input list.
    for entry in paragraph_data:
        # Process the paragraph text with the pre-loaded spaCy model to parse it.
        doc = nlp(entry['paragraph_text'])

        # Loop over each sentence segmented by spaCy.
        for sent in doc.sents:
            # Get the sentence's text and remove leading/trailing whitespace.
            sentence_text = sent.text.strip()

            # Use the helper function to determine if the sentence is a claim.
            if is_claim(sentence_text):
                # If it is a claim, append its details to the results list.
                extracted_claims.append({
                    'company': entry['company'],
                    'document_name': entry['document_name'],
                    'page_number': entry['page_number'],
                    'claim': sentence_text
                })

    # Return the final list containing all identified claims.
    return extracted_claims

In [81]:
esg_topics = ["Greenhouse Gas Emissions", "Air Quality", "Water & Wastewater Management", "Energy Management", "Waste Management", "Ecological Impacts", "Employee Health & Safety", "Diversity & Inclusion", "Labor Practices", "Supply Chain Labor Standards", "Data Security", "Customer Privacy", "Product Safety", "Business Ethics", "Competitive Behavior", "Corporate Governance"]

In [82]:
def classify_esg_topic(claim_text, candidate_labels, confidence_threshold=0.7):
    """Classifies a text into one of several candidate ESG labels.

    Uses a pre-initialized zero-shot classification pipeline to categorize the
    input text. The classification is only accepted if the model's confidence
    score for the top label exceeds the specified threshold.

    Args:
        claim_text (str): The sentence or claim to be classified.
        candidate_labels (list): A list of strings representing the potential
                                 ESG topics (e.g., ['Climate', 'Diversity']).
        confidence_threshold (float, optional): The minimum confidence score
                                                 required to assign a label.
                                                 Defaults to 0.7.

    Returns:
        str: The most likely ESG label if its score is above the threshold,
             otherwise returns "Unclassified".
    """
    # Handle invalid input: return "Unclassified" if the text is not a valid string.
    if not isinstance(claim_text, str) or not claim_text:
        return "Unclassified"

    # Run the zero-shot classification model on the input text.
    # multi_label=False ensures the scores sum to 1, forcing a single best choice.
    result = zero_shot_classifier(claim_text, candidate_labels, multi_label=False)

    # Extract the label with the highest probability score.
    top_label = result['labels'][0]
    # Extract the corresponding highest score.
    top_score = result['scores'][0]

    # Compare the top score against the confidence threshold.
    if top_score > confidence_threshold:
        # If the model is confident, return the predicted label.
        return top_label
    else:
        # If the score is too low, the result is considered unclassified.
        return "Unclassified"

In [83]:
def analyze_sentiment(claim_text):
    """Analyzes the sentiment of a given text and returns a normalized score.

    This function uses a pre-initialized sentiment analysis pipeline to evaluate
    a text. It returns a score between -1.0 and 1.0, where positive values
    indicate positive sentiment, negative values indicate negative sentiment,
    and 0 represents neutral sentiment or invalid input.

    Args:
        claim_text (str): The text whose sentiment is to be analyzed.

    Returns:
        float: A sentiment score from -1.0 to 1.0.
    """
    # Return a neutral score for empty or non-string inputs.
    if not isinstance(claim_text, str) or not claim_text:
        return 0

    # Run the pre-loaded sentiment analysis pipeline on the input text.
    results = sentiment_analyzer(claim_text)

    # Handle cases where the model might not return a result.
    if not results:
        return 0

    # The pipeline returns a list; the main result is the first element.
    result = results[0]
    
    # Extract the confidence score and the sentiment label.
    score = result['score']
    label = result['label']

    # Convert the label and score into a single normalized value.
    if label == 'positive':
        # Return the score directly for positive sentiment.
        return score
    elif label == 'negative':
        # Invert the score for negative sentiment.
        return -score
    else:
        # Return 0 for neutral sentiment.
        return 0

In [84]:
def calculate_specificity_score(claim_text):
    """Calculates a normalized specificity score for a given text.

    This function scores a claim on a scale of 0.0 to 1.0 based on how
    specific it is. Points are awarded for the presence of numbers, future
    years, percentages, and specific ESG-related metric keywords.

    Args:
        claim_text (str): The text to be scored for specificity.

    Returns:
        float: A normalized score between 0.0 and 1.0.
    """
    # Return 0.0 for empty or invalid string input.
    if not isinstance(claim_text, str) or not claim_text:
        return 0.0

    # Initialize score and set the maximum possible score based on 4 rules.
    score, max_score = 0, 4.0

    # Rule 1: Award a point if the text contains any number.
    if re.search(r'\d', claim_text):
        score += 1

    # Rule 2: Award a point if a future year (e.g., 2025, 2030) is mentioned.
    if re.search(r'\b(20[2-9][0-9])\b', claim_text):
        score += 1

    # Rule 3: Award a point if a percentage is mentioned.
    if '%' in claim_text or 'percent' in claim_text.lower():
        score += 1

    # Rule 4: Award a point for mentioning specific, common ESG metrics.
    specific_metrics = ['scope 1', 'scope 2', 'scope 3', 'tco2e', 'kwh', 'mwh', 'baseline']
    if any(metric in claim_text.lower() for metric in specific_metrics):
        score += 1
        
    # Normalize the final score to a value between 0.0 and 1.0.
    return score / max_score

In [85]:
HEDGING_PHRASES = ['aim to', 'strive to', 'plan to', 'work towards', 'hope to', 'could', 'potentially', 'where feasible', 'in the future']

In [86]:
def calculate_hedging_score(claim_text):
    """Calculates a binary score indicating the presence of hedging language.

    This function checks if any predefined hedging phrases (from the global
    HEDGING_PHRASES list) are present in the input text. Hedging language
    often signals a lack of firm commitment (e.g., "we aim to", "we hope").

    Args:
        claim_text (str): The text to be analyzed for hedging.

    Returns:
        int: Returns 1 if a hedging phrase is found, otherwise returns 0.
    """
    # The `any()` function returns True if a match is found, which is cast to an integer (1).
    # If no matches are found, it returns False, which becomes 0.
    return int(any(phrase in claim_text.lower() for phrase in HEDGING_PHRASES))

In [87]:
def calculate_greenwashing_risk(row):
    """Calculates a greenwashing risk score based on several factors.

    This function computes a risk score for a single claim (represented by a row)
    by combining its specificity, sentiment, and hedging scores. The formula is
    designed so that risk increases with vagueness (low specificity), positive
    sentiment, and the presence of hedging language.

    Args:
        row (pandas.Series): A row from a DataFrame that must contain the keys
                             'specificity_score', 'sentiment_score', and
                             'hedging_score'.

    Returns:
        float: The calculated greenwashing risk score, rounded to 3 decimal places.
    """
    # Extract the individual scores from the input row for clarity.
    specificity_score = row['specificity_score']
    sentiment_score = row['sentiment_score']
    hedging_score = row['hedging_score']

    # Isolate the positive sentiment component, as only positive claims contribute to risk.
    positive_sentiment = max(0, sentiment_score)
    
    # Define the weights for each factor in the risk calculation.
    w_spec = 0.5   # Specificity is the most heavily weighted factor.
    w_sent = 0.25  # Positive sentiment is a moderate risk factor.
    w_hedge = 0.25 # Hedging language is also a moderate risk factor.
    
    # Invert the specificity score because low specificity (vagueness) increases risk.
    inverse_specificity = 1 - specificity_score
    
    # Calculate the final risk score as a weighted sum of the factors.
    risk_score = (w_spec * inverse_specificity) + (w_sent * positive_sentiment) + (w_hedge * hedging_score)
          
    # Return the final score, rounded for cleaner output.
    return round(risk_score, 3)

In [88]:
# This dictionary maps company names to the file paths of their ESG reports.
# Each key is a string representing the company, and the corresponding value is the
# relative path to its PDF report file.
report_files = {
    'SekiSui': 'reports/sekisui/ESG_factbook_en.pdf',
    'Grasim' : 'reports/Grasim/grasim-industries-esg-data-book-2023-24.pdf',
    'Johnosn&Johnson': 'reports/J&J/johnson-johnson-2024-health-for-humanity-report.pdf'
}

In [89]:
# --- Task 1: Data Ingestion and Claim Extraction ---
# This block iterates through the defined report files, parses each PDF,
# and extracts sentences that are identified as potential claims.

# Log a message to indicate the start of the first major processing stage.
print("--- Running Tasks 1: Ingestion and Extraction ---")

# Initialize a single list to store all claims extracted from all documents.
all_extracted_claims = []

# Loop through each company and its corresponding report file path.
for company, path in report_files.items():
    # Before processing, verify that the PDF file actually exists at the given path.
    if not os.path.exists(path):
        # If the file is not found, print a warning and skip to the next item in the loop.
        print(f"--- WARNING: File not found for {company} at {path}. Skipping. ---")
        continue

    # Announce the start of processing for the current company's report.
    print(f"\nProcessing report for: {company}...")

    # Step 1.1: Parse the PDF document to extract text into structured paragraphs.
    paragraphs = parse_pdf_to_paragraphs(path, company)

    # Step 1.2: Sift through the paragraphs to identify and extract claim sentences.
    claims = extract_claims_from_paragraphs(paragraphs)

    # Add the list of claims from the current document to the master list.
    all_extracted_claims.extend(claims)

--- Running Tasks 1: Ingestion and Extraction ---

Processing report for: SekiSui...

Processing report for: Grasim...

Processing report for: Johnosn&Johnson...


In [90]:
# --- Task 2: Claim Classification ---
# This block converts the extracted claims into a pandas DataFrame and then
# classifies each claim into a predefined ESG topic.

# Log a message to indicate the start of the classification stage.
print("--- Running Tasks 2: Classification ---")

# Convert the list of claim dictionaries into a pandas DataFrame for easier analysis.
df_claims = pd.DataFrame(all_extracted_claims)

# Create a new 'esg_topic' column by classifying each claim's text.
# 'progress_apply' is used to show a progress bar during this potentially long-running task.
# (Assumes 'esg_topics' is a predefined list of candidate labels).
df_claims['esg_topic'] = df_claims['claim'].progress_apply(lambda x: classify_esg_topic(x, esg_topics))

--- Running Tasks 2: Classification ---


Processing: 100%|██████████| 629/629 [07:05<00:00,  1.48it/s]


In [91]:
# --- Task 3: Sentiment and Specificity Analysis ---
# This block analyzes each claim to determine its sentiment and how
# specific its language is, adding the results as new columns to the DataFrame.

# Log a message indicating the start of the analysis stage.
print("\n--- Running Task 3: Sentiment and Specificity Analysis ---")

# Create a 'sentiment_score' column by applying the sentiment analysis function to each claim.
# 'progress_apply' displays a progress bar for this operation.
df_claims['sentiment_score'] = df_claims['claim'].progress_apply(analyze_sentiment)

# Create a 'specificity_score' column by applying the specificity scoring function to each claim.
df_claims['specificity_score'] = df_claims['claim'].progress_apply(calculate_specificity_score)


--- Running Task 3: Sentiment and Specificity Analysis ---


Processing: 100%|██████████| 629/629 [00:09<00:00, 65.52it/s]
Processing: 100%|██████████| 629/629 [00:00<00:00, 78518.37it/s]


In [92]:
# --- Task 4: Hedging and Greenwashing Risk Calculation ---
# This block identifies hedging language in claims and then calculates a final
# greenwashing risk score based on all previously computed metrics.

# Log a message to indicate the start of the final analysis stage.
print("\n--- Running Task 4: Greenwashing Risk Detection ---")

# Create a 'hedging_score' column by applying the hedging detection function.
# This will result in a binary score (1 for hedging, 0 otherwise).
df_claims['hedging_score'] = df_claims['claim'].progress_apply(calculate_hedging_score)

# Create the final 'greenwashing_risk_score' column by applying the risk calculation.
# The function is applied row-wise (axis=1) because it needs access to multiple
# columns (specificity, sentiment, hedging) for each claim.
df_claims['greenwashing_risk_score'] = df_claims.progress_apply(calculate_greenwashing_risk, axis=1)


--- Running Task 4: Greenwashing Risk Detection ---


Processing: 100%|██████████| 629/629 [00:00<00:00, 89778.03it/s]
Processing: 100%|██████████| 629/629 [00:00<00:00, 17239.53it/s]


In [93]:
# --- Final Summary ---
# This block prepares and displays a clean summary table of the results,
# showing the highest-risk claims for each company.

# Log a message indicating that all processing is complete.
print("\n--- All Tasks Complete: Final Summary Table ---")

# Define the specific columns to be included in the final report.
final_columns = [
    'company', 
    'esg_topic', 
    'claim', 
    'sentiment_score', 
    'specificity_score', 
    'greenwashing_risk_score'
]

# Create a new, clean DataFrame for the summary.
# This involves selecting the desired columns and renaming them for better readability.
df_summary = df_claims[final_columns].rename(columns={
    'company': 'Company', 'esg_topic': 'ESG Topic', 'claim': 'Extracted Claim', 
    'sentiment_score': 'Sentiment Score', 'specificity_score': 'Specificity Score', 
    'greenwashing_risk_score': 'Greenwashing Risk Score'
})

# Sort the summary table to group by company, and then show the highest risk claims first.
# 'inplace=True' modifies the DataFrame directly.
df_summary.sort_values(by=['Company', 'Greenwashing Risk Score'], ascending=[True, False], inplace=True)

# Display the first 10 rows of the final summary DataFrame.
df_summary.head(10)


--- All Tasks Complete: Final Summary Table ---


Unnamed: 0,Company,ESG Topic,Extracted Claim,Sentiment Score,Specificity Score,Greenwashing Risk Score
461,Grasim,Unclassified,This initiative has led several of our units t...,0.947836,0.0,0.737
466,Grasim,Unclassified,"This process leads to significant chemical, wa...",0.858855,0.0,0.715
462,Grasim,Unclassified,"Additionally, the company presents Pride Award...",0.837551,0.0,0.709
469,Grasim,Unclassified,"By embedding this price into its operations, U...",0.833619,0.0,0.708
460,Grasim,Unclassified,The chemical business is one of our most energ...,0.7982,0.0,0.7
453,Grasim,Unclassified,Through continuous investments in cutting-edge...,0.777195,0.0,0.694
475,Grasim,Unclassified,"To further our commitment to sustainability, G...",0.742046,0.0,0.686
480,Grasim,Unclassified,• Developing green belts to restore mined area...,0.676902,0.0,0.669
459,Grasim,Unclassified,Each KPI carries a specific weight and focuses...,0.564947,0.0,0.641
438,Grasim,Unclassified,"He guides the management to ensure Governance,...",0.550691,0.0,0.638


In [94]:
df_summary[(df_summary["ESG Topic"] != "Unclassified")].head()

Unnamed: 0,Company,ESG Topic,Extracted Claim,Sentiment Score,Specificity Score,Greenwashing Risk Score
484,Grasim,Diversity & Inclusion,• 8-10 Section Heads / FLOs • Talent pool • Ag...,0.0,0.25,0.375
435,Grasim,Corporate Governance,As mentioned in the corporate governance secti...,0.0,0.5,0.25
439,Grasim,Greenhouse Gas Emissions,Parameters Unit FY 2021 FY 2022 FY 2023 FY 202...,0.0,0.5,0.25
440,Grasim,Greenhouse Gas Emissions,❖ Indirect Greenhouse Gas Emissions (Scope 2) ...,0.0,0.5,0.25
441,Grasim,Greenhouse Gas Emissions,Parameters Unit FY 2021 FY 2022 FY 2023 FY 202...,0.0,0.75,0.125
