# ESG Data Exploration

This notebook introduces key ESG concepts and provides interactive examples for exploring ESG data.

## 1. Understanding ESG Components

ESG stands for Environmental, Social, and Governance:

- **Environmental**: Climate change, carbon emissions, water usage, waste management
- **Social**: Employee relations, diversity, human rights, community impact
- **Governance**: Board composition, executive compensation, shareholder rights

Let's explore these components in real ESG reports.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import pdfplumber
import re
from collections import Counter

# Plot styling: apply seaborn's default theme, then switch the default
# color cycle to the 'husl' palette for every figure drawn below.
sns.set_theme()  # Recommended way to set the style in newer seaborn versions
sns.set_palette('husl')

## 2. ESG Keywords and Metrics

Let's define some common ESG keywords to look for in reports:

In [None]:
# Keyword phrases to search for in a report, grouped by ESG pillar.
# Entries are written in lower case because they are matched against
# lower-cased report text.
esg_keywords = {
    # Environmental pillar
    'environmental': [
        'carbon emissions', 'climate change', 'renewable energy', 'waste management',
        'water consumption', 'biodiversity', 'pollution', 'recycling'
    ],
    # Social pillar
    'social': [
        'diversity', 'inclusion', 'human rights', 'employee safety', 'community',
        'labor practices', 'data privacy', 'health'
    ],
    # Governance pillar
    'governance': [
        'board diversity', 'executive compensation', 'shareholder rights',
        'ethics', 'compliance', 'transparency', 'risk management'
    ]
}

## 3. Analyzing ESG Report Content

Let's analyze our sample ESG report to see the frequency of ESG-related terms:

In [None]:
def analyze_esg_content(pdf_path):
    """Count occurrences of the ``esg_keywords`` phrases in a PDF report.

    Args:
        pdf_path: Path of the PDF file to read with pdfplumber.

    Returns:
        A dict with one entry per category of ``esg_keywords``. Each value is
        a dict mapping a keyword to its number of occurrences; keywords that
        never occur are omitted, so a category may map to an empty dict.
    """
    # Extract text once per page. Pages without a text layer can yield
    # None or an empty string, which would break (or pollute) the join.
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        text = '\n'.join(page_text for page_text in page_texts if page_text)

    # Lower-case once instead of once per keyword.
    lowered = text.lower()

    keyword_counts = {category: {} for category in esg_keywords}
    for category, words in esg_keywords.items():
        for word in words:
            # Treat the keyword as literal text, but let any run of whitespace
            # separate its words: extracted PDF text wraps lines, so a phrase
            # such as "carbon emissions" may be split by a newline.
            pattern = r'\s+'.join(re.escape(part) for part in word.split())
            count = len(re.findall(pattern, lowered))
            if count > 0:
                keyword_counts[category][word] = count

    return keyword_counts

# Path to your PDF file
pdf_path = os.path.join('..', 'data', 'totalenergies_sustainability-climate-2024-progress-report_2024_en_pdf.pdf')

# Analyze content if file exists
if os.path.exists(pdf_path):
    keyword_counts = analyze_esg_content(pdf_path)

    # Draw one bar chart per category, side by side. The number of panels is
    # taken from the result so the layout keeps working if categories are
    # added to or removed from the keyword dictionary.
    n_categories = len(keyword_counts)
    plt.figure(figsize=(15, 5))
    for i, (category, counts) in enumerate(keyword_counts.items()):
        plt.subplot(1, n_categories, i + 1)
        # Title every panel, so a category with no hits is still identifiable
        # instead of showing up as an anonymous empty axis.
        plt.title(f'{category.capitalize()} Keywords')
        if counts:  # If we found any keywords
            plt.bar(list(counts.keys()), list(counts.values()))
            plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
else:
    print("PDF file not found. Please check the path.")

## 4. Exercise: ESG Metric Extraction

Try to extract specific ESG metrics from the report. Here are example regular-expression patterns for finding carbon-emission and renewable-energy values:

In [None]:
def extract_esg_metrics(text):
    """Collect carbon and renewable-energy figures mentioned in ``text``.

    Every pattern is applied case-insensitively to the text as given. For
    each category the result lists the full matched snippets (number plus
    unit/context), grouped pattern by pattern in the order declared below.

    Args:
        text: Report text to scan.

    Returns:
        A dict with the keys ``'carbon_metrics'`` and ``'renewable_metrics'``,
        each mapping to a list of matched strings (possibly empty).
    """
    patterns_by_category = {
        # Absolute emissions, carbon intensity, and percentage reductions.
        'carbon_metrics': (
            r'(\d+(?:\.\d+)?)\s*(?:MtCO2e|million tons of CO2|Million tons CO2)',
            r'(\d+(?:\.\d+)?)\s*(?:gCO2/kWh|gCO2e/kWh)',
            r'(\d+(?:\.\d+)?)\s*(?:%|percent)\s*(?:reduction|decrease)\s*(?:in|of)\s*(?:emissions|carbon)',
        ),
        # Installed capacity, share of the mix, and generated energy.
        'renewable_metrics': (
            r'(\d+(?:\.\d+)?)\s*(?:GW|gigawatts?)\s*(?:of|installed)?\s*(?:renewable|solar|wind)',
            r'(\d+(?:\.\d+)?)\s*(?:%|percent)\s*(?:renewable|solar|wind)\s*(?:energy|capacity)',
            r'(\d+(?:\.\d+)?)\s*(?:TWh|terawatt hours?)\s*(?:of|generated)?\s*(?:renewable|solar|wind)',
        ),
    }

    found = {}
    for category, patterns in patterns_by_category.items():
        snippets = []
        for pattern in patterns:
            for hit in re.finditer(pattern, text, re.IGNORECASE):
                # group() with no argument is the whole match, not just the number.
                snippets.append(hit.group())
        found[category] = snippets

    return found

# Example usage (if PDF exists)
if os.path.exists(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page once and skip pages that yield no text (None or
        # an empty string), which would otherwise break the join.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = '\n'.join(page_text for page_text in page_texts if page_text)

    # The PDF is no longer needed once the text has been extracted.
    metrics = extract_esg_metrics(text)

    print("ESG Metrics Found:")
    print("-" * 50)

    for category, found_metrics in metrics.items():
        if found_metrics:
            print(f"\n{category.replace('_', ' ').title()}:")
            for metric in found_metrics[:5]:  # Show first 5 matches
                print(f"- {metric}")
        else:
            print(f"\nNo {category.replace('_', ' ')} found.")

## 5. Practice Exercises

Try these exercises to practice ESG data analysis:

1. Create a function to extract diversity metrics (e.g., percentage of women in workforce)
2. Create a function to extract investment metrics (e.g., amounts invested in renewable energy)
3. Create a function to extract water consumption metrics (e.g., water withdrawal, water recycling rates)
4. Create a function to extract waste management metrics (e.g., waste generation, recycling rates)

Example solutions:

In [None]:
def extract_diversity_metrics(text):
    """Find percentages that are directly followed by a diversity term.

    The text is lower-cased before matching, so the returned snippets are
    lower-case as well.

    Args:
        text: Report text to scan.

    Returns:
        A list of matched snippets such as ``'42.5% women'``, in order of
        appearance.
    """
    lowered = text.lower()
    # A percentage, optional whitespace, then one of the diversity terms.
    regex = re.compile(r'(\d+(?:\.\d+)?%)\s*(?:women|diverse|minority|representation)')

    snippets = []
    for hit in regex.finditer(lowered):
        snippets.append(hit.group())
    return snippets

# Example solution for exercise 2:
def extract_investment_metrics(text):
    """Extract amounts invested in renewable/clean energy from text.

    The text is lower-cased before matching, so the returned snippets are
    lower-case as well.

    Args:
        text: Report text to scan.

    Returns:
        A list of matched snippets such as
        ``'5 billion usd invested in renewable'``, in order of appearance.
    """
    # The currency codes are spelled in lower case because the pattern is
    # applied to lower-cased text; upper-case 'USD'/'EUR' could never match.
    pattern = r'(\d+(?:\.\d+)?)\s*(?:billion|million)?\s*(?:usd|eur|€|\$)\s*(?:invested|investment|spent)\s*(?:in|for)\s*(?:renewable|solar|wind|clean)'
    matches = re.finditer(pattern, text.lower())
    return [match.group() for match in matches]

# Example solution for exercise 3:
def extract_water_metrics(text):
    """Find water volumes that were withdrawn, consumed, recycled or reused.

    The text is lower-cased before matching, so the returned snippets are
    lower-case as well.

    Args:
        text: Report text to scan.

    Returns:
        A list of matched snippets such as
        ``'3.2 million m3 of water withdrawn'``, in order of appearance.
    """
    # Number, optional magnitude, volume unit, optional "of", the kind of
    # water, and finally what happened to it.
    regex = re.compile(
        r'(\d+(?:\.\d+)?)\s*(?:million|billion)?\s*(?:m3|cubic meters?|liters?)\s*(?:of)?\s*(?:water|wastewater)\s*(?:withdrawn|consumed|recycled|reused)'
    )
    lowered = text.lower()
    return list(map(lambda hit: hit.group(), regex.finditer(lowered)))

# Example solution for exercise 4:
def extract_waste_metrics(text):
    """Find waste tonnages that were generated, recycled or disposed.

    The text is lower-cased before matching, so the returned snippets are
    lower-case as well.

    Args:
        text: Report text to scan.

    Returns:
        A list of matched snippets such as
        ``'12 thousand tonnes of waste generated'``, in order of appearance.
    """
    lowered = text.lower()
    # Number, optional magnitude, tons/tonnes, optional "of", the waste type,
    # and finally what happened to it.
    waste_regex = r'(\d+(?:\.\d+)?)\s*(?:million|thousand)?\s*(?:tons?|tonnes?)\s*(?:of)?\s*(?:waste|hazardous waste)\s*(?:generated|recycled|disposed)'

    snippets = []
    for hit in re.finditer(waste_regex, lowered):
        snippets.append(hit.group())
    return snippets

# Test the functions with the PDF
if os.path.exists(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page once and skip pages that yield no text (None or
        # an empty string), which would otherwise break the join.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = '\n'.join(page_text for page_text in page_texts if page_text)

    # The PDF is no longer needed once the text has been extracted.
    print("Diversity Metrics:")
    print(extract_diversity_metrics(text))

    print("\nInvestment Metrics:")
    print(extract_investment_metrics(text))

    print("\nWater Metrics:")
    print(extract_water_metrics(text))

    print("\nWaste Metrics:")
    print(extract_waste_metrics(text))

## 6. Advanced NLP Analysis

Let's explore more sophisticated NLP techniques to analyze ESG content:

In [None]:
# Import additional NLP libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import nltk
import spacy
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Download spaCy model (run this once)
!python -m spacy download en_core_web_sm

# Load spaCy model
nlp = spacy.load('en_core_web_sm')
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
def preprocess_text(text):
    """Normalize text into a space-separated string of lemmatized words.

    Digits are removed, every character that is neither a word character nor
    whitespace is replaced by a space, the result is lower-cased and split on
    whitespace, tokens of three characters or fewer are dropped, and each
    remaining token is lemmatized with WordNet.

    Args:
        text: Raw text to normalize.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize

    # Strip digits first, then turn punctuation/symbols into separators.
    without_digits = re.sub(r'\d+', '', text)
    cleaned = re.sub(r'[^\w\s]', ' ', without_digits)

    lemmas = []
    for token in cleaned.lower().split():
        if len(token) <= 3:
            # Very short tokens are discarded.
            continue
        lemmas.append(lemmatize(token))

    return ' '.join(lemmas)

def split_text_into_chunks(text, chunk_size=500):
    """Group consecutive sentences of ``text`` into chunks.

    Sentences are accumulated until their space-joined length exceeds
    ``chunk_size`` characters; the accumulated sentences then form one chunk.
    Whatever is left over at the end becomes a final, possibly shorter, chunk.

    Args:
        text: Text to split; sentence boundaries come from ``sent_tokenize``.
        chunk_size: Character length a chunk must exceed before it is closed.

    Returns:
        A list of chunk strings in document order.
    """
    pieces = []
    buffer = []

    for sentence in sent_tokenize(text):
        buffer.append(sentence)
        joined = ' '.join(buffer)
        if len(joined) > chunk_size:
            pieces.append(joined)
            buffer = []

    # Flush the trailing sentences that never reached the size limit.
    if buffer:
        pieces.append(' '.join(buffer))

    return pieces

def perform_topic_modeling(text, n_topics=5, method="nmf", n_top_words=10):
    """Perform topic modeling on a document split into chunks.

    The raw text is first split into sentence-based chunks, and each chunk is
    then preprocessed and used as one document for TF-IDF vectorization.

    Args:
        text: Raw document text (with its punctuation intact).
        n_topics: Number of topics to fit.
        method: ``"lda"`` for Latent Dirichlet Allocation; any other value
            selects NMF.
        n_top_words: Number of highest-weighted words to report per topic.

    Returns:
        A dict mapping ``"Topic <k>"`` (1-based) to the list of that topic's
        top words, highest weight first. Empty if the text yields no usable
        chunk.
    """
    # Chunk BEFORE preprocessing: preprocessing strips punctuation, and
    # without punctuation the sentence tokenizer would see a single sentence,
    # leaving the model with only one document to learn topics from.
    chunks = split_text_into_chunks(text)
    documents = [preprocess_text(chunk) for chunk in chunks]
    # Chunks made only of digits/short words preprocess to an empty string.
    documents = [document for document in documents if document]
    if not documents:
        return {}

    # Use TF-IDF vectorization for better results
    vectorizer = TfidfVectorizer(
        min_df=1,  # Allow words that appear in at least one chunk
        stop_words='english',
        max_features=1000
    )

    doc_term_matrix = vectorizer.fit_transform(documents)

    if method == "lda":
        model = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=42,
            learning_method='batch',
            max_iter=50
        )
    else:  # Default to NMF
        model = NMF(n_components=n_topics, random_state=42, init="random")

    # Only the fitted topic-word weights are needed, not the per-chunk
    # topic assignments.
    model.fit(doc_term_matrix)

    feature_names = vectorizer.get_feature_names_out()

    # Extract top words per topic
    topic_words = {}
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the heaviest words.
        top_words_idx = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in top_words_idx]
        topic_words[f"Topic {topic_idx + 1}"] = top_words

    return topic_words

def extract_named_entities(text):
    """Group the distinct named entities found in ``text`` by entity label.

    The text is run through the module-level spaCy pipeline ``nlp``.

    Args:
        text: Text to analyze.

    Returns:
        A dict mapping each entity label to the list of distinct entity
        texts carrying that label, in order of first appearance.
    """
    grouped = {}

    for entity in nlp(text).ents:
        # Create the label's list on first sight, then de-duplicate by text.
        seen_for_label = grouped.setdefault(entity.label_, [])
        if entity.text not in seen_for_label:
            seen_for_label.append(entity.text)

    return grouped

In [None]:
# Path to your PDF file
pdf_path = os.path.join('..', 'data', 'totalenergies_sustainability-climate-2024-progress-report_2024_en_pdf.pdf')

# 1. Topic Modeling
if os.path.exists(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page once (text extraction is the expensive step) and
        # skip pages that yield no text.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = '\n'.join(page_text for page_text in page_texts if page_text)

    # Perform topic modeling
    topics = perform_topic_modeling(text, n_topics=5, method="nmf")

    print("\nImproved Topic Modeling Results:")
    print("-" * 50)
    for topic, words in topics.items():
        print(f"\n{topic}:")
        print(", ".join(words))

    # 2. Named Entity Recognition
    print("\nNamed Entity Recognition:")
    print("-" * 50)
    entities = extract_named_entities(text[:100000])  # Process first 100K chars

    # Visualize entity distribution (number of distinct entities per label)
    entity_counts = {k: len(v) for k, v in entities.items()}
    plt.figure(figsize=(12, 6))
    plt.bar(list(entity_counts.keys()), list(entity_counts.values()))
    plt.xticks(rotation=45)
    plt.title('Distribution of Named Entity Types')
    plt.tight_layout()
    plt.show()

## 7. Semantic Analysis

Let's analyze the semantic similarity between different passages of the report (paragraphs built by grouping consecutive sentences):

In [None]:
def extract_paragraphs_using_sentences(text, min_length=200):
    """Build paragraphs by grouping consecutive sentences instead of using newlines.

    Sentences are accumulated until their space-joined length exceeds
    ``min_length`` characters; the accumulated sentences then form one
    paragraph. Trailing sentences that never exceed ``min_length`` are
    discarded, so every returned paragraph is longer than ``min_length``.

    Also prints how many paragraphs were extracted.

    Args:
        text: Text to split; sentence boundaries come from ``sent_tokenize``.
        min_length: Character length a paragraph must exceed.

    Returns:
        A list of paragraph strings in document order.
    """
    paragraphs = []
    current_paragraph = []

    for sentence in sent_tokenize(text):  # Use sentence tokenizer
        current_paragraph.append(sentence)
        candidate = " ".join(current_paragraph)

        # Close the paragraph once it has grown past the minimum length.
        if len(candidate) > min_length:
            paragraphs.append(candidate)
            current_paragraph = []

    # Anything still pending here is at most min_length characters long
    # (otherwise the loop would already have closed it), so the short
    # remainder is intentionally dropped. For any non-negative min_length,
    # the original post-loop length check could therefore never succeed.

    print(f"\n✅ Extracted {len(paragraphs)} paragraphs using sentence tokenization.")
    return paragraphs



def analyze_semantic_similarity(text, min_length=200):
    """Compute and plot pairwise paragraph similarity from SBERT embeddings.

    Paragraphs are obtained from ``extract_paragraphs_using_sentences``,
    embedded with the ``all-MiniLM-L6-v2`` sentence-transformer model and
    compared with cosine similarity. A heatmap of the first 20 x 20 entries
    is displayed.

    Args:
        text: Document text to analyze.
        min_length: Forwarded to ``extract_paragraphs_using_sentences``.

    Returns:
        ``(similarity_matrix, paragraphs)``, or ``(None, None)`` when fewer
        than two paragraphs could be extracted.
    """
    paragraphs = extract_paragraphs_using_sentences(text, min_length)

    # A similarity matrix needs at least one pair to compare.
    if len(paragraphs) < 2:
        print("\n❌ Not enough paragraphs for semantic similarity analysis.")
        return None, None

    # Embed every paragraph, then move the tensor to the CPU as a NumPy array.
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(paragraphs, convert_to_tensor=True).cpu().numpy()

    similarity_matrix = cosine_similarity(vectors)

    # Plot only the top-left corner so the annotated cells stay legible.
    preview = similarity_matrix[:20, :20]
    plt.figure(figsize=(10, 10))
    sns.heatmap(preview, cmap='YlOrRd', annot=True)
    plt.title('Semantic Similarity Between First 20 Paragraphs')
    plt.tight_layout()
    plt.show()

    return similarity_matrix, paragraphs

# Path to your PDF file
pdf_path = os.path.join('..', 'data', 'totalenergies_sustainability-climate-2024-progress-report_2024_en_pdf.pdf')

if os.path.exists(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page once (text extraction is the expensive step) and
        # skip pages that yield no text.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "\n".join(page_text for page_text in page_texts if page_text)

    print("Analyzing semantic similarity using SBERT...")

    similarity_matrix, paragraphs = analyze_semantic_similarity(text)

    if paragraphs is None:
        print("Error: Not enough paragraphs extracted for analysis.")
    else:
        print(f"Total paragraphs extracted: {len(paragraphs)}")
        if len(paragraphs) < 5:
            print("Warning: Not enough paragraphs extracted. Try adjusting the paragraph splitting method.")

        # Find most similar paragraph pairs. Only the upper triangle (j > i)
        # is scanned, so each pair is considered once and never with itself.
        n_paragraphs = len(paragraphs)
        most_similar = []

        for i in range(n_paragraphs):
            for j in range(i+1, n_paragraphs):
                if similarity_matrix[i, j] > 0.75:  # Adjust threshold for meaningful similarity
                    most_similar.append((i, j, similarity_matrix[i, j]))

        # Display top 3 most similar paragraph pairs
        most_similar.sort(key=lambda x: x[2], reverse=True)

        if most_similar:
            print("\nMost similar sections (first 100 characters of each):")
            for i, j, sim in most_similar[:3]:
                print(f"\nSimilarity: {sim:.2f}")
                print(f"Paragraph {i}: {paragraphs[i][:100]}...")
                print(f"Paragraph {j}: {paragraphs[j][:100]}...")
        else:
            print("No highly similar paragraph pairs found.")