# ESG Report Data Preprocessing

This notebook extracts the text of ESG reports from PDF files, cleans it, and saves the result as JSON for further analysis.

## Setup
First, we'll import the necessary libraries and download required NLTK resources.

In [None]:
import os
import pandas as pd
import json
import re
import nltk
import pdfplumber
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary resources: the tokenizer data ('punkt', 'punkt_tab')
# needed by word_tokenize and the stopword lists read through
# nltk.corpus.stopwords in clean_text.
# NOTE(review): presumably both 'punkt' and 'punkt_tab' are fetched to cover
# different NLTK versions — confirm before removing either.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

## Configure Paths
Set up the directory paths for input and output data.

In [None]:
# Locate the data directory relative to the notebook's working directory.
notebook_dir = os.getcwd()
# The project root is two directory levels above the notebooks directory.
_notebooks_parent = os.path.dirname(notebook_dir)
project_root = os.path.dirname(_notebooks_parent)
data_dir = os.path.join(project_root, "day_1", "data")
# Create the directory if needed; an existing directory is left as-is.
os.makedirs(data_dir, exist_ok=True)

print(f"Data directory: {data_dir}")

## Define Processing Functions
These functions handle PDF text extraction and text cleaning.

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, with a newline
        appended after each page. A page without extractable text
        contributes only that newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None (depending on the pdfplumber
        # version) for pages with no text layer, e.g. scanned images.
        # Fall back to "" so building the result never raises TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "".join(f"{page_text}\n" for page_text in page_texts)

def clean_text(text):
    """Clean and preprocess extracted text.

    The text is lowercased, whitespace runs are collapsed, every character
    outside letters, digits and ``.,!?%€$`` is replaced by a space, the
    result is tokenized with ``word_tokenize`` and English stopwords are
    dropped.

    Args:
        text: Raw text, e.g. as returned by ``extract_text_from_pdf``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build the stopword set once per call: calling stopwords.words() inside
    # the filter re-creates the list for every token, and a set makes each
    # membership test O(1) instead of a linear scan.
    english_stopwords = set(stopwords.words('english'))

    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    text = re.sub(r'[^a-zA-Z0-9.,!?%€$]', ' ', text)  # Remove special characters
    words = word_tokenize(text)
    return " ".join([word for word in words if word not in english_stopwords])

## Load Dataset
Define the sample dataset with ESG report information.

In [None]:
# Sample dataset (Placeholder: Replace with real ESG reports)
# Each record holds the company name, the report year and the PDF file name;
# process_esg_reports resolves the file name inside the data directory.
data_sample = [
    {
        "company": "TotalEnergies",
        "year": 2024,
        "file": "totalenergies_sustainability-climate-2024-progress-report_2024_en_pdf.pdf",
    },
]

dataframe = pd.DataFrame(data=data_sample)
print("Sample ESG Dataset Loaded:")
display(dataframe)

## Process Reports
Define and run the main processing function.

In [None]:
def process_esg_reports(dataframe, data_dir):
    """Turn each listed ESG report PDF into a cleaned-text record.

    Args:
        dataframe: Table with "company", "year" and "file" columns, one row
            per report.
        data_dir: Directory in which each row's "file" is looked up.

    Returns:
        A list of dicts with the keys "company", "year" and "cleaned_text",
        one per report whose file was found. Missing files are reported on
        stdout and skipped.
    """
    results = []
    progress = tqdm(dataframe.iterrows(), total=len(dataframe))
    for _, report in progress:
        pdf_path = os.path.join(data_dir, report["file"])
        print(f"Attempting to read file: {pdf_path}")
        found = os.path.exists(pdf_path)
        print(f"File exists: {found}")

        # Guard clause: nothing to extract when the PDF is missing.
        if not found:
            print(f"File {pdf_path} not found!")
            continue

        print("Reading PDF file...")
        extracted = extract_text_from_pdf(pdf_path)
        print(f"Extracted text length: {len(extracted)}")
        cleaned = clean_text(extracted)
        print(f"Cleaned text length: {len(cleaned)}")

        record = {
            "company": report["company"],
            "year": report["year"],
            "cleaned_text": cleaned,
        }
        results.append(record)
    return results

# Process ESG reports: run the pipeline over every row of the sample dataset,
# looking up each PDF inside data_dir. The result is a list of dicts.
processed_data = process_esg_reports(dataframe, data_dir)

## Save Results
Save the processed data to a JSON file.

In [None]:
# Save as JSON
output_path = os.path.join(data_dir, "processed_esg_data.json")
# Pin the file encoding so the output does not depend on the platform's
# default locale encoding.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(processed_data, f, indent=4)

print(f"Processing complete. Cleaned ESG data saved to {output_path}")