In [None]:
import re
from pathlib import Path

In [1]:
def normalize_text(text):
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

def get_sentences(text):
    doc = nlp(text)
    return [s.text.strip() for s in doc.sents if len(s.text.strip()) > 30]


In [4]:
def is_non_linguistic(s):
    digit_ratio = sum(c.isdigit() for c in s) / len(s)
    if digit_ratio > 0.3:
        return True
    if s.isupper() and len(s.split()) > 6:
        return True
    return False


In [5]:
CLAIM_VERBS = [
    "commit", "aim", "target", "reduce", "achieve", "improve",
    "ensure", "support", "invest", "transition", "align",
    "pledge", "increase", "decrease"
]

SUBJECT_HINTS = ["we", "our", "company", "group", "organization"]

def is_claim_sentence(s):
    s_low = s.lower()
    if not any(v in s_low for v in CLAIM_VERBS):
        return False
    if not any(sub in s_low for sub in SUBJECT_HINTS):
        return False
    return True


In [6]:
def clean_txt_for_greenwashing(text):
    text = normalize_text(text)
    sentences = get_sentences(text)

    final = []
    for s in sentences:
        if is_non_linguistic(s):
            continue
        if not is_claim_sentence(s):
            continue
        final.append(s)

    return final


In [9]:
from pathlib import Path

INPUT_DIR = Path("./dataset/extracted_text")
OUTPUT_DIR = Path("./dataset/cleaned_sentences")
OUTPUT_DIR.mkdir(exist_ok=True)

for file in INPUT_DIR.glob("*.txt"):
    raw = file.read_text(encoding="utf-8", errors="ignore")
    clean_sentences = clean_txt_for_greenwashing(raw)

    (OUTPUT_DIR / file.name).write_text(
        "\n".join(clean_sentences),
        encoding="utf-8"
    )

    print(f"{file.name}: {len(clean_sentences)} sentences kept")


american-forest-paper-association_2016_esg.txt: 23 sentences kept
american-forest-paper-association_2018_esg.txt: 26 sentences kept
american-forest-paper-association_2020_esg.txt: 45 sentences kept
LSE_RIGD_2021_esg.txt: 227 sentences kept
LSE_RIGD_2022_esg.txt: 363 sentences kept
LSE_RIGD_2023_esg.txt: 117 sentences kept
LSE_RIGD_2024_esg.txt: 102 sentences kept
NASDAQ_NVDA_2022_esg.txt: 134 sentences kept
NASDAQ_NVDA_2023_esg.txt: 163 sentences kept
NASDAQ_NVDA_2024_esg.txt: 112 sentences kept
NYSE_BA_2021_esg.txt: 178 sentences kept
NYSE_BA_2022_esg.txt: 190 sentences kept
NYSE_CRI_2020_esg.txt: 68 sentences kept
NYSE_CRI_2021_esg.txt: 101 sentences kept
NYSE_CRI_2022_esg.txt: 107 sentences kept
NYSE_MUR_2021_esg.txt: 229 sentences kept
NYSE_MUR_2022_esg.txt: 304 sentences kept
NYSE_MUR_2023_esg.txt: 427 sentences kept
NYSE_MUR_2024_esg.txt: 389 sentences kept
NYSE_VEDL_2021_esg.txt: 234 sentences kept
NYSE_VEDL_2023_esg.txt: 435 sentences kept
NYSE_VEDL_2024_esg.txt: 494 sentences 