In [None]:
# data extraction


import os
import re
import pandas as pd
import fitz  # PyMuPDF

# In notebooks/Colab, install BEFORE importing fitz on a fresh runtime:
# !pip install PyMuPDF

def filename_creator(sus_report):
    """Derive a ``(company, year)`` pair from a report's file name.

    The base name (directory and extension removed) is split on underscores,
    hyphens and whitespace. The year is the first token consisting of exactly
    four digits; the company is the first token that, once upper-cased, is one
    of the known company names. Each value falls back to the string
    "Unknown" when no token qualifies.
    """
    known_companies = {"BP", "TOTAL", "SHELL", "EXXON", "CHEVRON"}  # extend later

    stem, _extension = os.path.splitext(os.path.basename(sus_report))
    tokens = re.split(r"[_\-\s]", stem)

    year = "Unknown"
    for token in tokens:
        if re.fullmatch(r"\d{4}", token):
            year = token
            break

    company = "Unknown"
    for token in tokens:
        candidate = token.upper()
        if candidate in known_companies:
            company = candidate
            break

    return company, year

def extract_pages_fitz(file_path, sparse_threshold=50):
    """
    Extract per-page text from a PDF using PyMuPDF.

    Every page is kept (including short or empty ones); pages whose stripped
    text has fewer than ``sparse_threshold`` characters are flagged as sparse.

    Args:
        file_path: Path of the PDF to open.
        sparse_threshold: Character count below which a page is flagged
            ``is_sparse``.

    Returns:
        A list with one dict per page, with keys ``page_number`` (1-indexed),
        ``text``, ``char_count``, ``word_count`` and ``is_sparse``. If opening
        or reading fails, the error is printed and the pages collected so far
        (possibly none) are returned.
    """
    data = []

    try:
        # The context manager closes the document even when a page raises,
        # so a failing PDF does not leak an open file handle.
        with fitz.open(file_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text("text").strip()  # Extract + trim
                char_count = len(text)

                data.append({
                    "page_number": page_number,  # 1-indexed
                    "text": text,
                    "char_count": char_count,
                    "word_count": len(text.split()),  # "" splits to [] -> 0
                    "is_sparse": char_count < sparse_threshold,
                })

    except Exception as e:
        # Best effort: report the problem and keep whatever was extracted.
        print(f"Error processing {file_path}: {e}")

    return data

def process_all_pdfs(folder_path, sparse_threshold=50):
    """Extract every PDF in ``folder_path`` into one page-level DataFrame.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted name order so the row order is reproducible between runs
    (``os.listdir`` returns entries in arbitrary order).

    Args:
        folder_path: Directory to scan (not recursive).
        sparse_threshold: Forwarded to ``extract_pages_fitz``.

    Returns:
        A DataFrame with columns ``Company``, ``Year``, ``Page``, ``Text``,
        ``Char_Count``, ``Word_Count`` and ``Is_Sparse``. The columns are
        present even when no page was extracted, so callers can always
        reference them.
    """
    columns = [
        "Company", "Year", "Page", "Text",
        "Char_Count", "Word_Count", "Is_Sparse",
    ]
    all_data = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"📄 Processing: {filename}")

        company, year = filename_creator(filename)
        pages_data = extract_pages_fitz(file_path, sparse_threshold=sparse_threshold)

        for page_info in pages_data:
            all_data.append({
                "Company": company,
                "Year": year,
                "Page": page_info["page_number"],
                "Text": page_info["text"],
                "Char_Count": page_info["char_count"],
                "Word_Count": page_info["word_count"],
                "Is_Sparse": page_info["is_sparse"],
            })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(all_data, columns=columns)

# Usage
folder_path = "/content/"
df = process_all_pdfs(folder_path, sparse_threshold=50)

# Save to CSV inside folder_path so the location does not depend on the
# kernel's working directory (later cells read their inputs from /content/).
df.to_csv(os.path.join(folder_path, "BIGOIL_pages4.csv"), index=False)

# Guard the summary: with no PDFs in the folder the frame may have no
# "Company" column at all.
n_companies = df["Company"].nunique() if "Company" in df.columns else 0
print(f"✅ Extracted {len(df)} pages from {n_companies} companies")
df.head()


In [None]:
# chunking the writing blocks

import pandas as pd
import spacy


# Load the page-level dataset written by the extraction cell
# ("BIGOIL_pages4.csv", saved to the working directory, which is /content on
# Colab). This previously read "BIGOIL_pages4 (1).csv" -- presumably a
# manually re-uploaded copy -- which no cell produces, so a fresh
# Restart & Run All could not find it.
df = pd.read_csv("/content/BIGOIL_pages4.csv")

# The parser stays enabled: split_long_text() iterates doc.sents.
nlp = spacy.load("en_core_web_sm", disable=["ner", "tagger"])

def split_long_text(
    text,
    target_words=400,
    max_words=450
):
    """
    Split text into sentence-based chunks of roughly ``target_words`` words.

    Sentences (from the module-level spaCy pipeline ``nlp``) are accumulated
    until adding the next one would exceed ``target_words``; the accumulated
    chunk is then emitted and a new one is started.

    Args:
        text: The text to split.
        target_words: Soft limit on whitespace-separated words per chunk.
        max_words: Hard limit for a single sentence. A sentence with more
            words than this cannot be kept whole, so it is cut on word
            boundaries into pieces of at most ``max_words`` words (its
            internal whitespace is normalised to single spaces). ``None`` or
            a non-positive value disables the hard limit.

    Returns:
        A list of non-empty chunk strings, in document order. Empty when the
        text contains no non-blank sentence.
    """
    doc = nlp(text)
    hard_limit = max_words if max_words and max_words > 0 else None

    chunks = []
    current_chunk = []
    current_word_count = 0

    for sent in doc.sents:
        sent_text = sent.text.strip()
        if not sent_text:
            continue
        words = sent_text.split()

        oversized = hard_limit is not None and len(words) > hard_limit
        if oversized or current_word_count + len(words) > target_words:
            # Flush only real content: when the very first sentence is already
            # over the target there is nothing accumulated yet, and emitting
            # "" would create an empty chunk.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_word_count = 0

        if oversized:
            # No sentence boundary to split on (typical for tables/lists in
            # PDF text), so fall back to fixed-size word windows.
            for start in range(0, len(words), hard_limit):
                chunks.append(" ".join(words[start:start + hard_limit]))
            continue

        current_chunk.append(sent_text)
        current_word_count += len(words)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Build the chunked dataset: one output row per page, or per chunk for pages
# that are too long.
new_rows = []

for _, row in df.iterrows():
    text = str(row["Text"])
    word_count = row["Word_Count"]

    # Case 1: short rows -> keep as-is
    if word_count <= 450:
        new_row = row.to_dict()
        new_row["Subpage"] = float(row["Page"])  # e.g. 40.0
        new_rows.append(new_row)
        continue

    # Case 2: long rows -> split
    chunks = split_long_text(text)

    # Zero-pad the chunk index to the width of the largest index so that
    # Subpage stays unique within a page: without padding, chunk 10 would be
    # 40.10, which is the same float as chunk 1 (40.1). With fewer than ten
    # chunks the width is 1 and the values are 40.1, 40.2, ... as before.
    index_width = len(str(len(chunks)))

    for i, chunk in enumerate(chunks, start=1):
        new_row = row.to_dict()
        new_row["Text"] = chunk
        new_row["Word_Count"] = len(chunk.split())
        new_row["Char_Count"] = len(chunk)
        new_row["Subpage"] = float(f"{int(row['Page'])}.{i:0{index_width}d}")
        new_rows.append(new_row)

df_chunked = pd.DataFrame(new_rows)

print("Original rows:", len(df))
print("New rows after chunking:", len(df_chunked))

df_chunked.to_csv(
    "/content/BIGOIL_pages_sentence_chunked.csv",
    index=False
)

print("✅ Saved: BIGOIL_pages_sentence_chunked.csv")




In [None]:
# Filtering and clearing

#cleaning the dataset
import pandas as pd
import re

# 1. Read the sentence-chunked dataset (the chunked CSV, not the page-level one)
chunked_csv_path = "/content/BIGOIL_pages_sentence_chunked.csv"
df = pd.read_csv(chunked_csv_path)

# 3. Light cleaning
def light_clean(text):
    """Lightly normalise one text cell for keyword matching.

    Removes URL-like tokens (anything starting with ``http`` or ``www`` up to
    the next whitespace), turns newlines and hyphens into spaces, collapses
    runs of whitespace, trims, and lower-cases.

    Args:
        text: A scalar cell value. Non-string values are converted with
            ``str()``; missing values (``None``/NaN) become ``""``.

    Returns:
        The cleaned, lower-cased string.
    """
    # str(None) / str(nan) would inject the literal words "none" / "nan".
    if text is None or (not isinstance(text, str) and pd.isna(text)):
        return ""

    # "https..." is already covered by the "http\S+" alternative.
    text = re.sub(r'http\S+|www\S+', '', str(text))
    text = text.replace('\n', ' ')
    text = text.replace('-', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

df["Text"] = df["Text"].apply(light_clean)

# 4. Green / environmental lexicon
# Entries are matched as whole words/phrases against the cleaned text, which
# is lower-cased and has hyphens replaced by spaces. The degree-sign entries
# were previously stored mis-encoded ("Â°"), so they could never match.
greenwashing_keywords = [
    "environment", "environmental", "biodiversity", "nature", "ecosystem", "ecosystems",
    "species", "ocean", "oceans", "forest", "forests", "water", "deforest", "deforestation",
    "endangered", "protect", "protection", "agriculture", "agricultural", "continent",
    "hemisphere", "climate", "climatic", "warming", "global warming", "extreme weather",
    "cyclone", "cyclones", "biology", "biologist", "chemistry", "chemical", "chemicals",
    "sustainable", "sustainability", "sustain", "green", "greenwashing", "greenhouse",
    "renewable", "renewables", "energy", "clean energy", "net zero", "carbon neutrality",
    "carbon neutral", "transition", "transitions", "transitioning", "decarbon",
    "decarbonize", "decarbonise", "decarbonization", "decarbonisation",
    "alternative energy", "alternative fuel", "green investment", "green investments",
    "circular economy", "low carbon", "climate positive", "energy efficiency",
    "energy efficient", "dual challenge", "ambition", "ambitions", "commitment",
    "commitments", "leadership", "vision", "visionary", "carbon", "carbon dioxide", "co2",
    "ghg", "greenhouse gas", "greenhouse gases", "emission", "emissions", "methane", "ch4",
    "footprint", "carbon footprint", "effluent", "effluents", "pollutant", "pollutants",
    "pollution", "hazardous", "hazard", "contaminated", "contamination", "disposal",
    "waste disposal", "flaring", "gas flaring", "abatement", "carbon abatement",
    "carbon capture", "carbon sink", "carbon offset", "carbon offsets", "carbon tax",
    "carbon pricing", "carbon price", "solar", "solar power", "solar energy", "wind",
    "wind energy", "wind power", "hydrogen", "green hydrogen", "biofuel", "biofuels",
    "biomass", "battery", "batteries", "geothermal", "electric vehicle",
    "electric vehicles", "ev", "evs", "hydropower", "hydroelectric", "clean tech",
    "material", "materials", "metric", "metrics", "target", "targets", "tonnes", "tons",
    "scope 1", "scope 2", "scope 3", "baseline", "benchmark", "benchmarks", "reduction",
    "reductions", "increase", "increases", "ogci", "ipcc", "paris agreement", "paris",
    "kyoto", "unfccc", "1.5°c", "2°c", "two degrees", "one point five degrees"
]

# 5. Regex pattern: one word-bounded alternation over the whole lexicon
pattern = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in greenwashing_keywords) + r')\b',
    flags=re.IGNORECASE
)

# 6. Filter rows: keep a row when at least one lexicon entry occurs in its text
df_filtered = df[df["Text"].apply(lambda x: bool(pattern.search(x)))]

# 7. Save
df_filtered.to_csv("/content/cleaned_oil_chunked.csv", index=False)

print(f"✅ Done. {len(df_filtered)} rows retained.")


In [None]:
# Adding the new character counts etc

df = pd.read_csv("/content/cleaned_oil_chunked.csv")

# Recompute the counts from the cleaned text (cleaning changed its length).
# fillna("") keeps both counts integer-valued even if a Text cell was read
# back from the CSV as missing; such a row counts as 0 words / 0 characters.
cleaned_text = df["Text"].fillna("")
df["Word_Count"] = cleaned_text.str.split().str.len()
df["Char_Count"] = cleaned_text.str.len()

df.to_csv("/content/cleaned_oil_chunked_fixed.csv", index=False)
print("✅ Saved: cleaned_oil_chunked_fixed.csv", "Rows:", len(df))

In [None]:
# Lemmatization

import pandas as pd
import spacy

# 1. Load the CLEAN & CHUNKED dataset
# NOTE(review): this used to read "/content/clean&fresh_oil_.csv", a file that
# no cell in this notebook writes. It is presumably a renamed copy of the
# recount cell's output, so the pipeline now reads that output directly. If
# the file was edited by hand in between, restore the old path and document
# the manual step.
df = pd.read_csv("/content/cleaned_oil_chunked_fixed.csv")

# 2. Load spaCy with only what we need (NER and the parser are switched off)
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# 3. Lemmatization function
def lemmatize_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    ``text`` is converted with ``str()`` and run through the module-level
    spaCy pipeline ``nlp``; punctuation and whitespace tokens are skipped.
    """
    kept_lemmas = []
    for tok in nlp(str(text)):
        if tok.is_punct or tok.is_space:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# 4. Apply lemmatization (adds a "lemmatized_text" column; "Text" is untouched)
df["lemmatized_text"] = df["Text"].apply(lemmatize_text)

# 5. Save output
df.to_csv("/content/clean_fresh_oil_lemmatized.csv", index=False)

# The status message previously contained a mis-encoded check mark.
print("✅ Lemmatization complete. Saved as clean_fresh_oil_lemmatized.csv")


In [None]:
# basic sentiment analysis
import pandas as pd
from transformers import pipeline

# 1. Read the lemmatized dataset
lemmatized_csv_path = "/content/clean_fresh_oil_lemmatized.csv"
df = pd.read_csv(lemmatized_csv_path)

# 2. Build the RoBERTa-based sentiment classifier
sentiment_model_name = "siebert/sentiment-roberta-large-english"
sentiment_model = pipeline("sentiment-analysis", model=sentiment_model_name)

# 3. Sentiment function (safe truncation)
def get_sentiment(text):
    """Classify ``text`` with ``sentiment_model`` and return ``(label, score)``.

    The input is converted with ``str()`` and the pipeline is called with
    ``truncation=True``. Only the first prediction is used; its score is
    returned as a plain ``float``.
    """
    predictions = sentiment_model(str(text), truncation=True)
    top_prediction = predictions[0]
    label = top_prediction["label"]
    score = float(top_prediction["score"])
    return label, score

# 4. Apply sentiment analysis
# Collect plain (label, score) tuples and assign each column once. Building a
# pd.Series per row is slow, leaves the score column with object dtype, and
# fails on an empty frame.
sentiment_results = [get_sentiment(text) for text in df["Text"]]
df["sentiment_label"] = [label for label, _ in sentiment_results]
df["sentiment_score"] = [score for _, score in sentiment_results]

# 5. Save output
output_path = "/content/clean_fresh_oil_with_sentiment.csv"
df.to_csv(output_path, index=False)

print("✅ RoBERTa sentiment complete!")
print("Saved as:", output_path)

# Quick sanity check
print(df[["Text", "sentiment_label", "sentiment_score"]].head(5))


In [None]:
# =====================================================
# ClimateBERT Full Inference Pipeline (ALL MODELS)
# =====================================================


# 2. Imports
import pandas as pd
from transformers import pipeline

# =====================================================
# 3. Load dataset
# =====================================================

# Input: dataset with the generic sentiment columns; output: same rows plus
# the ClimateBERT columns.
INPUT_PATH = "/content/clean_fresh_oil_with_sentiment.csv"
OUTPUT_PATH = "/content/clean_fresh_oil_with_all_climatebert.csv"

df = pd.read_csv(INPUT_PATH)

# Report the (rows, columns) shape of what was loaded.
print(f"Loaded dataset: {df.shape}")

# =====================================================
# 4. Helper function to run any ClimateBERT classifier
# =====================================================

def run_climate_model(model_name, prefix, df, batch_size=16, device=0):
    """
    Run a HuggingFace text-classification model on ``df['Text']`` and append
    the top label, its score, and the full label->score distribution.

    Args:
        model_name: Model identifier passed to ``pipeline(model=...)``.
        prefix: Column prefix; the columns ``{prefix}_label``,
            ``{prefix}_score`` and ``{prefix}_probs`` are written.
        df: DataFrame with a ``Text`` column. It is modified in place.
        batch_size: Number of texts sent to the model per forward pass.
        device: Device index for the pipeline. Defaults to 0 (first GPU), as
            before; pass -1 to run on CPU when no GPU is available.

    Returns:
        None. The results are added to ``df`` as new columns.
    """

    print(f"\nRunning model: {model_name}")

    clf = pipeline(
        "text-classification",
        model=model_name,
        device=device,
        truncation=True,
        top_k=None,               # all class scores (replaces deprecated return_all_scores=True)
    )

    labels = []
    scores = []
    probs = []

    texts = df["Text"].astype(str).tolist()

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pass batch_size explicitly: without it the pipeline processes the
        # list one text at a time, so the slicing alone gives no batching.
        outputs = clf(batch, batch_size=batch_size)

        for class_scores in outputs:
            # Highest-confidence class; on ties the first listed one wins.
            best = max(class_scores, key=lambda entry: entry["score"])

            labels.append(best["label"])
            scores.append(float(best["score"]))
            probs.append({entry["label"]: float(entry["score"]) for entry in class_scores})

    df[f"{prefix}_label"] = labels
    df[f"{prefix}_score"] = scores
    df[f"{prefix}_probs"] = probs

    print(f"✅ Finished: {prefix}")

# =====================================================
# 5. Run ALL ClimateBERT models
# =====================================================

# (model identifier, column prefix) for every ClimateBERT classifier, in the
# order the result columns are appended to df.
CLIMATEBERT_MODELS = [
    # 5.1 Specificity (specific vs non-specific)
    ("climatebert/distilroberta-base-climate-specificity", "specificity"),
    # 5.2 Climate detector (climate vs not)
    ("climatebert/distilroberta-base-climate-detector", "climate_detector"),
    # 5.3 Commitment framing
    ("climatebert/distilroberta-base-climate-commitment", "commitment"),
    # 5.4 TCFD category classification
    ("climatebert/distilroberta-base-climate-tcfd", "tcfd"),
    # 5.5 Climate-specific sentiment
    ("climatebert/distilroberta-base-climate-sentiment", "climate_sentiment"),
]

# One call per model instead of five copy-pasted call sites.
for climate_model_name, climate_prefix in CLIMATEBERT_MODELS:
    run_climate_model(
        model_name=climate_model_name,
        prefix=climate_prefix,
        df=df
    )

# =====================================================
# 6. Save final dataset
# =====================================================

df.to_csv(OUTPUT_PATH, index=False)

print("\n🎉 ALL CLIMATEBERT MODELS COMPLETED")
print("Saved to:", OUTPUT_PATH)
print("Final shape:", df.shape)


In [None]:
# Addition of buzzword scoring and future scoring

import pandas as pd
import spacy
import re

# Load your dataset
df = pd.read_csv('/content/clean_fresh_oil_with_all_climatebert.csv')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# --- POS-based future verbs ---
# pos_future_score() compares these against token LEMMAS, so only base forms
# ("aim", "plan", ...) can match there; the inflected entries are kept for
# reference and are harmless.
future_verbs = {
    "will", "shall", "aim", "aiming", "plan", "planning", "target", "aspire",
    "commit", "committed", "pledge", "pledged", "pledging", "vow", "promise",
    "seek", "intend", "intends", "hope", "envision", "anticipate", "anticipates", "forecast",
    "forecasting", "project", "projects", "projected", "look", "working", "moving", "set"
}

# --- Phrase-based future expressions (not captured as single tokens by spaCy) ---
# The last entry previously contained a mis-encoded apostrophe and could
# never match.
future_phrases = [
    "going to", "seek to", "intend to", "set out to", "hope to", "looking to",
    "working to", "working towards", "moving towards", "transition plan",
    "path to net zero", "by 2030", "by 2050",
    "we’re helping to save the planet"
]

# --- Buzzword lexicon for greenwashing ---
greenwashing_lexicon = [
    "green", "clean", "cleaner", "cleanest", "efficient", "sustainable",
    "sustainability", "eco-friendly", "environmentally friendly", "earth-friendly",
    "eco-conscious", "natural", "non-toxic", "organic", "ethical", "biodegradable",
    "carbon neutral", "climate neutral", "carbon offsets", "carbon offset", "carbon credits",
    "low carbon", "fuels of tomorrow", "resilient hydrocarbons", "energy in progress",
    "transformation", "energy transition", "transition", "beyond petroleum",
    "emissions intensity", "locally grown", "sustainably sourced", "eco-safe",
    "eco-preferred", "cfc-free", "chlorofluorocarbon-free", "renewable natural gas", "rng",
    "carbon capture and storage", "ccs", "e-fuels", "synthetic fuels", "synth-fuels",
    "carbon-neutral e-fuels", "drive carbon neutral"
]


def _compile_phrase_pattern(phrases):
    """Compile one case-insensitive, word-bounded regex matching any phrase.

    Hyphens and whitespace inside a phrase are interchangeable, so the entry
    "eco-friendly" also matches "eco friendly". This matters because the
    cleaning step of this notebook replaces every "-" in the text with a
    space, which made hyphenated lexicon entries unmatchable as literals.
    Straight and curly apostrophes are interchangeable as well. Alternatives
    are tried in list order at each position.
    """
    def _escape_part(part):
        # Escape the pieces around apostrophes, then accept either kind.
        return "[’']".join(re.escape(piece) for piece in re.split(r"[’']", part))

    alternatives = []
    for phrase in phrases:
        parts = [_escape_part(part) for part in re.split(r"[-\s]+", phrase) if part]
        alternatives.append(r"[-\s]+".join(parts))

    return re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b', flags=re.IGNORECASE)


# --- Compile regex patterns for phrase matching ---
future_phrase_pattern = _compile_phrase_pattern(future_phrases)
buzzword_pattern = _compile_phrase_pattern(greenwashing_lexicon)

# --- POS-based future score function ---
def pos_future_score(text):
    """Return the density of future-oriented language in ``text``.

    The count is the sum of:
      * tokens whose lemma is "will"/"shall" and whose tag is ``MD`` (modal);
      * tokens whose lemma is in ``future_verbs`` and whose part of speech is
        ``VERB`` or ``AUX``;
      * matches of ``future_phrase_pattern`` in the lower-cased text.

    A verb that is also part of a matched phrase (e.g. "seek" in "seek to")
    therefore contributes twice.

    Args:
        text: A scalar cell value. Missing or blank values score 0.0; other
            non-string values are converted with ``str()``.

    Returns:
        ``count / number_of_spaCy_tokens`` as a float, or 0.0 when there is
        nothing to score.
    """
    if pd.isnull(text):
        return 0.0
    text = str(text)  # a numeric cell would otherwise fail on .strip()
    if text.strip() == "":
        return 0.0

    doc = nlp(text)
    token_count = len(doc)
    future_count = 0

    for token in doc:
        lemma = token.lemma_.lower()

        # Modal "will"/"shall": counted whatever follows. The original had an
        # if/else on the next token's POS, but both branches did the same.
        if lemma in {"will", "shall"} and token.tag_ == "MD":
            future_count += 1

        # Intention/commitment words when used as verbs or auxiliaries
        elif lemma in future_verbs and token.pos_ in {"VERB", "AUX"}:
            future_count += 1

    # Also match multi-word future phrases via regex
    future_count += len(future_phrase_pattern.findall(text.lower()))

    return future_count / token_count if token_count > 0 else 0.0

# --- Buzzword score function (greenwashing density) ---
def buzzword_score(text):
    """Return buzzword density: ``buzzword_pattern`` matches per word.

    Matches are counted in the lower-cased text and divided by the number of
    whitespace-separated words. Null, blank, or word-less text scores 0.0.
    """
    if pd.isnull(text) or not text.strip():
        return 0.0

    words = text.split()
    if not words:
        return 0.0

    hit_count = sum(1 for _ in buzzword_pattern.finditer(text.lower()))
    return hit_count / len(words)

# --- Apply both scoring functions to your DataFrame ---
df["future_score_pos"] = df["Text"].apply(pos_future_score)
df["buzzword_score"] = df["Text"].apply(buzzword_score)

# --- Save new dataset with both scores included ---
df.to_csv("/content/oil_reports_with_future_and_buzzword_scores.csv", index=False)

# The status messages previously contained mis-encoded emoji.
print("✅ POS-based future score and buzzword score added!")
print("📄 Saved as: oil_reports_with_future_and_buzzword_scores.csv")
