In [None]:
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch


In [None]:
# Load the labelled sentences and keep only the text and target columns,
# discarding rows where either one is missing.
df = pd.read_csv("combined_esg_labeled.csv")
df = df[["sentence", "label"]]
df = df.dropna()
# pandas reads an integer column that contains blanks as float64, and it stays
# float after dropna(). Float labels would steer a 2-label classification head
# onto its multi-label loss path, so cast back to integer class ids.
# NOTE(review): assumes labels are numeric 0/1 class ids -- confirm against the CSV.
df = df.astype({"label": int})
df.head()

In [None]:

# Checkpoint used for both the tokenizer and the classifier weights.
MODEL_NAME = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Sequence-classification model configured with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

In [None]:
from datasets import Dataset

# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

# preserve_index=False keeps the DataFrame index (no longer contiguous after
# dropna) from being carried along as an extra dataset column.
dataset = Dataset.from_pandas(df, preserve_index=False)


def tokenize(batch):
    """Tokenize the "sentence" column of a batch of rows.

    Every sequence is truncated and padded to exactly 256 tokens.

    Args:
        batch: Mapping of column name to a list of values, as supplied by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        max_length=256
    )


dataset = dataset.map(tokenize, batched=True)
# Hold out 20% of the rows for evaluation, reproducibly.
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Expose only the model inputs and the target as torch tensors.
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

In [None]:
from transformers import TrainingArguments
# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./results",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging interval, in steps.
    logging_steps=50,
)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Object that unpacks into `(logits, labels)`, as passed by
            the Trainer to its `compute_metrics` callback.

    Returns:
        A dict with a single "accuracy" entry: the fraction of examples whose
        highest-scoring class equals the label.
    """
    # Local import: numpy is not imported at the top of this notebook.
    import numpy as np

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Wire the model, training configuration and both splits together. With
# compute_metrics supplied, each evaluation reports accuracy as well as loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training loop; as the cell's last expression, the value returned by
# train() is shown as the cell output.
trainer.train()

In [None]:
# Directory that receives both the model and the tokenizer files.
SAVE_PATH = "greenwashing_app/model/bert_greenwashing"

# Save the model and the tokenizer side by side in SAVE_PATH.
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
