In [1]:
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Load data
data_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\search_20241106-223705_sodium+ion+battery+anode-sodium+ion+battery+cathode-sodium+ion+battery+electrode_annotated_rephrased.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Prepare data
texts = []
labels_raw = []
for i in data:
    texts.append(i["text"])
    labels_raw.append(i["label_int"])

labels = []
for i in labels_raw:
    if i == 0:
        labels.append(1)
    elif i == 1:
        labels.append(0)

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42
)

# Create datasets
train_dataset = Dataset.from_dict({
    'text': train_texts,
    'labels': train_labels
})
val_dataset = Dataset.from_dict({
    'text': val_texts,
    'labels': val_labels
})
raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})
print(len(texts))

400


In [2]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 320
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 80
    })
})


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers.trainer_callback import TrainerCallback
import evaluate
import numpy as np
from tqdm.auto import tqdm

# Set seed for reproducibility
torch.manual_seed(42)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model
model_name = "batterydata/batterybert-cased-abstract"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assuming raw_datasets is a DatasetDict with "train" and "validation" splits
# and each example has "text" and "labels" fields

# Function to tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply tokenization to the datasets
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Load the model
num_labels = len(set(raw_datasets["train"]["labels"]))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model = model.to(device)  # Move model to GPU

# Load evaluation metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Create a proper callback by inheriting from TrainerCallback
class EpochReportCallback(TrainerCallback):
    def __init__(self):
        self.epoch = 0
    
    def on_epoch_end(self, args, state, control, **kwargs):
        self.epoch += 1
        
        # Get the most recent training loss
        if len(state.log_history) > 0:
            # Find the most recent training loss entry
            train_losses = [log for log in state.log_history if "loss" in log and "eval" not in log]
            if train_losses:
                train_loss = train_losses[-1]["loss"]
                print(f"\nEpoch {self.epoch}/{args.num_train_epochs}")
                print(f"Training Loss: {train_loss:.4f}")
            
            # Find the most recent evaluation entries
            eval_entries = [log for log in state.log_history if "eval_loss" in log]
            if eval_entries:
                eval_loss = eval_entries[-1]["eval_loss"]
                eval_accuracy = eval_entries[-1]["eval_accuracy"]
                print(f"Evaluation Loss: {eval_loss:.4f}")
                print(f"Evaluation Accuracy: {eval_accuracy:.4f}")
        
        return control

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Add CUDA specific arguments
    fp16=True,  # Use mixed precision training
    no_cuda=False,  # Use CUDA
    report_to="none",  # Disable default logging to avoid clutter
)

# Create the Trainer with an instance of the callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EpochReportCallback()],  # Create an instance of the callback class
)

# Train the model
print("Starting training...")
trainer.train()

Using device: cuda


Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Starting training...


  0%|          | 0/400 [00:00<?, ?it/s]

{'loss': 0.7902, 'grad_norm': 3.9231011867523193, 'learning_rate': 9e-07, 'epoch': 0.25}
{'loss': 0.842, 'grad_norm': 0.000405534083256498, 'learning_rate': 1.9e-06, 'epoch': 0.5}
{'loss': 1.3617, 'grad_norm': 4.130080699920654, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.75}
{'loss': 1.7605, 'grad_norm': 22.6730899810791, 'learning_rate': 3.8e-06, 'epoch': 1.0}

Epoch 1/10
Training Loss: 1.7605


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.6691043376922607, 'eval_accuracy': 0.825, 'eval_runtime': 0.2846, 'eval_samples_per_second': 281.089, 'eval_steps_per_second': 35.136, 'epoch': 1.0}
{'loss': 0.7248, 'grad_norm': 24.809722900390625, 'learning_rate': 4.800000000000001e-06, 'epoch': 1.25}
{'loss': 0.8498, 'grad_norm': 27.25240707397461, 'learning_rate': 5.7000000000000005e-06, 'epoch': 1.5}
{'loss': 0.6623, 'grad_norm': 48.782081604003906, 'learning_rate': 6.700000000000001e-06, 'epoch': 1.75}
{'loss': 0.2753, 'grad_norm': 2.9203381538391113, 'learning_rate': 7.7e-06, 'epoch': 2.0}

Epoch 2/10
Training Loss: 0.2753
Evaluation Loss: 1.6691
Evaluation Accuracy: 0.8250


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.7481721639633179, 'eval_accuracy': 0.8625, 'eval_runtime': 0.2796, 'eval_samples_per_second': 286.174, 'eval_steps_per_second': 35.772, 'epoch': 2.0}
{'loss': 0.3848, 'grad_norm': 0.24446101486682892, 'learning_rate': 8.7e-06, 'epoch': 2.25}
{'loss': 0.1532, 'grad_norm': 0.8036653995513916, 'learning_rate': 9.7e-06, 'epoch': 2.5}
{'loss': 0.0979, 'grad_norm': 0.05082309991121292, 'learning_rate': 1.0700000000000001e-05, 'epoch': 2.75}
{'loss': 0.1005, 'grad_norm': 0.9843505024909973, 'learning_rate': 1.1700000000000001e-05, 'epoch': 3.0}

Epoch 3/10
Training Loss: 0.1005
Evaluation Loss: 0.7482
Evaluation Accuracy: 0.8625


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.24778151512145996, 'eval_accuracy': 0.95, 'eval_runtime': 0.2932, 'eval_samples_per_second': 272.814, 'eval_steps_per_second': 34.102, 'epoch': 3.0}
{'loss': 0.0036, 'grad_norm': 1.0223650932312012, 'learning_rate': 1.27e-05, 'epoch': 3.25}
{'loss': 0.0809, 'grad_norm': 0.7001034021377563, 'learning_rate': 1.3700000000000001e-05, 'epoch': 3.5}
{'loss': 0.0011, 'grad_norm': 0.3647365868091583, 'learning_rate': 1.47e-05, 'epoch': 3.75}
{'loss': 0.0002, 'grad_norm': 0.048908501863479614, 'learning_rate': 1.5700000000000002e-05, 'epoch': 4.0}

Epoch 4/10
Training Loss: 0.0002
Evaluation Loss: 0.2478
Evaluation Accuracy: 0.9500


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.31435731053352356, 'eval_accuracy': 0.9625, 'eval_runtime': 0.2874, 'eval_samples_per_second': 278.397, 'eval_steps_per_second': 34.8, 'epoch': 4.0}
{'loss': 0.0002, 'grad_norm': 0.009898586198687553, 'learning_rate': 1.6700000000000003e-05, 'epoch': 4.25}
{'loss': 0.0002, 'grad_norm': 0.0027319469954818487, 'learning_rate': 1.77e-05, 'epoch': 4.5}
{'loss': 0.0001, 'grad_norm': 0.016157740727066994, 'learning_rate': 1.87e-05, 'epoch': 4.75}
{'loss': 0.0001, 'grad_norm': 0.0047590662725269794, 'learning_rate': 1.97e-05, 'epoch': 5.0}

Epoch 5/10
Training Loss: 0.0001
Evaluation Loss: 0.3144
Evaluation Accuracy: 0.9625


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.31703728437423706, 'eval_accuracy': 0.9625, 'eval_runtime': 0.276, 'eval_samples_per_second': 289.81, 'eval_steps_per_second': 36.226, 'epoch': 5.0}
{'loss': 0.0001, 'grad_norm': 0.0026123346760869026, 'learning_rate': 2.07e-05, 'epoch': 5.25}
{'loss': 0.0001, 'grad_norm': 0.0032429040875285864, 'learning_rate': 2.1700000000000002e-05, 'epoch': 5.5}
{'loss': 0.0001, 'grad_norm': 0.0021182678174227476, 'learning_rate': 2.2700000000000003e-05, 'epoch': 5.75}
{'loss': 0.0001, 'grad_norm': 0.001402911264449358, 'learning_rate': 2.37e-05, 'epoch': 6.0}

Epoch 6/10
Training Loss: 0.0001
Evaluation Loss: 0.3170
Evaluation Accuracy: 0.9625


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.2817559838294983, 'eval_accuracy': 0.9625, 'eval_runtime': 0.2668, 'eval_samples_per_second': 299.889, 'eval_steps_per_second': 37.486, 'epoch': 6.0}
{'loss': 0.0001, 'grad_norm': 0.0033579962328076363, 'learning_rate': 2.47e-05, 'epoch': 6.25}
{'loss': 0.0001, 'grad_norm': 0.0015444708988070488, 'learning_rate': 2.57e-05, 'epoch': 6.5}
{'loss': 0.0001, 'grad_norm': 0.0016248482279479504, 'learning_rate': 2.6700000000000002e-05, 'epoch': 6.75}
{'loss': 0.0001, 'grad_norm': 0.008929664269089699, 'learning_rate': 2.7700000000000002e-05, 'epoch': 7.0}

Epoch 7/10
Training Loss: 0.0001
Evaluation Loss: 0.2818
Evaluation Accuracy: 0.9625


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.28981995582580566, 'eval_accuracy': 0.9625, 'eval_runtime': 0.2716, 'eval_samples_per_second': 294.497, 'eval_steps_per_second': 36.812, 'epoch': 7.0}
{'loss': 0.0001, 'grad_norm': 0.002281831344589591, 'learning_rate': 2.87e-05, 'epoch': 7.25}
{'loss': 0.0, 'grad_norm': 0.0009053099784068763, 'learning_rate': 2.97e-05, 'epoch': 7.5}
{'loss': 0.0, 'grad_norm': 0.001418367144651711, 'learning_rate': 3.07e-05, 'epoch': 7.75}
{'loss': 0.0, 'grad_norm': 0.001444044872187078, 'learning_rate': 3.1700000000000005e-05, 'epoch': 8.0}

Epoch 8/10
Training Loss: 0.0000
Evaluation Loss: 0.2898
Evaluation Accuracy: 0.9625


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.282640278339386, 'eval_accuracy': 0.9625, 'eval_runtime': 0.27, 'eval_samples_per_second': 296.325, 'eval_steps_per_second': 37.041, 'epoch': 8.0}
{'loss': 0.0001, 'grad_norm': 0.0008772741421125829, 'learning_rate': 3.27e-05, 'epoch': 8.25}
{'loss': 0.0, 'grad_norm': 0.0021463658194988966, 'learning_rate': 3.3700000000000006e-05, 'epoch': 8.5}
{'loss': 0.0, 'grad_norm': 0.000889813294634223, 'learning_rate': 3.4699999999999996e-05, 'epoch': 8.75}
{'loss': 0.0001, 'grad_norm': 0.0031801722943782806, 'learning_rate': 3.57e-05, 'epoch': 9.0}

Epoch 9/10
Training Loss: 0.0001
Evaluation Loss: 0.2826
Evaluation Accuracy: 0.9625


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.2961278557777405, 'eval_accuracy': 0.9625, 'eval_runtime': 0.279, 'eval_samples_per_second': 286.782, 'eval_steps_per_second': 35.848, 'epoch': 9.0}
{'loss': 0.0001, 'grad_norm': 0.0027870074845850468, 'learning_rate': 3.6700000000000004e-05, 'epoch': 9.25}
{'loss': 0.0, 'grad_norm': 0.003563058329746127, 'learning_rate': 3.77e-05, 'epoch': 9.5}
{'loss': 0.0, 'grad_norm': 0.0012248293496668339, 'learning_rate': 3.8700000000000006e-05, 'epoch': 9.75}
{'loss': 0.0, 'grad_norm': 0.001021943404339254, 'learning_rate': 3.97e-05, 'epoch': 10.0}

Epoch 10/10
Training Loss: 0.0000
Evaluation Loss: 0.2961
Evaluation Accuracy: 0.9625


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.31184953451156616, 'eval_accuracy': 0.9625, 'eval_runtime': 0.2711, 'eval_samples_per_second': 295.141, 'eval_steps_per_second': 36.893, 'epoch': 10.0}
{'train_runtime': 53.921, 'train_samples_per_second': 59.346, 'train_steps_per_second': 7.418, 'train_loss': 0.2022728704288602, 'epoch': 10.0}


TrainOutput(global_step=400, training_loss=0.2022728704288602, metrics={'train_runtime': 53.921, 'train_samples_per_second': 59.346, 'train_steps_per_second': 7.418, 'total_flos': 841955377152000.0, 'train_loss': 0.2022728704288602, 'epoch': 10.0})

In [5]:
# Save the fine-tuned model
model.save_pretrained("./batterybert-cased-abstract_finetuned_400_2")
tokenizer.save_pretrained("./batterybert-cased-abstract_finetuned_400_2")

('./batterybert-cased-abstract_finetuned_400_2\\tokenizer_config.json',
 './batterybert-cased-abstract_finetuned_400_2\\special_tokens_map.json',
 './batterybert-cased-abstract_finetuned_400_2\\vocab.txt',
 './batterybert-cased-abstract_finetuned_400_2\\added_tokens.json',
 './batterybert-cased-abstract_finetuned_400_2\\tokenizer.json')

In [6]:
# For inference
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to GPU
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions.cpu().numpy()  # Move predictions back to CPU for numpy conversion

In [13]:
val_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai_1.json'
with open(val_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

label_bert = []
score_bert = []
for i in range(len(val_data["text"])):
    val_label = predict(val_data["text"][i])
    if val_label[0] == 0:
        label_bert.append(1)
        # score_bert.append(val_label[0]["score"])
    elif val_label[0] == 1:
        label_bert.append(0)
        # score_bert.append(val_label[0]["score"])

val_data["label_bert"] = label_bert
val_data["score_bert"] = score_bert
print(len(label_bert))

100


In [14]:
label_openai = val_data["label_openai"]
label_annotated = val_data["label_annotated"]
print(f"No\tAnno\tOpenAI\tBERT\tBERT Score")

openai_fp = 0
openai_fn = 0
bert_fp = 0
bert_fn = 0

for i in range(len(label_bert)):
    if (label_bert[i] + label_openai[i] + label_annotated[i] != 3) and (label_bert[i] + label_openai[i] + label_annotated[i] != 0):
        print(f"{i}\t{label_annotated[i]}\t{label_openai[i]}\t{label_bert[i]}")
        if label_bert[i] == 1 and label_annotated[i] == 0:
            bert_fp += 1
        elif label_bert[i] == 0 and label_annotated[i] == 1:
            bert_fn += 1
        if label_openai[i] == 1 and label_annotated[i] == 0:
            openai_fp += 1
        elif label_openai[i] == 0 and label_annotated[i] == 1:
            openai_fn += 1

print(f"OpenAI False Positive: {openai_fp}")
print(f"OpenAI False Negative: {openai_fn}")
print(f"Bert False Positive: {bert_fp}")
print(f"Bert False Negative: {bert_fn}")

No	Anno	OpenAI	BERT	BERT Score
7	0	1	0
21	0	1	0
34	0	1	0
36	0	1	0
53	0	1	0
72	1	1	0
79	0	1	0
80	0	1	0
83	0	0	1
99	0	1	0
OpenAI False Positive: 8
OpenAI False Negative: 0
Bert False Positive: 1
Bert False Negative: 1
