In [2]:
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Annotated, rephrased search-result records used as the classification corpus.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on a
# machine where this exact file exists — consider making it configurable.
data_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\search_20241106-223705_sodium+ion+battery+anode-sodium+ion+battery+cathode-sodium+ion+battery+electrode_annotated_rephrased.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pull the model input ("text") and the target ("label_int") out of every record.
texts = [record["text"] for record in data]
labels = [record["label_int"] for record in data]

# Hold out 20% of the examples for validation; the fixed random_state keeps the
# split repeatable across runs.
# NOTE(review): the split is not stratified — confirm that is intended if the
# classes are imbalanced.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Wrap each split in a Dataset with the column names used by the later cells.
train_dataset = Dataset.from_dict({'text': train_texts, 'labels': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'labels': val_labels})

raw_datasets = DatasetDict({'train': train_dataset, 'validation': val_dataset})
print(len(texts))

200


In [3]:
# Show the split names, column names, and row counts of the assembled DatasetDict.
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 40
    })
})


In [4]:
# Checkpoint identifier passed to `from_pretrained`; the same name is reused when the
# classification model is loaded.
model_name = "batterydata/batterybert-cased-abstract"
# Load the tokenizer published under that checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" entry of a batch with the module-level `tokenizer`.

    Args:
        examples: Mapping with a "text" entry, as supplied by
            `Dataset.map(..., batched=True)`.

    Returns:
        The result of calling `tokenizer` on those texts with
        `padding="max_length"` and `truncation=True`.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, padding="max_length", truncation=True)

# Apply tokenize_function to every split; batched=True hands it batches of examples
# rather than one row at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSequenceClassification
# Load the checkpoint as a sequence classifier configured for two labels.
# NOTE(review): torch_dtype="auto" defers the weight dtype to the checkpoint — confirm
# the resulting precision is suitable for fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, torch_dtype="auto")

In [6]:
from transformers import TrainingArguments

# Minimal training configuration: only the output directory is set.
# NOTE: `training_args` is reassigned by a later cell before the Trainer is built,
# so this value is not the one used for training.
training_args = TrainingArguments(output_dir="test_trainer")

In [7]:
import evaluate

# Load the "accuracy" metric from the `evaluate` library; compute_metrics calls it.
metric = evaluate.load("accuracy")

In [8]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        The result of `metric.compute` for the argmax predictions against the
        reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [9]:
from transformers import TrainingArguments, Trainer

# Training configuration handed to the Trainer: outputs go under "test_trainer",
# evaluation runs once per epoch, and training lasts seven epochs.
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
    num_train_epochs=7,
)

In [10]:
# Bundle the model, training configuration, tokenized splits, and metric callback
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Tokenized 'train' split is used for optimisation, 'validation' for evaluation.
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # Turns evaluation predictions into the reported accuracy.
    compute_metrics=compute_metrics,
)


In [11]:
# Run fine-tuning; with eval_strategy="epoch" an evaluation dict is logged after each epoch.
trainer.train()

  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.44593778252601624, 'eval_accuracy': 0.85, 'eval_runtime': 0.3781, 'eval_samples_per_second': 105.791, 'eval_steps_per_second': 13.224, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.4470089077949524, 'eval_accuracy': 0.85, 'eval_runtime': 0.342, 'eval_samples_per_second': 116.972, 'eval_steps_per_second': 14.622, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.19213494658470154, 'eval_accuracy': 0.9, 'eval_runtime': 0.362, 'eval_samples_per_second': 110.512, 'eval_steps_per_second': 13.814, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.027100294828414917, 'eval_accuracy': 1.0, 'eval_runtime': 0.3598, 'eval_samples_per_second': 111.16, 'eval_steps_per_second': 13.895, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.002022701082751155, 'eval_accuracy': 1.0, 'eval_runtime': 0.3527, 'eval_samples_per_second': 113.425, 'eval_steps_per_second': 14.178, 'epoch': 5.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.0034980475902557373, 'eval_accuracy': 1.0, 'eval_runtime': 0.3407, 'eval_samples_per_second': 117.415, 'eval_steps_per_second': 14.677, 'epoch': 6.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.0011996629182249308, 'eval_accuracy': 1.0, 'eval_runtime': 0.3207, 'eval_samples_per_second': 124.727, 'eval_steps_per_second': 15.591, 'epoch': 7.0}
{'train_runtime': 33.1461, 'train_samples_per_second': 33.79, 'train_steps_per_second': 4.224, 'train_loss': 0.3969518116542271, 'epoch': 7.0}


TrainOutput(global_step=140, training_loss=0.3969518116542271, metrics={'train_runtime': 33.1461, 'train_samples_per_second': 33.79, 'train_steps_per_second': 4.224, 'total_flos': 294684382003200.0, 'train_loss': 0.3969518116542271, 'epoch': 7.0})

In [12]:
# Inspect the first record of the untokenized validation split as a sanity check.
print(raw_datasets["validation"][0])

{'text': 'Sodium-ion batteries (SIBs) can develop cost-effective and safe energy storage technology for substantial energy storage demands. In this work, we have developed manganese oxide (α-MnO2) nanorods for SIB applications. The crystal structure, which is crucial for high-performance energy storage, is examined systematically for the metal oxide cathode. The intercalation of sodium into the α-MnO2 matrix was studied using the theoretical density functional theory (DFT) studies. The DFT studies predict Na ions’ facile diffusion kinetics through the MnO2 lattice with an attractively low diffusion barrier (0.21 eV). When employed as a cathode material for SIBs, MnO2 showed a moderate capacity (109 mAh·g–1 at C/20 current rate) and superior life cyclability (58.6% after 800 cycles) in NaPF6/EC+DMC (5% FEC) electrolyte. It shows a much higher capacity of 181 mAh·g–1 (C/20 current rate) in NaClO4/PC (5% FEC) electrolyte, though it suffers fast capacity fading (11.5% after 800 cycles). Ou