In [38]:
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

data_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\search_20241106-223705_sodium+ion+battery+anode-sodium+ion+battery+cathode-sodium+ion+battery+electrode_annotated_rephrased_once.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

texts = []
labels_raw = []

for i in data:
    texts.append(i["text"])
    labels_raw.append(i["label_int"])

labels = []
for i in labels_raw:
    if i == 0:
        labels.append(1)
    elif i == 1:
        labels.append(0)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42
)

train_dataset = Dataset.from_dict({
    'text': train_texts,
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'text': val_texts,
    'labels': val_labels
})

raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})
print(len(texts))

200


In [39]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 40
    })
})


In [40]:
model_name = "batterydata/batterybert-cased-abstract"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [41]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, torch_dtype="auto")

In [42]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

In [43]:
import evaluate

metric = evaluate.load("accuracy")

In [44]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [45]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer",
                                  eval_strategy="epoch",
                                  num_train_epochs=10)

In [46]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)


RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.2842535078525543, 'eval_accuracy': 0.875, 'eval_runtime': 0.3288, 'eval_samples_per_second': 121.657, 'eval_steps_per_second': 15.207, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.5090761184692383, 'eval_accuracy': 0.9, 'eval_runtime': 0.3238, 'eval_samples_per_second': 123.516, 'eval_steps_per_second': 15.44, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.33721375465393066, 'eval_accuracy': 0.925, 'eval_runtime': 0.3209, 'eval_samples_per_second': 124.663, 'eval_steps_per_second': 15.583, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.309284508228302, 'eval_accuracy': 0.925, 'eval_runtime': 0.3324, 'eval_samples_per_second': 120.33, 'eval_steps_per_second': 15.041, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.23123617470264435, 'eval_accuracy': 0.95, 'eval_runtime': 0.3368, 'eval_samples_per_second': 118.781, 'eval_steps_per_second': 14.848, 'epoch': 5.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.27842697501182556, 'eval_accuracy': 0.95, 'eval_runtime': 0.3325, 'eval_samples_per_second': 120.307, 'eval_steps_per_second': 15.038, 'epoch': 6.0}


  0%|          | 0/5 [00:00<?, ?it/s]

RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
trainer.evaluate()

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.004782584495842457,
 'eval_accuracy': 1.0,
 'eval_runtime': 0.5041,
 'eval_samples_per_second': 79.351,
 'eval_steps_per_second': 9.919,
 'epoch': 10.0}

In [None]:
trainer.save_model("batterybert-cased-abstract_finetuned_200")

In [None]:
from transformers import pipeline
classifier = pipeline("text-classification", model="batterybert-cased-abstract_finetuned_200", tokenizer=tokenizer, device=0)

In [None]:
text = r"MXene nanosheets and ordered porous carbons both have their own advantages and disadvantages. Assembling and combining the advantages of the two will be a good choice for battery electrode hosts of active materials. In this work, an electrostatic separation-adsorption strategy is proposed to realize the ordered alternating self-assembly of MXene nanosheets and ordered porous carbon (MPOC), obtaining a unique wall-like porous material with a high conductivity and interconnected porous nanostructure, which strengthens the transfer rate of electrons and ions simultaneously. Meanwhile, the introduction of N-doping from porous carbon into MPOC prolongs the cycle life. When use red phosphorus (RP) as active materials, the MPOC@RP anode exhibited high-capacity output (2454.3 and 2408.1 mAh g-1 in lithium-ion batteries (LIBs) and sodium-ion batteries (SIBs) at 0.1 C) and long cycle life (the decay rates per cycle of 0.028% and 0.036% after 1500 and 1200 cycles at 2 C in LIBs and SIBs respectively). The successful application in RP anodes displays great potential in other electrode materials such as silicon, sulfur, selenium, and so on. Meanwhile, this strategy is also effective to design other composites materials like MXene and carbon nanotubes, MXene and Graphene, and so on."

In [None]:
# model_out = AutoModelForSequenceClassification.from_pretrained("batterybert-cased-abstract_finetuned_200")
# with torch.no_grad():
#     logits = model_out(text).logits
# predicted_class_id = logits.argmax().item()
# print(predicted_class_id)

In [None]:
classifier(r"MXene nanosheets and ordered porous carbons both have their own advantages and disadvantages. Assembling and combining the advantages of the two will be a good choice for battery electrode hosts of active materials. In this work, an electrostatic separation-adsorption strategy is proposed to realize the ordered alternating self-assembly of MXene nanosheets and ordered porous carbon (MPOC), obtaining a unique wall-like porous material with a high conductivity and interconnected porous nanostructure, which strengthens the transfer rate of electrons and ions simultaneously. Meanwhile, the introduction of N-doping from porous carbon into MPOC prolongs the cycle life. When use red phosphorus (RP) as active materials, the MPOC@RP anode exhibited high-capacity output (2454.3 and 2408.1 mAh g-1 in lithium-ion batteries (LIBs) and sodium-ion batteries (SIBs) at 0.1 C) and long cycle life (the decay rates per cycle of 0.028% and 0.036% after 1500 and 1200 cycles at 2 C in LIBs and SIBs respectively). The successful application in RP anodes displays great potential in other electrode materials such as silicon, sulfur, selenium, and so on. Meanwhile, this strategy is also effective to design other composites materials like MXene and carbon nanotubes, MXene and Graphene, and so on.")

[{'label': 'battery', 'score': 0.9982238411903381}]

In [None]:
classifier(r"Highly crystalline CuFeS2 containing earth-abundant and environmentally friendly elements prepared via a high-temperature synthesis exhibits an excellent electrochemical performance as an anode material in sodium-ion batteries. The initial specific capacity of 460 mAh g-1 increases to 512 mAh g-1 in the 150th cycle and then decreases to a still very high value of 444 mAh g-1 at 0.5 A g-1 in the remaining 550 cycles. Even for a large current density, a pronounced cycling stability is observed. Here, we demonstrate that combining the results of X-ray powder diffraction experiments, pair distribution function analysis, and 23Na NMR and Mössbauer spectroscopy investigations performed at different stages of discharging and charging processes allows elucidation of very complex reaction mechanisms. In the first step after uptake of 1 Na/CuFeS2, nanocrystalline NaCuFeS2 is formed as an intermediate phase, which surprisingly could be recovered during charging. On increasing the Na content, Cu+ is reduced to nanocrystalline Cu, while nanocrystalline Na2S and nanosized elemental Fe are formed in the discharged state. After charging, the main crystalline phase is NaCuFeS2. At the 150th cycle, the mechanisms clearly changed, and in the charged state, nanocrystalline CuxS phases are observed. At later stages of cycling, the mechanisms are altered again: NaF, Cu2S, and Cu7.2S4 appeared in the discharged state, while NaF and Cu5FeS4 are observed in the charged state. In contrast to a typical conversion reaction, nanocrystalline phases play the dominant role, which are responsible for the high reversible capacity and long-term stability.")[0]["label"]

'battery'

In [None]:
val_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai.json'
with open(val_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

label_bert = []
score_bert = []
for i in range(len(val_data["text"])):
    val_label = classifier(val_data["text"][i][:512])
    if val_label[0]["label"] == "battery":
        label_bert.append(1)
        score_bert.append(val_label[0]["score"])
    elif val_label[0]["label"] == "non-battery":
        label_bert.append(0)
        score_bert.append(val_label[0]["score"])

val_data["label_bert"] = label_bert
val_data["score_bert"] = score_bert

# result_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai_bert.json'
# with open(result_path, 'w', encoding='utf-8') as f:
#     json.dump(val_data, f, indent=4, ensure_ascii=False)



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
label_openai = val_data["label_openai"]
label_annotated = val_data["label_annotated"]
print(f"No\tAnno\tOpenAI\tBERT\tBERT Score")

openai_tp = 0
openai_tn = 0
openai_fp = 0
openai_fn = 0
bert_tp = 0
bert_tn = 0
bert_fp = 0
bert_fn = 0

for i in range(len(label_bert)):
    if (label_bert[i] + label_openai[i] + label_annotated[i] != 3) and (label_bert[i] + label_openai[i] + label_annotated[i] != 0):
        print(f"{i}\t{label_annotated[i]}\t{label_openai[i]}\t{label_bert[i]}\t{score_bert[i]}")
        if label_bert[i] == 1 and label_annotated[i] == 0:
            bert_fp += 1
        elif label_bert[i] == 0 and label_annotated[i] == 1:
            bert_fn += 1
        if label_openai[i] == 1 and label_annotated[i] == 0:
            openai_fp += 1
        elif label_openai[i] == 0 and label_annotated[i] == 1:
            openai_fn += 1
    if label_bert[i] == 1 and label_annotated[i] == 1:
        bert_tp += 1
    if label_bert[i] == 0 and label_annotated[i] == 0:
        bert_tn += 1
    if label_openai[i] == 1 and label_annotated[i] == 1:
        openai_tp += 1
    if label_openai[i] == 0 and label_annotated[i] == 0:
        openai_tn += 1
        
print(f"OpenAI True Positive: {openai_tp}")
print(f"OpenAI True Negative: {openai_tn}")
print(f"OpenAI False Positive: {openai_fp}")
print(f"OpenAI False Negative: {openai_fn}")
print(f"Bert True Positive: {bert_tp}")
print(f"Bert True Negative: {bert_tn}")
print(f"Bert False Positive: {bert_fp}")
print(f"Bert False Negative: {bert_fn}")

No	Anno	OpenAI	BERT	BERT Score
4	0	1	0	0.9998950958251953
5	0	1	0	0.9999111890792847
8	1	0	1	0.9956467747688293
9	0	0	1	0.9972032308578491
14	0	0	1	0.997548520565033
17	0	0	1	0.9945766925811768
20	0	0	1	0.9976568222045898
25	0	0	1	0.9974341988563538
27	0	0	1	0.9928406476974487
29	1	1	0	0.9998341798782349
36	0	0	1	0.9977502226829529
44	0	0	1	0.9975800514221191
45	1	1	0	0.9997965693473816
46	0	0	1	0.9955013394355774
49	1	0	0	0.9986640214920044
51	1	1	0	0.9971381425857544
52	0	0	1	0.9957528114318848
53	0	0	1	0.9981735944747925
58	0	0	1	0.9965315461158752
60	0	0	1	0.997140645980835
63	0	1	0	0.999908447265625
64	0	0	1	0.9759599566459656
66	0	0	1	0.9983342289924622
68	1	0	1	0.9898646473884583
71	1	1	0	0.9998682737350464
75	0	0	1	0.9861099720001221
80	0	1	0	0.9999006986618042
88	0	0	1	0.9981135129928589
94	0	0	1	0.9977023005485535
95	0	1	0	0.9999347925186157
97	0	0	1	0.9956747889518738
99	0	0	1	0.9950883984565735
101	0	0	1	0.9979206919670105
102	0	0	1	0.9938217401504517
103	0	1	0	0.9999232292

In [None]:
val_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai_1.json'
with open(val_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

label_bert = []
score_bert = []
for i in range(len(val_data["text"])):
    val_label = classifier(val_data["text"][i][:512])
    if val_label[0]["label"] == "battery":
        label_bert.append(1)
        score_bert.append(val_label[0]["score"])
    elif val_label[0]["label"] == "non-battery":
        label_bert.append(0)
        score_bert.append(val_label[0]["score"])

val_data["label_bert"] = label_bert
val_data["score_bert"] = score_bert

# result_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai_bert.json'
# with open(result_path, 'w', encoding='utf-8') as f:
#     json.dump(val_data, f, indent=4, ensure_ascii=False)



In [None]:
label_openai = val_data["label_openai"]
label_annotated = val_data["label_annotated"]
print(f"No\tAnno\tOpenAI\tBERT\tBERT Score")

openai_fp = 0
openai_fn = 0
bert_fp = 0
bert_fn = 0

for i in range(len(label_bert)):
    if (label_bert[i] + label_openai[i] + label_annotated[i] != 3) and (label_bert[i] + label_openai[i] + label_annotated[i] != 0):
        print(f"{i}\t{label_annotated[i]}\t{label_openai[i]}\t{label_bert[i]}\t{score_bert[i]}")
        if label_bert[i] == 1 and label_annotated[i] == 0:
            bert_fp += 1
        elif label_bert[i] == 0 and label_annotated[i] == 1:
            bert_fn += 1
        if label_openai[i] == 1 and label_annotated[i] == 0:
            openai_fp += 1
        elif label_openai[i] == 0 and label_annotated[i] == 1:
            openai_fn += 1

print(f"OpenAI False Positive: {openai_fp}")
print(f"OpenAI False Negative: {openai_fn}")
print(f"Bert False Positive: {bert_fp}")
print(f"Bert False Negative: {bert_fn}")

No	Anno	OpenAI	BERT	BERT Score
7	0	1	0	0.9234947562217712
11	0	1	0	0.9999006986618042
21	0	1	0	0.999907374382019
23	1	1	0	0.7412803769111633
25	1	1	0	0.9998682737350464
27	1	1	0	0.9998341798782349
34	0	1	0	0.9999347925186157
36	0	1	0	0.9999232292175293
37	1	1	0	0.9997254014015198
39	1	1	0	0.9976949095726013
47	1	1	0	0.9971381425857544
53	0	1	0	0.999908447265625
72	1	1	0	0.9997965693473816
75	1	0	0	0.9986640214920044
79	0	1	0	0.9999111890792847
80	0	1	0	0.9999353885650635
83	0	0	1	0.9979206919670105
88	0	1	0	0.9998925924301147
99	0	1	0	0.9999369382858276
OpenAI False Positive: 10
OpenAI False Negative: 1
Bert False Positive: 1
Bert False Negative: 8
