In [35]:
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

data_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\search_20241106-223705_sodium+ion+battery+anode-sodium+ion+battery+cathode-sodium+ion+battery+electrode_annotated_rephrased_once.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

texts = []
labels_raw = []

for i in data:
    texts.append(i["text"])
    labels_raw.append(i["label_int"])

labels = []
for i in labels_raw:
    if i == 0:
        labels.append(1)
    elif i == 1:
        labels.append(0)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42
)

train_dataset = Dataset.from_dict({
    'text': train_texts,
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'text': val_texts,
    'labels': val_labels
})

raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})
print(len(texts))

200


In [36]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 40
    })
})


In [45]:
model_name = "batterydata/batterybert-cased-abstract"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [46]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, torch_dtype="auto")

In [47]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

In [48]:
import evaluate

metric = evaluate.load("accuracy")

In [49]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [50]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer",
                                  eval_strategy="epoch",
                                  num_train_epochs=10)

In [51]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)


In [52]:
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.5123046636581421, 'eval_accuracy': 0.875, 'eval_runtime': 0.8318, 'eval_samples_per_second': 48.09, 'eval_steps_per_second': 6.011, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.33328187465667725, 'eval_accuracy': 0.9, 'eval_runtime': 0.8785, 'eval_samples_per_second': 45.535, 'eval_steps_per_second': 5.692, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.48736315965652466, 'eval_accuracy': 0.875, 'eval_runtime': 0.8561, 'eval_samples_per_second': 46.726, 'eval_steps_per_second': 5.841, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.24066007137298584, 'eval_accuracy': 0.95, 'eval_runtime': 0.8517, 'eval_samples_per_second': 46.963, 'eval_steps_per_second': 5.87, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.11825376749038696, 'eval_accuracy': 0.95, 'eval_runtime': 0.7716, 'eval_samples_per_second': 51.838, 'eval_steps_per_second': 6.48, 'epoch': 5.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.0028533772565424442, 'eval_accuracy': 1.0, 'eval_runtime': 0.763, 'eval_samples_per_second': 52.425, 'eval_steps_per_second': 6.553, 'epoch': 6.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.0006424685707315803, 'eval_accuracy': 1.0, 'eval_runtime': 0.761, 'eval_samples_per_second': 52.563, 'eval_steps_per_second': 6.57, 'epoch': 7.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.036356765776872635, 'eval_accuracy': 0.975, 'eval_runtime': 0.8041, 'eval_samples_per_second': 49.743, 'eval_steps_per_second': 6.218, 'epoch': 8.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.05163665488362312, 'eval_accuracy': 0.975, 'eval_runtime': 0.7632, 'eval_samples_per_second': 52.409, 'eval_steps_per_second': 6.551, 'epoch': 9.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.0006281362730078399, 'eval_accuracy': 1.0, 'eval_runtime': 0.6714, 'eval_samples_per_second': 59.578, 'eval_steps_per_second': 7.447, 'epoch': 10.0}
{'train_runtime': 633.5906, 'train_samples_per_second': 2.525, 'train_steps_per_second': 0.316, 'train_loss': 0.18963598251342773, 'epoch': 10.0}


TrainOutput(global_step=200, training_loss=0.18963598251342773, metrics={'train_runtime': 633.5906, 'train_samples_per_second': 2.525, 'train_steps_per_second': 0.316, 'total_flos': 420977688576000.0, 'train_loss': 0.18963598251342773, 'epoch': 10.0})

In [53]:
trainer.evaluate()

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.0006281362730078399,
 'eval_accuracy': 1.0,
 'eval_runtime': 0.6836,
 'eval_samples_per_second': 58.516,
 'eval_steps_per_second': 7.315,
 'epoch': 10.0}

In [54]:
trainer.save_model("batterybert-cased-abstract_finetuned_200")

SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })

In [55]:
from transformers import pipeline
classifier = pipeline("text-classification", model="batterybert-cased-abstract_finetuned_200", tokenizer=tokenizer, device=0)

In [16]:
text = r"MXene nanosheets and ordered porous carbons both have their own advantages and disadvantages. Assembling and combining the advantages of the two will be a good choice for battery electrode hosts of active materials. In this work, an electrostatic separation-adsorption strategy is proposed to realize the ordered alternating self-assembly of MXene nanosheets and ordered porous carbon (MPOC), obtaining a unique wall-like porous material with a high conductivity and interconnected porous nanostructure, which strengthens the transfer rate of electrons and ions simultaneously. Meanwhile, the introduction of N-doping from porous carbon into MPOC prolongs the cycle life. When use red phosphorus (RP) as active materials, the MPOC@RP anode exhibited high-capacity output (2454.3 and 2408.1 mAh g-1 in lithium-ion batteries (LIBs) and sodium-ion batteries (SIBs) at 0.1 C) and long cycle life (the decay rates per cycle of 0.028% and 0.036% after 1500 and 1200 cycles at 2 C in LIBs and SIBs respectively). The successful application in RP anodes displays great potential in other electrode materials such as silicon, sulfur, selenium, and so on. Meanwhile, this strategy is also effective to design other composites materials like MXene and carbon nanotubes, MXene and Graphene, and so on."

In [21]:
# model_out = AutoModelForSequenceClassification.from_pretrained("batterybert-cased-abstract_finetuned_200")
# with torch.no_grad():
#     logits = model_out(text).logits
# predicted_class_id = logits.argmax().item()
# print(predicted_class_id)

In [57]:
classifier(r"MXene nanosheets and ordered porous carbons both have their own advantages and disadvantages. Assembling and combining the advantages of the two will be a good choice for battery electrode hosts of active materials. In this work, an electrostatic separation-adsorption strategy is proposed to realize the ordered alternating self-assembly of MXene nanosheets and ordered porous carbon (MPOC), obtaining a unique wall-like porous material with a high conductivity and interconnected porous nanostructure, which strengthens the transfer rate of electrons and ions simultaneously. Meanwhile, the introduction of N-doping from porous carbon into MPOC prolongs the cycle life. When use red phosphorus (RP) as active materials, the MPOC@RP anode exhibited high-capacity output (2454.3 and 2408.1 mAh g-1 in lithium-ion batteries (LIBs) and sodium-ion batteries (SIBs) at 0.1 C) and long cycle life (the decay rates per cycle of 0.028% and 0.036% after 1500 and 1200 cycles at 2 C in LIBs and SIBs respectively). The successful application in RP anodes displays great potential in other electrode materials such as silicon, sulfur, selenium, and so on. Meanwhile, this strategy is also effective to design other composites materials like MXene and carbon nanotubes, MXene and Graphene, and so on.")

[{'label': 'battery', 'score': 0.999872088432312}]

In [58]:
classifier(r"Highly crystalline CuFeS2 containing earth-abundant and environmentally friendly elements prepared via a high-temperature synthesis exhibits an excellent electrochemical performance as an anode material in sodium-ion batteries. The initial specific capacity of 460 mAh g-1 increases to 512 mAh g-1 in the 150th cycle and then decreases to a still very high value of 444 mAh g-1 at 0.5 A g-1 in the remaining 550 cycles. Even for a large current density, a pronounced cycling stability is observed. Here, we demonstrate that combining the results of X-ray powder diffraction experiments, pair distribution function analysis, and 23Na NMR and Mössbauer spectroscopy investigations performed at different stages of discharging and charging processes allows elucidation of very complex reaction mechanisms. In the first step after uptake of 1 Na/CuFeS2, nanocrystalline NaCuFeS2 is formed as an intermediate phase, which surprisingly could be recovered during charging. On increasing the Na content, Cu+ is reduced to nanocrystalline Cu, while nanocrystalline Na2S and nanosized elemental Fe are formed in the discharged state. After charging, the main crystalline phase is NaCuFeS2. At the 150th cycle, the mechanisms clearly changed, and in the charged state, nanocrystalline CuxS phases are observed. At later stages of cycling, the mechanisms are altered again: NaF, Cu2S, and Cu7.2S4 appeared in the discharged state, while NaF and Cu5FeS4 are observed in the charged state. In contrast to a typical conversion reaction, nanocrystalline phases play the dominant role, which are responsible for the high reversible capacity and long-term stability.")[0]["label"]

'battery'

In [74]:
val_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai.json'
with open(val_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

label_bert = []
score_bert = []
for i in range(len(val_data["text"])):
    val_label = classifier(val_data["text"][i][:512])
    if val_label[0]["label"] == "battery":
        label_bert.append(1)
        score_bert.append(val_label[0]["score"])
    elif val_label[0]["label"] == "non-battery":
        label_bert.append(0)
        score_bert.append(val_label[0]["score"])

val_data["label_bert"] = label_bert
val_data["score_bert"] = score_bert

# result_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai_bert.json'
# with open(result_path, 'w', encoding='utf-8') as f:
#     json.dump(val_data, f, indent=4, ensure_ascii=False)



In [77]:
label_openai = val_data["label_openai"]
label_annotated = val_data["label_annotated"]
print(f"No\tAnno\tOpenAI\tBERT\tBERT Score")

openai_tp = 0
openai_tn = 0
openai_fp = 0
openai_fn = 0
bert_tp = 0
bert_tn = 0
bert_fp = 0
bert_fn = 0

for i in range(len(label_bert)):
    if (label_bert[i] + label_openai[i] + label_annotated[i] != 3) and (label_bert[i] + label_openai[i] + label_annotated[i] != 0):
        print(f"{i}\t{label_annotated[i]}\t{label_openai[i]}\t{label_bert[i]}\t{score_bert[i]}")
        if label_bert[i] == 1 and label_annotated[i] == 0:
            bert_fp += 1
        elif label_bert[i] == 0 and label_annotated[i] == 1:
            bert_fn += 1
        if label_openai[i] == 1 and label_annotated[i] == 0:
            openai_fp += 1
        elif label_openai[i] == 0 and label_annotated[i] == 1:
            openai_fn += 1
    if label_bert[i] == 1 and label_annotated[i] == 1:
        bert_tp += 1
    if label_bert[i] == 0 and label_annotated[i] == 0:
        bert_tn += 1
    if label_openai[i] == 1 and label_annotated[i] == 1:
        openai_tp += 1
    if label_openai[i] == 0 and label_annotated[i] == 0:
        openai_tn += 1
        
print(f"OpenAI True Positive: {openai_tp}")
print(f"OpenAI True Negative: {openai_tn}")
print(f"OpenAI False Positive: {openai_fp}")
print(f"OpenAI False Negative: {openai_fn}")
print(f"Bert True Positive: {bert_tp}")
print(f"Bert True Negative: {bert_tn}")
print(f"Bert False Positive: {bert_fp}")
print(f"Bert False Negative: {bert_fn}")

No	Anno	OpenAI	BERT	BERT Score
4	0	1	1	0.9994474053382874
5	0	1	0	0.9999542236328125
8	1	0	1	0.9998757839202881
9	0	0	1	0.9998512268066406
11	1	1	0	0.9999184608459473
13	1	1	0	0.9579297304153442
14	0	0	1	0.9998705387115479
17	0	0	1	0.9998308420181274
20	0	0	1	0.999725878238678
25	0	0	1	0.999792754650116
27	0	0	1	0.9985211491584778
29	1	1	0	0.9984087347984314
35	1	1	0	0.9995049238204956
36	0	0	1	0.9998750686645508
41	1	1	0	0.8142088651657104
43	1	1	0	0.999729573726654
44	0	0	1	0.9998645782470703
45	1	1	0	0.9997357726097107
49	1	0	0	0.9997065663337708
51	1	1	0	0.9910997748374939
52	0	0	1	0.978841245174408
53	0	0	1	0.9998689889907837
58	0	0	1	0.9998679161071777
60	0	0	1	0.99915611743927
63	0	1	0	0.999935507774353
66	0	0	1	0.9998705387115479
68	1	0	1	0.9986616373062134
80	0	1	0	0.9999425411224365
88	0	0	1	0.9998756647109985
94	0	0	1	0.9998705387115479
95	0	1	0	0.9999485015869141
97	0	0	1	0.9993574023246765
99	0	0	1	0.9959127306938171
101	0	0	1	0.999870777130127
103	0	1	0	0.9999450445175171

In [72]:
val_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai_1.json'
with open(val_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

label_bert = []
score_bert = []
for i in range(len(val_data["text"])):
    val_label = classifier(val_data["text"][i][:512])
    if val_label[0]["label"] == "battery":
        label_bert.append(1)
        score_bert.append(val_label[0]["score"])
    elif val_label[0]["label"] == "non-battery":
        label_bert.append(0)
        score_bert.append(val_label[0]["score"])

val_data["label_bert"] = label_bert
val_data["score_bert"] = score_bert

# result_path = r'C:\Users\Subways-Sun\OneDrive\Documents\GitHub\sodium-ion-batteries\data_annotated\annotated_data_openai_bert.json'
# with open(result_path, 'w', encoding='utf-8') as f:
#     json.dump(val_data, f, indent=4, ensure_ascii=False)



In [73]:
label_openai = val_data["label_openai"]
label_annotated = val_data["label_annotated"]
print(f"No\tAnno\tOpenAI\tBERT\tBERT Score")

openai_fp = 0
openai_fn = 0
bert_fp = 0
bert_fn = 0

for i in range(len(label_bert)):
    if (label_bert[i] + label_openai[i] + label_annotated[i] != 3) and (label_bert[i] + label_openai[i] + label_annotated[i] != 0):
        print(f"{i}\t{label_annotated[i]}\t{label_openai[i]}\t{label_bert[i]}\t{score_bert[i]}")
        if label_bert[i] == 1 and label_annotated[i] == 0:
            bert_fp += 1
        elif label_bert[i] == 0 and label_annotated[i] == 1:
            bert_fn += 1
        if label_openai[i] == 1 and label_annotated[i] == 0:
            openai_fp += 1
        elif label_openai[i] == 0 and label_annotated[i] == 1:
            openai_fn += 1

print(f"OpenAI False Positive: {openai_fp}")
print(f"OpenAI False Negative: {openai_fn}")
print(f"Bert False Positive: {bert_fp}")
print(f"Bert False Negative: {bert_fn}")

No	Anno	OpenAI	BERT	BERT Score
7	0	1	0	0.9999476671218872
11	0	1	0	0.9999425411224365
21	0	1	0	0.9999438524246216
27	1	1	0	0.9984087347984314
34	0	1	0	0.9999485015869141
36	0	1	0	0.9999450445175171
39	1	1	0	0.9997223019599915
47	1	1	0	0.9910997748374939
53	0	1	0	0.999935507774353
54	1	1	0	0.999729573726654
57	1	1	0	0.8142088651657104
72	1	1	0	0.9997357726097107
73	1	1	0	0.9995049238204956
75	1	0	0	0.9997065663337708
79	0	1	0	0.9999542236328125
80	0	1	0	0.9999384880065918
83	0	0	1	0.999870777130127
86	1	1	0	0.9579297304153442
88	0	1	0	0.9999287128448486
89	1	1	0	0.9999184608459473
99	0	1	0	0.9999451637268066
OpenAI False Positive: 10
OpenAI False Negative: 1
Bert False Positive: 1
Bert False Negative: 10
