In [1]:
######################################## Pre-Training ####################################################

import pandas as pd
import torch
import re
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, TrainerCallback
from sklearn.model_selection import train_test_split

# Define custom tokenization function for 'composition'
def custom_tokenize(composition):
    matches = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    sorted_matches = sorted(matches, key=lambda x: x[0])
    tokens = []
    for match in sorted_matches:
        element, fraction = match
        token = f"{element}{fraction}"  # Combine element and fraction
        tokens.append(token)
    return ' '.join(tokens)

# Test the function
print(custom_tokenize("Co1.2 Fe0.8 Ni1"))

# Load your unlabeled data
unlabeled_data = pd.read_csv('6K.csv') #<---------------------------------------------------------------------------------

# Apply custom tokenization to 'composition' column
unlabeled_data['custom_composition'] = unlabeled_data['composition'].apply(custom_tokenize)

# Convert numeric columns to strings
numeric_cols = unlabeled_data.select_dtypes(['float64', 'int64']).columns
for col in numeric_cols:
    unlabeled_data[col] = unlabeled_data[col].astype(str)

# Concatenate them with the custom composition tokens
unlabeled_data['concat_text'] = unlabeled_data['custom_composition'] + ' ' + unlabeled_data[numeric_cols].agg(' '.join, axis=1)

# Split the data into training and validation sets
train_texts, val_texts = train_test_split(unlabeled_data['concat_text'].values, test_size=0.2, random_state=42)

# Tokenize using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_inputs = tokenizer(list(train_texts), padding=True, truncation=True, return_tensors="pt", max_length=512)
val_inputs = tokenizer(list(val_texts), padding=True, truncation=True, return_tensors="pt", max_length=512)

# Custom Dataset class for MLM
class MLM_Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Convert tokenized inputs to custom Dataset
train_dataset = MLM_Dataset(train_inputs)
val_dataset = MLM_Dataset(val_inputs)

# Data collator for MLM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Initialize BERT model with MLM head
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

class SaveBestModelCallback(TrainerCallback):
    """A custom callback to save the best model based on validation loss."""
    def __init__(self):
        super().__init__()
        self.best_loss = float('inf')

    def on_evaluate(self, args, state, control, **kwargs):
        if state.log_history:
            eval_loss = state.log_history[-1].get("eval_loss")
            if eval_loss and eval_loss < self.best_loss:
                self.best_loss = eval_loss
                print(f"New best model with loss: {eval_loss}, saving model...")
                model.save_pretrained("d:/6K_Pretraining")   #<---------------------------------------------
                tokenizer.save_pretrained("d:/6K_Pretraining") #<------------------------------------------

# Training arguments
training_args = TrainingArguments(
    output_dir="d:/6K_Pretraining", #<---------------------------------------------------------------
    overwrite_output_dir=True,
    num_train_epochs=40,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=50
)

# Initialize Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[SaveBestModelCallback()]
)

# Train the model
trainer.train()

# Save the final model
trainer.save_model("d:/6K_Pretraining") #<------------------------------------------------------------------------


  from .autonotebook import tqdm as notebook_tqdm


Co1.2 Fe0.8 Ni1


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

[34m[1mwandb[0m: Currently logged in as: [33mhvof[0m ([33mhvofspray[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,2.1003,1.919396
2,1.8838,1.826399
3,1.7634,1.760216
4,1.7532,1.732641
5,1.6797,1.660034
6,1.6971,1.674681
7,1.6669,1.665483
8,1.6726,1.647842
9,1.6388,1.629345
10,1.6193,1.637127


New best model with loss: 1.9193962812423706, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.8263986110687256, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.7602158784866333, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.7326405048370361, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.6600340604782104, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.6478416919708252, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.6293445825576782, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.6088182926177979, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.5830131769180298, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.5593374967575073, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.5261154174804688, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.522301197052002, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.5009446144104004, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.4997190237045288, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.4783862829208374, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.4746463298797607, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.4703161716461182, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.4542824029922485, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.4487982988357544, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


New best model with loss: 1.433034896850586, saving model...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
