# In [None]:
import pandas as pd
import torch
import re
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Define custom tokenization function for 'composition'
def custom_tokenize(composition):
    """Turn a composition string into space-separated element/amount tokens.

    Each capitalised symbol (one uppercase letter plus any lowercase letters)
    that is immediately followed by a run of digits and/or dots is captured.
    The captures are ordered by symbol and emitted as ``<symbol><amount>``.
    A symbol with no amount directly after it is not matched, so it does not
    appear in the result.
    """
    pairs = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    # Sort on the symbol only; the sort is stable, so repeated symbols keep
    # the order in which they appeared in the input.
    pairs.sort(key=lambda pair: pair[0])
    return ' '.join(f"{symbol}{amount}" for symbol, amount in pairs)

# Quick sanity check of the tokenizer on a hand-written composition
print(custom_tokenize("Co1.2 Fe0.8 Ni1"))

# Read the unlabeled compositions table
unlabeled_data = pd.read_csv('compositions_400K_Features_Rounded_Cleaned.csv')

# Derive the element-sorted token string from each row's 'composition'
unlabeled_data['custom_composition'] = unlabeled_data['composition'].apply(custom_tokenize)

# Stringify every float64/int64 column in place so the values can be joined as text
numeric_cols = unlabeled_data.select_dtypes(['float64', 'int64']).columns
for col in numeric_cols:
    unlabeled_data[col] = unlabeled_data[col].astype(str)

# Build one text per row: composition tokens first, then all numeric columns,
# separated by single spaces.
# Disabled variant that additionally inserts the 'Phase' column after the tokens:
# unlabeled_data['concat_text'] = unlabeled_data['custom_composition'] + ' ' + unlabeled_data['Phase'].astype(str) + ' ' + unlabeled_data[numeric_cols].agg(' '.join, axis=1)
unlabeled_data['concat_text'] = (
    unlabeled_data['custom_composition']
    + ' '
    + unlabeled_data[numeric_cols].agg(' '.join, axis=1)
)

# Row texts that will be fed to the tokenizer
texts = unlabeled_data['concat_text'].values

# Encode all texts with the pretrained BERT tokenizer, padded/truncated
# (max_length=512) and returned as PyTorch tensors
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(
    texts.tolist(),
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Custom Dataset class for MLM
class MLM_Dataset(Dataset):
    """Map-style dataset that serves tokenizer output one example at a time.

    ``encodings`` is a mapping-like object providing ``items()`` and an
    ``"input_ids"`` entry; every value must be indexable by example position
    (for instance the batch returned by a tokenizer call, or a plain ``dict``
    of tensors/lists).
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return ``{field: tensor}`` for the example at position ``idx``.

        Each returned tensor is an independent copy of the stored row.
        """
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) warns on every call; clone().detach()
                # is the recommended way to copy an existing tensor.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Return the number of examples, taken from the ``input_ids`` field."""
        # Key lookup works for both tokenizer batches and plain dicts.
        return len(self.encodings["input_ids"])

# Wrap the tokenized inputs so the Trainer can fetch them per example
train_dataset = MLM_Dataset(inputs)

# Collator for masked-language modelling, configured with mlm_probability=0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

# Pretrained BERT checkpoint with the masked-LM head
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Training configuration: output/checkpoint location, schedule, logging
training_args = TrainingArguments(
    output_dir="./results/pretrained_BERT_400k",
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    num_train_epochs=100,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)

# Build the Trainer, run training, then write the final model to the output directory
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model("./results/pretrained_BERT_400k")
