In [6]:
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import torch
from transformers import BertTokenizer
from transformers import pipeline
import pickle
from transformers import AutoTokenizer, AutoModel

In [33]:
# Step 1: Create a label-to-index mapping (Manually defined or inferred from your data)
labels = ["O", "B-organic-chemicals", "I-organic-chemicals", "B-catalyst", "I-catalyst", "B-property values", "I-property values"]

# Create a dictionary mapping labels to integers
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Step 2: Convert labels to indices
def convert_labels_to_indices(labels, label2id):
    return [label2id.get(label, label2id['O']) for label in labels]  # Default to 'O' if not found

In [34]:
# Step 3: Read and parse the CoNLL file
def load_conll_data(filename):
    sentences = []
    labels = []
    sentence = []
    label = []
    
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                token, ner_tag = line.split("\t")  # Assuming space-separated token and label
                sentence.append(token)
                label.append(ner_tag)
            else:
                if sentence:  # If sentence is not empty, store it
                    sentences.append(sentence)
                    labels.append(label)
                sentence = []
                label = []
                
    return sentences, labels

In [35]:
# Step 4: Tokenization function to match BERT's input format
def tokenize_and_align_labels(tokenizer, sentences, labels):
    tokenized_inputs = tokenizer(sentences, padding=True, truncation=True, is_split_into_words=True)
    
    labels_all = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(i)  # word_ids() is available on fast tokenizers
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels_all.append(label_ids)

    tokenized_inputs["labels"] = labels_all
    return tokenized_inputs

In [36]:
# Step 5: Read the data from file
# Assuming your CoNLL file is named 'train.conll'
train_sentences, train_labels = load_conll_data(r'ner-chem\ner\data.conll')

In [37]:
# Step 6: Convert string labels to indices
train_labels_numeric = [convert_labels_to_indices(labels, label2id) for labels in train_labels]

In [38]:
# Step 7: Initialize the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Step 8: Create Dataset with tokenized sentences and numeric labels
train_dataset = Dataset.from_dict({
    "tokens": train_sentences,  # List of tokenized sentences
    "labels": train_labels_numeric,  # Corresponding numeric labels
})

In [39]:
# Step 9: Apply tokenization and label alignment
train_dataset = train_dataset.map(lambda e: tokenize_and_align_labels(tokenizer, e['tokens'], e['labels']), batched=True)

# Step 10: Initialize the model (with label2id and id2label)
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Map: 100%|██████████| 9/9 [00:00<00:00, 1387.31 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
# Step 11: Define the TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    #per_device_train_batch_size=8,
    #per_device_eval_batch_size=8,
    num_train_epochs=10,
    #weight_decay=0.01
)


# Step 13: Initialize the Trainer
trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=train_dataset,          # evaluation dataset (for demonstration purposes, using train for eval)      # evaluation metrics function
)




In [51]:
# Step 14: Train the model
trainer.train()

                                              
 10%|█         | 2/20 [00:03<00:26,  1.46s/it]

{'eval_loss': 0.27185097336769104, 'eval_runtime': 0.4943, 'eval_samples_per_second': 18.206, 'eval_steps_per_second': 4.046, 'epoch': 1.0}


                                              
 20%|██        | 4/20 [00:07<00:25,  1.56s/it]

{'eval_loss': 0.2218555212020874, 'eval_runtime': 0.6934, 'eval_samples_per_second': 12.979, 'eval_steps_per_second': 2.884, 'epoch': 2.0}


                                              
 30%|███       | 6/20 [00:11<00:24,  1.75s/it]

{'eval_loss': 0.1900850087404251, 'eval_runtime': 0.7035, 'eval_samples_per_second': 12.794, 'eval_steps_per_second': 2.843, 'epoch': 3.0}


                                              
 40%|████      | 8/20 [00:15<00:22,  1.88s/it]

{'eval_loss': 0.1614873707294464, 'eval_runtime': 0.6841, 'eval_samples_per_second': 13.156, 'eval_steps_per_second': 2.923, 'epoch': 4.0}


                                               
 50%|█████     | 10/20 [00:20<00:19,  1.97s/it]

{'eval_loss': 0.14277811348438263, 'eval_runtime': 0.8888, 'eval_samples_per_second': 10.127, 'eval_steps_per_second': 2.25, 'epoch': 5.0}


                                               
 60%|██████    | 12/20 [00:25<00:16,  2.09s/it]

{'eval_loss': 0.13137286901474, 'eval_runtime': 0.7811, 'eval_samples_per_second': 11.522, 'eval_steps_per_second': 2.56, 'epoch': 6.0}


                                               
 70%|███████   | 14/20 [00:29<00:12,  2.11s/it]

{'eval_loss': 0.1221974790096283, 'eval_runtime': 0.7307, 'eval_samples_per_second': 12.317, 'eval_steps_per_second': 2.737, 'epoch': 7.0}


                                               
 80%|████████  | 16/20 [00:34<00:08,  2.14s/it]

{'eval_loss': 0.11575402319431305, 'eval_runtime': 0.9749, 'eval_samples_per_second': 9.232, 'eval_steps_per_second': 2.051, 'epoch': 8.0}


                                               
 90%|█████████ | 18/20 [00:40<00:04,  2.35s/it]

{'eval_loss': 0.1110372468829155, 'eval_runtime': 1.0243, 'eval_samples_per_second': 8.786, 'eval_steps_per_second': 1.952, 'epoch': 9.0}


                                               
100%|██████████| 20/20 [00:48<00:00,  2.45s/it]

{'eval_loss': 0.10895165055990219, 'eval_runtime': 1.111, 'eval_samples_per_second': 8.101, 'eval_steps_per_second': 1.8, 'epoch': 10.0}
{'train_runtime': 48.9154, 'train_samples_per_second': 1.84, 'train_steps_per_second': 0.409, 'train_loss': 0.21605362892150878, 'epoch': 10.0}





TrainOutput(global_step=20, training_loss=0.21605362892150878, metrics={'train_runtime': 48.9154, 'train_samples_per_second': 1.84, 'train_steps_per_second': 0.409, 'total_flos': 3628718598780.0, 'train_loss': 0.21605362892150878, 'epoch': 10.0})

In [None]:
# Define the path where you want to save the model and tokenizer
output_dir = "model"  # Replace this with the desired output directory

# Save the trained model
model.save_pretrained(output_dir)

# Save the tokenizer (this is necessary for later use)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")




In [8]:
# Load your model and tokenizer
model = AutoModel.from_pretrained("model")  # Replace "model" with your model's directory
tokenizer = AutoTokenizer.from_pretrained("model")

# Save the model and tokenizer to a .pkl file
with open("model.pkl", "wb") as f:
    pickle.dump((model, tokenizer), f)

print("Model and tokenizer saved as model.pkl")

Some weights of BertModel were not initialized from the model checkpoint at model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer saved as model.pkl


In [2]:
#1. Load the trained model and tokenizer
model_path = "model"  # Replace this with the path where your model is saved
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Use the same tokenizer as used during training
model = BertForTokenClassification.from_pretrained(model_path)

In [3]:
# 2. Load the label mapping (you should have the same id2label and label2id as in training)
id2label = {0: "O", 1: "B-organic-chemicals", 2: "I-organic-chemicals", 3: "B-catalyst", 4: "I-catalyst"}  # Update this if you have more classes
label2id = {label: id for id, label in id2label.items()}
# 3. Initialize the NER pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=-1)  # `device=-1` means CPU; change to `0` for GPU

# 5. Tokenize and make predictions
def predict_entities(text):
    # Use the NER pipeline for prediction
    ner_results = nlp(text)
    
    # Post-process results: match predicted tokens with labels and tokens
    predictions = []
    for result in ner_results:
        entity = result['word']
        label = result['entity']
        score = result['score']
        predictions.append({"entity": entity, "label": label, "score": score})
    
    return predictions


In [4]:
def merge_subwords(predictions):
    """
    Merges subword tokens and keeps only the first subword's label for each word.
    """
    merged_predictions = []
    current_word = ''
    current_label = None
    
    for prediction in predictions:
        token = prediction['entity']
        label = prediction['label']
        score = prediction['score']
        
        # Check if the token is a continuation of the previous one (subword token)
        if token.startswith('##'):
            # If it's a subword token, add it to the current word
            current_word += token[2:]  # Remove the '##' part of the subword token
        else:
            # If it's the first token of a new word, save the previous word and start a new one
            if current_word:
                merged_predictions.append({'entity': current_word, 'label': current_label, 'score': score})
            current_word = token  # Start a new word
            current_label = label  # Set the label for this word
            # No score adjustment here, use the score of the first subword

    # Append the last word
    if current_word:
        merged_predictions.append({'entity': current_word, 'label': current_label, 'score': score})

    return merged_predictions

In [5]:
# 4. Example Input Text
text = input("Enter text")
# 6. Predict entities
predicted_entities = predict_entities(text)

# Merge subword tokens and clean the predictions
merged_predictions = merge_subwords(predicted_entities)
# 7. Print the results
print(text)
print("Predictions:", merged_predictions)


Carbomer refers to a family of synthetic, high-molecular-weight polymers made from acrylic acid as the primary monomer, with small amounts of polyalkenyl polyethers acting as crosslinkers. These polymers are widely synthesized through free-radical polymerization, typically in an aqueous solution or solvent-based systems. The ideal temperature for polymerization is between 70-80°C, where a free-radical initiator (such as benzoyl peroxide or azobisisobutyronitrile (AIBN)) acts as the catalyst to initiate the polymerization reaction. These crosslinked polyacrylate structures are highly hydrophilic, allowing carbomer to swell significantly in water, forming viscous gels even at low concentrations.
Predictions: [{'entity': 'bomer', 'label': None, 'score': np.float32(0.33687964)}, {'entity': 'acrylic', 'label': 'B-organic-chemicals', 'score': np.float32(0.2837343)}, {'entity': 'acidkeneth', 'label': 'B-catalyst', 'score': np.float32(0.45214137)}, {'entity': '70', 'label': 'B-property values'