# Installs

In [None]:
# !pip install datasets
# !pip install seqeval
# !pip install transformers
# !pip install huggingface_hub
# !pip install evaluate




This code provides a complete pipeline for training and evaluating a Named Entity Recognition (NER) model using a dataset from Hugging Face. Here's an explanation of the code's key sections:

## 1. Dataset Loading:

* The dataset is loaded from Hugging Face using the load_dataset() function. The dataset in this case is a custom NER dataset ('sigaldanilov/ner_dataset1').

## 2. Tokenizer Initialization:
* The tokenizer, based on the "bert-base-cased" model, is initialized using the AutoTokenizer class. This tokenizer will convert text tokens into a format suitable for input into the BERT model.

## 3. Hashtag Preprocessing:
* The preprocess_hashtags() function removes the leading hashtag symbol (#) from the tokens in the dataset. This ensures that words with hashtags are processed without the symbol, which is particularly useful for social media text like Instagram posts.

## 4. Tokenization and Label Alignment:
* The tokenize_and_align_labels() function tokenizes the input text (splitting words into tokens) while aligning the labels (NER tags) to the corresponding tokens. This is critical because BERT tokenizes words into smaller subwords, and NER labels need to be aligned with these subword tokens.
* It ensures that special and padding tokens are ignored (labeled as -100) and that a label is only applied to the first token of each word.

## 5. Model Initialization:
* The BERT model for token classification (AutoModelForTokenClassification) is initialized. The number of labels is set based on the NER tags in the dataset.

## 6. Metrics Calculation:
* The compute_metrics() function calculates evaluation metrics using the seqeval library, which is a standard for sequence labeling tasks like NER. The function computes the precision, recall, F1 score, and accuracy by comparing the model's predictions to the actual labels.

## 7. Training Setup:
* TrainingArguments defines hyperparameters for training, including batch size, learning rate, number of epochs, and weight decay (used to regularize the model).
* The Trainer class is then initialized with the model, training arguments, datasets, tokenizer, and metrics function. This class handles the training loop, evaluation, and saving the model.

## 8. Model Training:
* The trainer.train() command starts the training process, where the model learns to predict the correct NER tags for each token.

## 9. Evaluation:
* After training, the model is evaluated on the test dataset using trainer.evaluate(). The results include performance metrics like precision, recall, and F1 score.

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
from evaluate import load  # Use `evaluate` for metrics
import numpy as np

# Download the NER dataset from the Hugging Face Hub. Its "train",
# "validation" and "test" splits are all used further down in this cell.
dataset = load_dataset('sigaldanilov/ner_dataset1')

# Load the tokenizer of the same checkpoint ("bert-base-cased") that is
# fine-tuned below, so token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Preprocessing function to remove hashtags
def preprocess_hashtags(examples):
    """Strip one leading '#' from every token of a single example.

    Args:
        examples: mapping with a 'tokens' entry holding the example's list of
            word strings. The mapping is updated in place.

    Returns:
        The same mapping, with 'tokens' replaced by the cleaned list. Only the
        first character is removed, so '##tag' becomes '#tag'.
    """
    cleaned_tokens = []
    for word in examples['tokens']:
        if word[:1] == '#':
            word = word[1:]
        cleaned_tokens.append(word)

    examples['tokens'] = cleaned_tokens
    return examples

# Run preprocess_hashtags over every split. map() is called without
# batched=True, so the function receives one example at a time and
# examples['tokens'] is that example's list of words.
preprocessed_datasets = dataset.map(preprocess_hashtags)

# Tokenization and label alignment function
def tokenize_and_align_labels(examples, max_length=128):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Every sequence is truncated/padded to ``max_length`` tokens. Only the first
    sub-token of each word carries that word's tag; special tokens, padding and
    the remaining sub-tokens of a word get -100, the sentinel that
    ``compute_metrics`` filters out (and that the training loss is expected to
    ignore).

    Args:
        examples: batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of per-word tag-id lists).
        max_length: fixed sequence length used for truncation, padding and the
            label rows. Defaults to 128, the value previously hard-coded in
            three separate places.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        ``max_length`` label ids per example.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Ensures all sequences are the same length
        is_split_into_words=True,
        max_length=max_length,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            # None means this token is a special token (like padding)
            if word_idx is None:
                label_ids.append(-100)
            # Only add label for the first token of each word
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        # Defensive: keep the label row exactly max_length long even if the
        # tokenizer returned a different number of positions.
        label_ids = label_ids[:max_length]
        label_ids.extend([-100] * (max_length - len(label_ids)))

        labels.append(label_ids)

    # Return the tokenized inputs with the aligned labels
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization function to the preprocessed dataset
tokenized_datasets = preprocessed_datasets.map(tokenize_and_align_labels, batched=True)

# Tag names in class-index order, read once from the dataset's ClassLabel feature
label_names = dataset["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Initialize the model. Passing id2label/label2id stores the real tag names in
# the model config, so a saved or pushed checkpoint reports e.g. "B-LOC"
# instead of the generic "LABEL_1".
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# seqeval scores whole entity spans rather than individual tokens
metric = load("seqeval")

# Define the compute_metrics function
def compute_metrics(p):
    """Turn raw evaluation output into seqeval precision/recall/F1/accuracy.

    Args:
        p: pair ``(predictions, labels)``; predictions are per-token class
            scores (argmax is taken over axis 2) and labels are the gold label
            ids, with -100 marking positions to skip.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        seqeval "overall_*" results.
    """
    class_scores, gold_ids = p
    predicted_ids = np.argmax(class_scores, axis=2)

    # Gold tag names, skipping ignored positions.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([label_names[tag_id] for tag_id in gold_row if tag_id != -100])

    # Predicted tag names at exactly the positions that carry a gold label.
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [
            label_names[predicted_id]
            for predicted_id, tag_id in zip(predicted_row, gold_row)
            if tag_id != -100
        ]
        true_predictions.append(kept)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

# Training hyperparameters. Checkpoints and other artefacts go to ./results;
# the validation split is evaluated once per epoch (eval_strategy="epoch").
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Wire the model, data splits, tokenizer and metric function into the Trainer.
# The "test" split is deliberately not passed here; it is only used for the
# final evaluation below.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Final evaluation on the held-out test split
test_dataset = tokenized_datasets["test"]
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Print the test metrics (keys are prefixed with "eval_", e.g. "eval_f1")
print(test_results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/764 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/20.2k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.59k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/202 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/43 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/44 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.572084,0.6,0.193548,0.292683,0.859438
2,No log,0.341409,0.595745,0.451613,0.513761,0.89759
3,No log,0.300577,0.604167,0.467742,0.527273,0.903614


{'eval_loss': 0.34423354268074036, 'eval_precision': 0.56, 'eval_recall': 0.4444444444444444, 'eval_f1': 0.4955752212389381, 'eval_accuracy': 0.8854961832061069, 'eval_runtime': 16.8993, 'eval_samples_per_second': 2.604, 'eval_steps_per_second': 0.178, 'epoch': 3.0}


## 1. Save the model:
* This line saves the entire trained model (in this case, the NER model) into the specified directory ./ner_bart_model.
* The save_pretrained() function from the Hugging Face transformers library saves:
 * The model's architecture (how it was built).
 * The learned parameters/weights after training.
 * The configuration file (config.json), which includes details such as the number of labels, model type, and other model-specific configurations.
* By saving the model, you can easily load it later without retraining, allowing you to reuse the trained model for making predictions or further fine-tuning.

## 2. Save the tokenizer:
* This line saves the tokenizer into the specified directory ./ner_bart_tokenizer.
* The tokenizer is responsible for converting raw text into the numerical format that the model can process (such as token IDs).
* save_pretrained() saves:
 * The tokenizer vocabulary (vocab.txt or similar).
 * A tokenizer configuration file (tokenizer_config.json), which stores details about how the tokenizer was created (e.g., type of tokenizer, special tokens).
* Saving the tokenizer ensures that when you use the model later for predictions, the text will be tokenized in the same way as during training.

In [None]:
# Save the fine-tuned model (weights + config) to a local directory.
# NOTE: the directory names say "bart", but the checkpoint is BERT
# ("bert-base-cased"). The names are kept because the upload cell reads from
# these exact paths.
model.save_pretrained('./ner_bart_model')

# Save the tokenizer files to their own directory
tokenizer.save_pretrained('./ner_bart_tokenizer')

('./ner_bart_tokenizer/tokenizer_config.json',
 './ner_bart_tokenizer/special_tokens_map.json',
 './ner_bart_tokenizer/vocab.txt',
 './ner_bart_tokenizer/added_tokens.json',
 './ner_bart_tokenizer/tokenizer.json')


This code logs into Hugging Face's platform using your API token. It allows you to interact with Hugging Face Hub, such as uploading models or accessing private repositories, programmatically. The login() function authenticates your account using the token provided.

In [None]:
import os

from huggingface_hub import login

# Never hard-code an access token in a notebook: anyone who can read the file
# (or its git history) gets write access to the account. The token that was
# previously embedded here must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
#
# Read the token from the HF_TOKEN environment variable (e.g. populated from
# Colab secrets); if it is not set, fall back to the interactive prompt.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    login()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


This code uploads your trained model and tokenizer to Hugging Face Hub.

In [None]:
from huggingface_hub import HfApi

# Client for the Hugging Face Hub HTTP API (uses the credentials stored by login())
api = HfApi()

# Target repository in "<namespace>/<repo-name>" form; change it to a
# repository under your own account.
repo_id = "sigaldanilov/bertmodel"

# Create the repository on Hugging Face; exist_ok=True makes this a no-op if
# it already exists.
api.create_repo(repo_id=repo_id, exist_ok=True)

# Upload the saved model directory. Both uploads target the repository root,
# so model and tokenizer files end up side by side in one repo.
api.upload_folder(
    folder_path='./ner_bart_model',  # Local folder written by model.save_pretrained()
    path_in_repo='.',                # Upload to the root of the repository
    repo_id=repo_id,                 # Full repository ID on Hugging Face
    commit_message='Upload NER BART model'
)

# Upload the saved tokenizer directory to the same repository root
api.upload_folder(
    folder_path='./ner_bart_tokenizer',  # Local folder written by tokenizer.save_pretrained()
    path_in_repo='.',                    # Upload to the root of the repository
    repo_id=repo_id,                     # Full repository ID on Hugging Face
    commit_message='Upload NER BART tokenizer'
)


model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sigaldanilov/bertmodel/commit/2aed8028e44490eecfab64583a3afae4ba013ba2', commit_message='Upload NER BART tokenizer', commit_description='', oid='2aed8028e44490eecfab64583a3afae4ba013ba2', pr_url=None, pr_revision=None, pr_num=None)

In summary, this code implements several techniques such as careful tokenization, early stopping, lower learning rate, and saving the best model to optimize the model’s performance during training and evaluation.

In [None]:
import torch  # Import torch for PyTorch operations
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
from evaluate import load  # Use `evaluate` for metrics
import numpy as np

# Reload the NER dataset from the Hugging Face Hub so this cell can be run on
# its own; the "train", "validation" and "test" splits are used below.
dataset = load_dataset('sigaldanilov/ner_dataset1')

# Tokenizer of the checkpoint that is fine-tuned below ("bert-base-cased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Preprocessing function to remove hashtags
def preprocess_hashtags(examples):
    """Remove a single leading '#' from each token of one example.

    The 'tokens' entry of ``examples`` is overwritten with the cleaned list and
    the same mapping object is returned. Tokens that do not start with '#' are
    left untouched.
    """
    def without_hash(token):
        if token.startswith('#'):
            return token[1:]
        return token

    examples['tokens'] = list(map(without_hash, examples['tokens']))
    return examples

# Run preprocess_hashtags over every split, one example at a time
# (map() is called without batched=True).
preprocessed_datasets = dataset.map(preprocess_hashtags)

# Fixed sequence length used by tokenize_and_align_labels for truncation,
# padding and label rows. Intended to cover about 95% of the sentences.
# NOTE(review): the statistics behind 70 are not computed in this notebook;
# re-check the value if the dataset changes.
max_length = 70  # Adjusted based on dataset statistics

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Sequences are truncated/padded to the module-level ``max_length``. The
    first sub-token of every word receives the word's tag; special tokens,
    padding and later sub-tokens receive -100.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of per-word tag-id lists).

    Returns:
        The tokenizer output with an extra "labels" entry (one list of
        ``max_length`` ids per example).
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    ignore_index = -100
    labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        aligned = []
        last_seen = None
        for current in tokenized_inputs.word_ids(batch_index=row):
            # A position is labelled only when it opens a new word.
            starts_new_word = current is not None and current != last_seen
            aligned.append(word_tags[current] if starts_new_word else ignore_index)
            last_seen = current

        # Force the row to exactly max_length entries (pad, then cut).
        aligned = (aligned + [ignore_index] * max_length)[:max_length]
        labels.append(aligned)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply the tokenization function to the preprocessed dataset
tokenized_datasets = preprocessed_datasets.map(tokenize_and_align_labels, batched=True)

# Tag names in class-index order, read once from the dataset's ClassLabel feature
label_names = dataset["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Initialize the model. Passing id2label/label2id stores the real tag names in
# the model config, so the checkpoint that is later saved and pushed to the Hub
# reports e.g. "B-LOC" instead of the generic "LABEL_1".
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# seqeval scores whole entity spans rather than individual tokens
metric = load("seqeval")

# Define the compute_metrics function
def compute_metrics(p):
    """Score predictions with seqeval, ignoring positions labelled -100.

    Args:
        p: pair ``(predictions, labels)`` as handed over by the Trainer;
            predictions hold per-token class scores, labels the gold ids.

    Returns:
        dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    def names_where_labelled(ids, gold):
        # Map ids to tag names, keeping only positions whose gold id is not -100.
        return [label_names[i] for i, g in zip(ids, gold) if g != -100]

    raw_scores, labels = p
    predictions = np.argmax(raw_scores, axis=2)

    true_labels = [names_where_labelled(row, row) for row in labels]
    true_predictions = [
        names_where_labelled(predicted_row, gold_row)
        for predicted_row, gold_row in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)

    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}
from transformers import EarlyStoppingCallback

# Training hyperparameters. Evaluation and checkpointing both happen once per
# epoch so that load_best_model_at_end can pick the checkpoint with the best
# validation F1.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save model at the end of each epoch to match eval_strategy
    learning_rate=1e-5,  # Lower learning rate for careful fine-tuning
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,  # Upper bound only; early stopping usually ends training sooner
    weight_decay=0.01,
    logging_dir="./logs",  # Where to store logs
    logging_steps=100,  # Log the training loss every 100 optimizer steps
    load_best_model_at_end=True,  # Reload the best checkpoint when training finishes
    metric_for_best_model="f1",  # "f1" is the key returned by compute_metrics
    greater_is_better=True,  # Higher F1 is better
)

# Initialize the Trainer with early stopping on the validation metric.
# The "test" split is kept out of training and model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop after 3 evaluations without F1 improvement
)

# Fine-tune the model
trainer.train()

# Final evaluation on the held-out test split
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

# Print the test metrics (keys are prefixed with "eval_", e.g. "eval_f1")
print(test_results)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.121937,0.272727,0.05,0.084507,0.805274
2,No log,0.49345,0.5,0.333333,0.4,0.880325
3,No log,0.334821,0.590909,0.433333,0.5,0.89858
4,No log,0.258225,0.680851,0.533333,0.598131,0.924949
5,No log,0.238371,0.679245,0.6,0.637168,0.924949
6,No log,0.22152,0.654545,0.6,0.626087,0.933063
7,No log,0.204239,0.698113,0.616667,0.654867,0.941176


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.121937,0.272727,0.05,0.084507,0.805274
2,No log,0.49345,0.5,0.333333,0.4,0.880325
3,No log,0.334821,0.590909,0.433333,0.5,0.89858
4,No log,0.258225,0.680851,0.533333,0.598131,0.924949
5,No log,0.238371,0.679245,0.6,0.637168,0.924949
6,No log,0.22152,0.654545,0.6,0.626087,0.933063
7,No log,0.204239,0.698113,0.616667,0.654867,0.941176
8,0.481000,0.201354,0.735849,0.65,0.690265,0.947262
9,0.481000,0.204812,0.666667,0.6,0.631579,0.943205
10,0.481000,0.20703,0.722222,0.65,0.684211,0.947262


{'eval_loss': 0.2556576430797577, 'eval_precision': 0.6949152542372882, 'eval_recall': 0.6507936507936508, 'eval_f1': 0.6721311475409837, 'eval_accuracy': 0.9236641221374046, 'eval_runtime': 10.2763, 'eval_samples_per_second': 4.282, 'eval_steps_per_second': 0.292, 'epoch': 14.0}


In [None]:
# Save the model and tokenizer into the same directory, so the folder can be
# loaded later with from_pretrained(model_save_path) for both objects.
model_save_path = "./ner_model"

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/vocab.txt',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')

In [None]:
# Publish the retrained model and its tokenizer to the same Hub repository that
# the earlier upload cell used; this adds new commits on top of that upload.
# Requires the session to be logged in to the Hugging Face Hub.
model.push_to_hub("sigaldanilov/bertmodel")
tokenizer.push_to_hub("sigaldanilov/bertmodel")


model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sigaldanilov/bertmodel/commit/43dbf8e9e9cad3e3922ecf9a6b68dcc60363d222', commit_message='Upload tokenizer', commit_description='', oid='43dbf8e9e9cad3e3922ecf9a6b68dcc60363d222', pr_url=None, pr_revision=None, pr_num=None)

This function, predict_ner(), takes input text and uses the fine-tuned Named Entity Recognition (NER) model to identify and extract locations (e.g., cities or countries) from the text. Here's how it works:

1. Tokenization: The input text is split into tokens using a tokenizer.
2. Model Prediction: The model processes the tokenized text to predict the NER labels for each token (e.g., "B-LOC" or "I-LOC" for locations).
3. Label Mapping: It maps the predicted class indices back to the corresponding label names (e.g., "B-LOC").
4. Location Extraction: It collects tokens identified as part of locations (B-LOC, I-LOC) and merges WordPiece continuation tokens back into their word (e.g., "tel" followed by "##aviv" forms "telaviv").
5. Return: The function returns a list of identified locations from the text.

In [None]:
def predict_ner(text, model, tokenizer, label_names):
    """
    Return the location words that the NER model finds in ``text``.

    The text is split on whitespace and one leading '#' is stripped from each
    word, mirroring ``preprocess_hashtags`` so inference sees the same kind of
    tokens as training. A word is reported as a location when its first
    WordPiece is predicted as B-LOC or I-LOC; the word's "##" continuation
    pieces are glued back onto it. Each location word is returned as its own
    list entry (multi-word names are not merged).

    Args:
    - text: The input text as a string.
    - model: The fine-tuned token-classification model. It is called as
      ``model(**inputs)`` and must return an object with a ``logits`` tensor.
    - tokenizer: The tokenizer used for tokenizing the text.
    - label_names: The list of label names (e.g., ['O', 'B-LOC', 'I-LOC', ...]),
      indexed by predicted class id.

    Returns:
    - locations: A list of identified location words in order of appearance;
      an empty list when the text contains no words.
    """
    location_labels = ("B-LOC", "I-LOC")

    # Same hashtag normalisation as at training time; drop words that become empty.
    words = [word[1:] if word.startswith("#") else word for word in text.split()]
    words = [word for word in words if word]
    if not words:
        return []

    tokenized_input = tokenizer(words, return_tensors="pt", is_split_into_words=True)

    # After training the model may live on a GPU; inputs must be on the same device.
    device = getattr(model, "device", None)
    if device is not None:
        tokenized_input = {name: tensor.to(device) for name, tensor in tokenized_input.items()}

    # Get predictions from the model
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculations for evaluation
        outputs = model(**tokenized_input)

    # Highest-scoring class per token. tolist() works for CPU and CUDA tensors,
    # whereas .numpy() raises on CUDA.
    predictions = torch.argmax(outputs.logits, dim=2)
    predicted_labels = [label_names[idx] for idx in predictions[0].tolist()]

    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0].tolist())

    # Special tokens such as [CLS]/[SEP] were labelled -100 during training, so
    # the model's predictions for them are arbitrary and must be ignored.
    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))

    locations = []
    current_location = ""
    for token, label in zip(tokens, predicted_labels):
        if token.startswith("##"):
            # A continuation piece belongs to the current word. Keep it only if
            # that word is a location, so pieces of ordinary words no longer
            # start a spurious "location".
            if current_location:
                current_location += token[2:]
            continue

        # A new word starts here: flush the location collected so far.
        if current_location:
            locations.append(current_location)
            current_location = ""

        if label in location_labels and token not in special_tokens:
            current_location = token

    # Add the last location if there is one
    if current_location:
        locations.append(current_location)

    return locations

# Example usage: a short multi-line text mixing hashtagged and plain place names
input_text = """
I visited #israel last year and had an amazing time in #telaviv.
Then, I traveled to Haifa and Jerusalem.
"""

# Run the model on the text. `model`, `tokenizer` and `label_names` are the
# objects created in the training cell.
identified_locations = predict_ner(input_text, model, tokenizer, label_names)

# Print the identified locations
print(f"Identified Locations: {identified_locations}")


Identified Locations: ['israel', 'telaviv', 'lv', 'Haifa', 'Jerusalem']
