Notebook Structure:
- Introduction: Overview of the project and objectives.
- Data Loading and Preprocessing: Load the dataset, preprocess, and tokenize.
- Model Training: Fine-tune the BERT model.
- Inference and Post-Processing: Run predictions and process results to display recognized mountain names.
- Visualization of Results: Show identified mountain names in text.

Step-by-Step Notebook Demo

1. Data Creation


In [None]:
import pandas as pd
import re

# Load the tab-separated file into a DataFrame (the file has no header row)
data = pd.read_csv("allCountries_small.txt", sep="\t", header=None, low_memory=False)

# Add column headers (they are missing in the file)
data.columns = ["geonameid", "name", "asciiname", "alternatenames", "latitude", 
                "longitude", "feature_class", "feature_code", "country_code", 
                "cc2", "admin1_code", "admin2_code", "admin3_code", "admin4_code", 
                "population", "elevation", "dem", "timezone", "modification_date"]

# Filter the data for mountains using feature_class = 'T' and appropriate feature_code
mountains = data[(data['feature_class'] == 'T') & 
                 (data['feature_code'].isin(['MT', 'RNG', 'PK']))]

# Collect the mountain names as a plain list of strings (nothing is written to
# disk here). read_csv turns empty fields and NA-like literals into NaN; those
# rows are dropped so every entry passed on to sentence_to_bio is a real string.
mountain_names = mountains['name'].dropna().astype(str).tolist()
print(mountain_names)

def sentence_to_bio(sentence, mountain_name):
    """Tag the whitespace-separated words of a sentence with BIO labels.

    Every contiguous occurrence of the mountain name in the sentence is
    tagged: its first word gets "B-Mountain" and the remaining words get
    "I-Mountain". All other words get "O". Matching is done on whole
    whitespace-delimited words and is case-sensitive.

    Args:
        sentence: The sentence text.
        mountain_name: The mountain name, either as a string or as a list of
            words (which is joined with single spaces).

    Returns:
        A list of (word, tag) tuples, one per word of the sentence.
    """
    # Ensure that mountain_name is a string
    if isinstance(mountain_name, list):
        mountain_name = " ".join(mountain_name)

    words = sentence.split()
    name_words = mountain_name.split()  # split once, not on every iteration
    span = len(name_words)
    bio_tags = ["O"] * len(words)

    # An empty name matches nothing, so every word stays "O".
    if span:
        index = 0
        last_start = len(words) - span
        while index <= last_start:
            # Only a full, in-order match counts. Comparing word by word
            # without position would tag unrelated words that merely equal
            # one of the name's words (e.g. "The" or "de").
            if words[index:index + span] == name_words:
                bio_tags[index] = "B-Mountain"
                bio_tags[index + 1:index + span] = ["I-Mountain"] * (span - 1)
                index += span
            else:
                index += 1

    return list(zip(words, bio_tags))

# Build three template sentences around every mountain name, keeping the name
# next to each sentence so it can be tagged afterwards
sentences = []
for name in mountain_names:
    sentences.extend(
        [
            (f"The mountain {name} is well-known.", name),
            (f"Many tourists visit {name} every year.", name),
            (f"The highest point in the {name} is spectacular.", name),
        ]
    )

# Dump the tagged sentences to disk: one "word<TAB>tag" line per word and a
# blank line after every sentence
with open("mountains_ner_dataset.txt", "w", encoding="utf-8") as dataset_file:
    for text, target_name in sentences:
        dataset_file.writelines(
            f"{token}\t{bio_tag}\n"
            for token, bio_tag in sentence_to_bio(text, target_name)
        )
        dataset_file.write("\n")  # sentence separator


['Roc Meler', 'Pic de les Abelletes', 'Roc de Port Dret', 'Collada del Xeig', 'Bony de Vellatocina', 'Pic de Costa Rodona', 'Pic de Tristaina', 'Cap de Tosa d’Entor', 'Pic de la Tosa de Juclar', 'Torre dels Soldats', 'Pic de Torradella', 'Pic de les Sorobilles', 'Pic del Solà d’Erts', 'Cap del Solà de les Comes', 'Pic de Siscarou', 'Pic de Coma Extremera', 'Pic de la Serrera', 'Pic de Serra Seca', 'Pic de Serra Mitjana', 'Cap de la Serra dels Isards', 'Senyal Negre', 'Senyal de Missa', 'Pic de Sanfons', 'Turó Rodó', 'Serra del Roc del Rellotge', 'Cap de Rep', 'Pic de Racofred Occidental', 'Collet Purgat', 'Turó del Port Vell', 'Pic del Port Vell', 'Bony de la Pleta de Jan', 'Sierra Plana', 'Pic del Pla de l’Ingla', 'Bony de la Pica', "Pic d'Ensagents", 'Pic de Percanela', 'Cap de la Palomera', 'Pic de Palomer', 'Pic de la Pala de Coll Carnisser', 'Pic de la Pala Alta', 'Pic de Padern', 'Cap dels Oriols', 'Coll d’Obac', 'Pic de Noé', 'Bony de les Neres', 'Pic Negre', 'Pic de Monturull',

2. Load Libraries and Set Up the Environment

In [None]:
import random
from transformers import BertTokenizer
import torch
import tensorflow as tf, tf_keras
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from transformers import DataCollatorForTokenClassification

3. Load and Preprocess Data
- This section loads the NER dataset and preprocesses it for BERT. The mountains_ner_dataset.txt file should be in the same directory as this notebook.

In [None]:
def load_sentences_and_labels(file_path):
    """Read a two-column, blank-line-separated tagging file.

    Each non-blank line must contain exactly two whitespace-separated fields
    (word and label); blank lines end the current sentence.

    Args:
        file_path: Path of the UTF-8 encoded dataset file.

    Returns:
        A tuple (token_sentences, token_labels) of parallel lists: one list
        of words and one list of labels per sentence.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    token_sentences, token_labels = [], []
    pending = []  # (word, label) pairs of the sentence being read

    def flush():
        # Move the pending sentence, if any, into the result lists.
        if pending:
            token_sentences.append([word for word, _ in pending])
            token_labels.append([label for _, label in pending])
            pending.clear()

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                flush()  # a blank line closes the current sentence
                continue
            word, label = stripped.split()
            pending.append((word, label))
        flush()  # the file may not end with a blank line

    return token_sentences, token_labels


# Read the BIO-tagged dataset file back as parallel lists: one list of words
# and one list of labels per sentence
sentences, labels = load_sentences_and_labels("mountains_ner_dataset.txt")


def split_data(tag_sentences, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """Cut a sequence into consecutive train / validation / test slices.

    The order of the items is kept (no shuffling). The train and validation
    sizes are the truncated products of their ratio and the total length;
    the test slice is everything that remains.

    Args:
        tag_sentences: The sequence to split.
        train_ratio: Fraction of items for the training slice.
        val_ratio: Fraction of items for the validation slice.
        test_ratio: Accepted for symmetry but not used; the test slice is
            always the remainder.

    Returns:
        A tuple (train_set, val_set, test_set).
    """
    total = len(tag_sentences)
    train_end = int(train_ratio * total)
    val_end = train_end + int(val_ratio * total)
    return (
        tag_sentences[:train_end],
        tag_sentences[train_end:val_end],
        tag_sentences[val_end:],
    )


# Split sentences and labels with the same positional cut so that every
# sentence stays paired with its own label list
(train_set, val_set, test_set), (train_label, val_label, test_label) = (
    split_data(sentences),
    split_data(labels),
)


4. Tokenize and Prepare Data
- Convert words to BERT tokens and prepare them for input.

In [None]:
# Pretrained tokenizer for "bert-base-uncased"; the helper functions below
# read it as a module-level global
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_and_preserve_labels(sentence, labels):
    """Tokenize a sentence word by word and expand the labels to match.

    Each word is split into sub-tokens with the module-level ``tokenizer``.
    The word's label is kept on its first sub-token and every further
    sub-token of that word is labelled "O".

    Args:
        sentence: List of words.
        labels: List of string labels, one per word.

    Returns:
        A tuple (input_ids, label_ids): the vocabulary ids of all sub-tokens
        and a list of string labels of the same length.
    """
    tokenized_sentence = []
    label_ids = []
    for word, label in zip(sentence, labels):
        tokenized_word = tokenizer.tokenize(word)  # Tokenize the word
        if not tokenized_word:
            # The tokenizer produced nothing for this word. Emitting its
            # label anyway would leave label_ids one entry longer than the
            # token list and shift every following label.
            continue
        tokenized_sentence.extend(tokenized_word)
        # Original tag only on the first sub-token, the others get "O".
        # NOTE(review): tagging continuation pieces of a mountain word as "O"
        # is this notebook's existing policy; confirm it is intended.
        label_ids.append(label)
        label_ids.extend(["O"] * (len(tokenized_word) - 1))
    input_ids = tokenizer.convert_tokens_to_ids(
        tokenized_sentence
    )  # Convert tokens to IDs
    return input_ids, label_ids


# Sub-token ids and aligned labels for every training sentence
tokenized_data = [
    tokenize_and_preserve_labels(words, tags)
    for words, tags in zip(train_set, train_label)
]


def add_special_tokens(input_ids, label_ids):
    """Wrap a tokenized sentence in the tokenizer's [CLS] and [SEP] ids.

    Args:
        input_ids: List of sub-token ids.
        label_ids: List of string labels aligned with ``input_ids``.

    Returns:
        A tuple of new lists (input_ids, label_ids); the two special
        positions are labelled "O".
    """
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
    return [cls_id] + input_ids + [sep_id], ["O"] + label_ids + ["O"]


# Same training examples, now framed by the special tokens
final_data = [add_special_tokens(ids, tags) for ids, tags in tokenized_data]


def create_attention_mask(input_ids):
    """Return a list of 1s with one entry per element of ``input_ids``."""
    return [1 for _ in range(len(input_ids))]


# Attach an all-ones attention mask to every training example
final_data_with_masks = [
    (ids, create_attention_mask(ids), tags) for ids, tags in final_data
]


def prepare_data(sentences, labels):
    """Run the full preprocessing chain over paired sentences and labels.

    For every (sentence, labels) pair the words are tokenized, the special
    tokens are added and an attention mask is created.

    Args:
        sentences: List of word lists.
        labels: List of label lists, parallel to ``sentences``.

    Returns:
        A list of (input_ids, attention_mask, label_ids) tuples.
    """

    def encode_pair(words, tags):
        # tokenize -> add special tokens -> build the mask
        ids, tag_seq = add_special_tokens(*tokenize_and_preserve_labels(words, tags))
        return ids, create_attention_mask(ids), tag_seq

    return [encode_pair(words, tags) for words, tags in zip(sentences, labels)]


# Preprocess the three splits the same way
train_data, val_data, test_data = (
    prepare_data(split_sentences, split_labels)
    for split_sentences, split_labels in (
        (train_set, train_label),
        (val_set, val_label),
        (test_set, test_label),
    )
)

# Unique tags in the training labels, including "O" for the background.
# The order must be fixed: iterating a bare set of strings can give a
# different order in every interpreter run, which would change the label ids
# from one training run to the next. "O" is placed first (id 0) and the other
# tags follow alphabetically, so the entity tags get ids 1 and 2, matching the
# "LABEL_1" / "LABEL_2" check used at inference time.
label_list = sorted(
    {label for sentence_labels in train_label for label in sentence_labels},
    key=lambda tag: (tag != "O", tag),
)
num_labels = len(label_list)
label_map = {label: i for i, label in enumerate(label_list)}  # Map labels to IDs

# Load "bert-base-uncased" for token classification with num_labels output
# classes, one per distinct tag found in the training labels
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased", num_labels=num_labels
)


class NERDataset(Dataset):
    """Dataset serving per-example tensors built from parallel Python lists."""

    def __init__(self, encodings, labels):
        """Store the inputs.

        Args:
            encodings: Dict mapping a field name to a list with one entry
                per example.
            labels: List with one sequence of integer label ids per example.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, plus a "labels" entry."""
        item = {}
        for field, per_example in self.encodings.items():
            item[field] = torch.tensor(per_example[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item


# Turn string labels into the integer ids the model is trained on
def encode_labels(labels, label_map):
    """Map every string label to its integer id.

    Args:
        labels: List of label sequences, one per sentence.
        label_map: Dict from label string to integer id.

    Returns:
        A list of lists of integer ids with the same nesting as ``labels``.

    Raises:
        KeyError: If a label is missing from ``label_map``.
    """
    return [[label_map[tag] for tag in tags] for tags in labels]


# Each prepared example is an (input_ids, attention_mask, label_ids) tuple:
# encode the label column to integer ids and gather the two input columns
train_labels_encoded = encode_labels([tags for _, _, tags in train_data], label_map)
val_labels_encoded = encode_labels([tags for _, _, tags in val_data], label_map)

train_encodings = {
    "input_ids": [ids for ids, _, _ in train_data],
    "attention_mask": [mask for _, mask, _ in train_data],
}
val_encodings = {
    "input_ids": [ids for ids, _, _ in val_data],
    "attention_mask": [mask for _, mask, _ in val_data],
}


5. Model Setup and Training
- This part sets up the Trainer, fine-tunes the model on the training split, and evaluates it on the validation split at the end of each epoch.

In [None]:
# Wrap the encoded training and validation splits as datasets
train_dataset = NERDataset(train_encodings, train_labels_encoded)
val_dataset = NERDataset(val_encodings, val_labels_encoded)

# Collator built from the tokenizer; it assembles the variable-length
# examples into batches
data_collator = DataCollatorForTokenClassification(tokenizer)
# NOTE(review): the name of the `evaluation_strategy` argument depends on the
# installed transformers version - confirm it against the version in use.
training_args = TrainingArguments(
    output_dir="./results",  # directory to save the results
    evaluation_strategy="epoch",  # evaluate the model at the end of each epoch
    learning_rate=2e-5,  # initial learning rate
    per_device_train_batch_size=8,  # batch size for training
    per_device_eval_batch_size=8,  # batch size for evaluation
    num_train_epochs=3,  # number of training epochs
    weight_decay=0.01,  # regularization coefficient
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
trainer.train()

# The model and the tokenizer are saved to two different directories; the
# inference cell loads them from these exact paths
trainer.save_model("./best_model")
tokenizer.save_pretrained("./saved_mode2")


6. Inference with the Trained Model
- This section runs inference on a sample sentence to detect mountain names.

In [None]:
from transformers import pipeline

# Build an "ner" pipeline from the model directory and the tokenizer
# directory written by the training cell
ner_pipeline = pipeline("ner", model="./best_model", tokenizer="./saved_mode2")

# Sample text for inference
text = "One of the most breathtaking places for travelers is the Himalayas, home to Mount Khushkak and Kanchenjunga, which attract climbers from all over the world. In Europe, Mont Blanc, the highest peak in the Alps, is also a popular destination for those seeking to conquer summits. In South America, the Andes mountain range stretches for thousands of kilometers, including famous peaks such as Aconcagua and Huascarán.In Africa, Mount Kilimanjaro stands as the highest peak on the continent, easily accessible without specialized climbing gear. Another impressive peak is Mount Meru, located in Tanzania, which is known for its rugged terrain. Moving to North America, the Rocky Mountains boast some of the most iconic mountain landscapes, with peaks like Mount Elbert and Mount Whitney standing tall.In the Pacific region, Mount Fuji in Japan is not only a cultural symbol but also a popular climbing destination. The mountains of New Zealand, like Mount Cook and Mount Taranaki, offer incredible hiking and climbing experiences. Finally, the Alps and the Carpathians continue to draw mountaineers and adventurers looking for challenging terrain and magnificent views."

# Run the pipeline; the code below reads the "entity", "score", "word",
# "start" and "end" keys of each returned item
results = ner_pipeline(text)

# Glue "##" word pieces back onto the token that precedes them
def extract_full_entities(results):
    """Merge "##"-prefixed continuation pieces into the preceding token.

    A token whose word does not start with "##" opens a new merged entry
    copied from that token. Each following "##" piece is appended to the
    entry's word (with "##" removed), moves its "end" forward and lowers
    its score to the minimum seen. The entry keeps the entity label of
    the token that opened it. Pieces that arrive before any opening
    token are discarded.

    Args:
        results: Iterable of dicts with the keys "entity", "score", "word",
            "start" and "end".

    Returns:
        A list of new dicts with the same five keys, one per merged word.
    """
    fields = ("entity", "score", "word", "start", "end")
    merged_entities = []
    # Placeholder used until the first opening token; its entity of None
    # marks it as "nothing to emit".
    pending = dict(zip(fields, (None, 0, "", None, None)))

    for token in results:
        piece = token["word"]
        if not piece.startswith("##"):
            # A new word begins: emit the finished one and start over.
            if pending["entity"] is not None:
                merged_entities.append(pending)
            pending = {field: token[field] for field in fields}
            continue

        # Continuation piece: extend the word that is currently open.
        pending["word"] += piece.replace("##", "")
        pending["end"] = token["end"]
        pending["score"] = min(pending["score"], token["score"])

    if pending["entity"] is not None:
        merged_entities.append(pending)

    return merged_entities


# Merge the sub-tokens of the raw predictions into whole words
final_entities = extract_full_entities(results)
# Keep the words predicted as "LABEL_1" or "LABEL_2" and capitalize them
# NOTE(review): this assumes those two ids are the mountain tags - verify
# against the label map used for training
final_list = [
    merged["word"].capitalize()
    for merged in final_entities
    if merged["entity"] in ("LABEL_1", "LABEL_2")
]
print(final_list)


Summary:
This notebook allows you to:

- Load and preprocess a custom mountain NER dataset.
- Train and fine-tune a BERT model on it.
- Use the model to recognize and highlight mountain names in text.
This demo structure should make it easy for users to follow along and understand the NER process.