In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Hugging Face Hub id shared by the model weights and the tokenizer.
MODEL_ID = "Shah1st/mountain-ner-model"

model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Token-classification pipeline built from the objects loaded above; the
# later cells call it once per sentence.
ner_pipeline = pipeline(task="ner", model=model, tokenizer=tokenizer)

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [2]:
# Readable tag names for the generic class ids found in prediction['entity'].
# Ids that are not listed here are shown unchanged.
_LABEL_NAMES = {
    "LABEL_0": "O",
    "LABEL_1": "B-MOUNTAIN",
    "LABEL_2": "I-MOUNTAIN",
}


def _format_word(word, label, score_sum, token_count):
    """Render one merged word as '<word>: <label> (score: <mean score>)'."""
    return f"{word}: {label} (score: {score_sum / token_count:.2f})"


def _merge_subtokens(predictions):
    """Merge '##'-prefixed continuation pieces into words and return display lines."""
    lines = []

    current_word = ""
    current_label = ""
    score_sum = 0
    token_count = 0

    for prediction in predictions:
        word = prediction['word']
        entity = prediction['entity']
        score = prediction['score']
        label = _LABEL_NAMES.get(entity, entity)

        is_continuation = word.startswith("##")
        if is_continuation and current_word:
            # Continuation of the word in progress: extend its text and
            # accumulate the score; the word keeps its first piece's label.
            current_word += word[2:]
            score_sum += score
            token_count += 1
            continue

        # A new word starts here, so emit the finished one first.
        if current_word:
            lines.append(_format_word(current_word, current_label, score_sum, token_count))

        # A continuation piece with no word in progress (e.g. the very first
        # prediction) is treated as a word of its own so that it carries its
        # own label instead of an empty one.
        current_word = word[2:] if is_continuation else word
        current_label = label
        score_sum = score
        token_count = 1

    # Emit the last word, which the loop never flushes.
    if current_word:
        lines.append(_format_word(current_word, current_label, score_sum, token_count))

    return lines


def format_predictions_with_subtokens(text, predictions):
    """
    Print NER predictions for ``text`` with sub-tokens merged into whole words.

    Args:
        text: The sentence the predictions were produced for; printed as a header.
        predictions: Iterable of dicts with the keys 'word', 'entity' and
            'score'. A 'word' that starts with "##" is treated as a
            continuation of the previous word.

    Output:
        One line per merged word in the form
        ``<word>: <label> (score: <score>)``, followed by a separator line.
        The label is that of the word's first piece (LABEL_0/1/2 are shown as
        O / B-MOUNTAIN / I-MOUNTAIN, other ids unchanged) and the score is the
        mean over the word's pieces.

    Returns:
        None; the result is only printed.
    """
    print(f"Text: {text}\n")

    for line in _merge_subtokens(predictions):
        print(line)
    print("\n" + "="*50 + "\n")

In [3]:
# Basic sample sentences: single mountains, mountain ranges, and one sentence
# (the Amazon River) that mentions no mountain at all.
texts = [
    "Mount Everest is the highest mountain in the world.",
    "The Amazon River flows through the rainforest.",
    "The Andes stretch along the western coast of South America.",
    "Mount Kilimanjaro, located in Tanzania, and the Himalayas in Asia are among the most famous mountain ranges."
]

# Run the pipeline on one sentence at a time and print the merged,
# per-word predictions for it.
for text in texts:
    predictions = ner_pipeline(text)
    format_predictions_with_subtokens(text, predictions)

Text: Mount Everest is the highest mountain in the world.

Mount: B-MOUNTAIN (score: 0.88)
Everest: I-MOUNTAIN (score: 0.91)
is: O (score: 0.99)
the: O (score: 0.99)
highest: O (score: 0.99)
mountain: O (score: 0.98)
in: O (score: 0.97)
the: O (score: 0.98)
world: O (score: 0.97)
.: O (score: 0.96)


Text: The Amazon River flows through the rainforest.

The: O (score: 0.93)
Amazon: O (score: 0.83)
River: O (score: 0.98)
flows: O (score: 0.99)
through: O (score: 0.99)
the: O (score: 0.99)
rainforest: O (score: 0.90)
.: O (score: 0.97)


Text: The Andes stretch along the western coast of South America.

The: O (score: 0.65)
Andes: I-MOUNTAIN (score: 0.52)
stretch: O (score: 0.98)
along: O (score: 0.98)
the: O (score: 0.98)
western: O (score: 0.98)
coast: O (score: 0.98)
of: O (score: 0.98)
South: O (score: 0.93)
America: O (score: 0.97)
.: O (score: 0.96)


Text: Mount Kilimanjaro, located in Tanzania, and the Himalayas in Asia are among the most famous mountain ranges.

Mount: B-MOUNTAI

In [4]:
# Harder sample sentences: metaphorical uses, non-mountain landforms
# (Rocky Plains, Grand Canyon, Great Plains), and several mountain names in
# one sentence. Note that this rebinds `texts` from the previous cell.
texts = [
    "The Rocky Plains are located near the Appalachian Mountains.",
    "Climbing the corporate ladder was her Mount Everest.",
    "He visited the Grand Canyon and later hiked through the Alps.",
    "The Andes, Himalayas, and the Rockies form some of the most impressive mountain ranges on Earth.",
    "Mount Fuji, located near Tokyo in Japan, is an iconic symbol of the country.",
    "The Carpathians are more than just mountains; they are a source of life for many species.",
    "Hidden behind the clouds, Mount Rainier casts a shadow over the valley.",
    "His determination was as steadfast as the Rockies.",
    "She trekked through the less-known Tian Shan and then set out to conquer Mount Kilimanjaro.",
    "Between the Rockies and the Appalachian Mountains lies the Great Plains."
]


# Run the pipeline on one sentence at a time and print the merged,
# per-word predictions for it.
for text in texts:
    predictions = ner_pipeline(text)
    format_predictions_with_subtokens(text, predictions)

Text: The Rocky Plains are located near the Appalachian Mountains.

The: O (score: 0.71)
Rocky: O (score: 0.72)
Plains: O (score: 0.72)
are: O (score: 0.98)
located: O (score: 0.99)
near: O (score: 0.98)
the: O (score: 0.96)
Appalachian: B-MOUNTAIN (score: 0.93)
Mountains: I-MOUNTAIN (score: 0.85)
.: O (score: 0.95)


Text: Climbing the corporate ladder was her Mount Everest.

Climbing: O (score: 0.98)
the: O (score: 0.99)
corporate: O (score: 0.99)
ladder: O (score: 0.98)
was: O (score: 0.98)
her: O (score: 0.98)
Mount: B-MOUNTAIN (score: 0.85)
Everest: I-MOUNTAIN (score: 0.89)
.: O (score: 0.97)


Text: He visited the Grand Canyon and later hiked through the Alps.

He: O (score: 0.98)
visited: O (score: 0.99)
the: O (score: 0.95)
Grand: B-MOUNTAIN (score: 0.81)
Canyon: I-MOUNTAIN (score: 0.72)
and: O (score: 0.97)
later: O (score: 0.99)
hiked: O (score: 0.99)
through: O (score: 0.98)
the: O (score: 0.83)
Alps: B-MOUNTAIN (score: 0.93)
.: O (score: 0.97)


Text: The Andes, Himalayas, 

### Short Summary for Each Example:

1. **The Rocky Plains are located near the Appalachian Mountains.**
   - **Appalachian Mountains** were correctly predicted as mountains with `B-MOUNTAIN` and `I-MOUNTAIN` tags.
   - **Rocky Plains** were correctly not identified as mountains.
   - Scores were reasonably high: 0.93 for Appalachian and 0.85 for Mountains.

2. **Climbing the corporate ladder was her Mount Everest.**
   - The model correctly predicted **Mount Everest** as a mountain, despite it being used metaphorically.
   - Scores were decent for Mount (0.85) and Everest (0.89).

3. **He visited the Grand Canyon and later hiked through the Alps.**
   - **Grand Canyon** was incorrectly predicted as a mountain (`B-MOUNTAIN` and `I-MOUNTAIN`), although it's not a mountain.
   - **Alps** were correctly recognized as mountains.
   - A lower score for Canyon (0.72) shows some uncertainty from the model.

4. **The Andes, Himalayas, and the Rockies form some of the most impressive mountain ranges on Earth.**
   - The model recognized **Andes**, **Himalayas**, and **Rockies** correctly, but with low scores for Andes (0.60) and Rockies (0.61), showing lack of confidence.

5. **Mount Fuji, located near Tokyo in Japan, is an iconic symbol of the country.**
   - **Mount Fuji** was correctly predicted as a mountain, but with a low score for Fuji (0.61).
   - The model correctly ignored **Tokyo** as a non-mountain.

6. **The Carpathians are more than just mountains; they are a source of life for many species.**
   - **Carpathians** were recognized as a mountain, but with a somewhat low score (0.79), indicating some uncertainty.

7. **Hidden behind the clouds, Mount Rainier casts a shadow over the valley.**
   - **Mount Rainier** was correctly predicted with reasonably high scores: 0.88 for Mount and 0.93 for Rainier.

8. **His determination was as steadfast as the Rockies.**
   - **Rockies** were correctly predicted, but with a low score (0.66), likely due to the metaphorical usage.

9. **She trekked through the less-known Tian Shan and then set out to conquer Mount Kilimanjaro.**
   - **Tian Shan** and **Mount Kilimanjaro** were both correctly recognized, though Kilimanjaro had a slightly lower score (0.81).

10. **Between the Rockies and the Appalachian Mountains lies the Great Plains.**
    - **Rockies** and **Appalachian Mountains** were correctly predicted, but with slightly lower scores for Rockies (0.79) and Appalachian (0.84), likely due to the more complex context.

### General Conclusions About the Model:

1. **Accuracy in Recognizing Mountains in Simple Contexts**:
   - The model successfully recognizes famous mountains, such as **Mount Everest** and **Mount Kilimanjaro**. In simple sentences with clear mountain references, it assigns high confidence scores.

2. **Difficulty with Rare or Lesser-Known Mountains**:
   - The model shows lower scores for lesser-known mountains, like **Andes** (0.60) and **Carpathians** (0.79), indicating a lack of confidence when recognizing rare or less frequently mentioned mountain names.

3. **Errors in Recognizing Non-Mountain Geographic Entities**:
   - The model incorrectly classified **Grand Canyon** as a mountain, showing that it sometimes confuses non-mountain geographic entities with mountains, especially if their names contain suggestive words like "Grand" or "Canyon."

4. **Challenges with Metaphors and Context**:
   - The model handled metaphorical use, such as **Mount Everest** for challenges, relatively well, but confidence scores are lower in these cases, indicating uncertainty.

5. **Low Confidence in Complex or Long Sentences**:
   - In longer or more complex sentences, like the one with multiple mountains (**Andes, Himalayas, Rockies**), the model tends to have lower confidence scores, possibly due to difficulties processing multiple geographic entities in one sentence.