We'll use the mountain list we have to create labels from sentences

In [None]:
import pandas as pd
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split
import random

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
mountain_list = pd.read_csv("/content/mountains.csv")
# header=None: the recorded preview of this file shows a full sentence as the
# column name, i.e. the file has no header row and the default header handling
# was silently swallowing the first sentence.
sentences = pd.read_csv("/content/all_sentences.csv", header=None)
# Preview the first few sentences (first and only column).
print(sentences.iloc[:, 0].head())

0         I am not Mount Everest - to be climbed upon!
1    Lack of awareness of waste problem on Mount Ev...
2    K2's recent CODiE Award for "Best Asset Manage...
3    Each content type (K2, Hikashop...) can be con...
4    Vitamin K2 is needed for normal blood coagulat...
Name: Mount Everest - the international name of the world's highest summit., dtype: object


We tokenize the sentences and create tags for the target variable

In [None]:
def create_mountain_patterns(mountain_names):
    """Turn mountain names into lowercase matching patterns.

    Each name is lowercased and split on whitespace. A multi-word name
    becomes a tuple of words; a single-word name becomes a plain string.

    Args:
        mountain_names: Iterable of mountain names. Entries that are not
            strings (e.g. NaN/None from an empty CSV cell) or that contain
            only whitespace are skipped instead of raising.

    Returns:
        A list of patterns (``str`` or ``tuple[str, ...]``) in input order.
    """
    patterns = []
    for name in mountain_names:
        # Missing CSV cells arrive as float NaN / None and have no .lower().
        if not isinstance(name, str):
            continue
        words = name.lower().split()
        # A blank name yields no words; indexing words[0] would fail.
        if not words:
            continue
        patterns.append(tuple(words) if len(words) > 1 else words[0])
    return patterns

def sentence_to_iob(sentence, mountain_patterns):
    """Tokenize a sentence and tag mountain mentions in IOB format.

    Matching is case-insensitive and works on whole tokens. The sentence is
    scanned left to right; at each position the longest pattern that matches
    is tagged and the scan resumes after it, so matches never overlap and a
    token inside a multi-word mountain name is never re-tagged as the start
    of another entity.

    Args:
        sentence: The raw sentence text.
        mountain_patterns: Patterns as produced by
            ``create_mountain_patterns``: a lowercase string for a one-word
            name, or a tuple of lowercase words for a multi-word name.

    Returns:
        A ``(tokens, iob_tags)`` pair of equal-length lists. Tags are
        ``'B-MOUNTAIN'`` for the first token of a match, ``'I-MOUNTAIN'``
        for the remaining tokens of the match, and ``'O'`` otherwise.
    """
    tokens = nltk.tokenize.word_tokenize(sentence)
    lower_tokens = [token.lower() for token in tokens]
    iob_tags = ['O'] * len(tokens)

    # Index patterns by their first word so each token only has to be
    # compared against the patterns that could start there.
    patterns_by_first_word = {}
    for pattern in mountain_patterns:
        words = pattern if isinstance(pattern, tuple) else (pattern,)
        if words:
            patterns_by_first_word.setdefault(words[0], set()).add(words)

    i = 0
    while i < len(lower_tokens):
        # Find the longest pattern that matches starting at token i.
        match_len = 0
        for words in patterns_by_first_word.get(lower_tokens[i], ()):
            n = len(words)
            # A slice running past the end is shorter than the pattern and
            # therefore never compares equal.
            if n > match_len and tuple(lower_tokens[i:i + n]) == words:
                match_len = n

        if match_len == 0:
            i += 1
            continue

        iob_tags[i] = 'B-MOUNTAIN'
        for j in range(1, match_len):
            iob_tags[i + j] = 'I-MOUNTAIN'
        # Skip the matched span so shorter patterns cannot overwrite its tags.
        i += match_len

    return tokens, iob_tags

# Read the CSV files.
mountain_list = pd.read_csv("/content/mountains.csv")
# header=None: the recorded preview of all_sentences.csv shows a full sentence
# as the column name, i.e. the file has no header row. With the default header
# handling the first sentence was consumed as a label and never annotated.
sentences = pd.read_csv("/content/all_sentences.csv", header=None)

# Mountain names are in the first (and only) column of mountains.csv.
# NOTE(review): if mountains.csv also has no header row, its first name is
# being used as the column label and is missing from the list -- confirm.
mountain_names = mountain_list.iloc[:, 0].tolist()

# Create mountain patterns.
mountain_patterns = create_mountain_patterns(mountain_names)
print("Mountain patterns created.")

results = []

# Annotate every sentence; tokens and tags are stored as space-joined strings
# so that the i-th tag lines up with the i-th token.
for sentence in sentences.iloc[:, 0]:
    tokens, iob_tags = sentence_to_iob(sentence, mountain_patterns)
    results.append({
        'Tokens': ' '.join(tokens),
        'Tagged Sentence': ' '.join(iob_tags),
    })

# Create the DataFrame.
df = pd.DataFrame(results)

print(df.head())
df.to_csv("/content/sentences_annotation.csv")

Mountain patterns created.
                                              Tokens  \
0      I am not Mount Everest - to be climbed upon !   
1  Lack of awareness of waste problem on Mount Ev...   
2  K2 's recent CODiE Award for `` Best Asset Man...   
3  Each content type ( K2 , Hikashop ... ) can be...   
4  Vitamin K2 is needed for normal blood coagulat...   

                                     Tagged Sentence  
0            O O O B-MOUNTAIN I-MOUNTAIN O O O O O O  
1              O O O O O O O B-MOUNTAIN I-MOUNTAIN O  
2  B-MOUNTAIN O O O O O O O O O O O O O O O O O O...  
3       O O O O B-MOUNTAIN O O O O O O O O O O O O O  
4  O B-MOUNTAIN O O O O O O O O O O O O O O O O O...  
                                              Tokens  \
0      I am not Mount Everest - to be climbed upon !   
1  Lack of awareness of waste problem on Mount Ev...   
2  K2 's recent CODiE Award for `` Best Asset Man...   
3  Each content type ( K2 , Hikashop ... ) can be...   
4  Vitamin K2 is needed fo

Here we have it. Now we remove the unrelated sentences that occasionally slip in among the example sentences. We do this by dropping every row whose annotation does not contain a named entity.

In [None]:
# A row is kept only if its tag string mentions at least one entity start tag.
# regex=False makes this a plain substring test.
mask = df['Tagged Sentence'].str.contains('B-MOUNTAIN', regex=False)

# Select the matching rows and renumber them from 0.
df_filtered = df.loc[mask].reset_index(drop=True)
df_filtered.to_csv("/content/filtered_sentences_annotation.csv")

We save it to csv, and get ready to start our model