###### 1. Import regular expressions for text processing
###### 2. Import pandas for data manipulation
###### 3. Import abstract syntax tree for safely evaluating strings

In [1]:
import re
import pandas as pd
import ast

###### Read the CSV file containing the dataset with markup and convert the 'marker' column from string representation of lists to actual lists

In [2]:
df = pd.read_csv('data/mountain_dataset_with_markup.csv')
df['marker'] = df['marker'].apply(ast.literal_eval)

 ###### Function to tag words in a sentence based on given markers for mountain names

In [3]:
def tag_sentence_corrected(sentence, markers):
    print(sentence)
    words = re.findall(r"(?:#\w+|St\.|\b\w+(?:'\w+)?|\d+\.\w+|[.,!?;:])", sentence)
    tags = ['O'] * len(words)

    if len(markers) != 0:
        for marker in markers:
            mount_names = str(sentence[marker[0]:marker[1]]).split()
            if len(mount_names) == 1:
                tags[words.index(mount_names[0])] = 'B-MOUNTAIN'
            else:
                for i in range(len(mount_names)):
                    if i == 0:
                        tags[words.index(mount_names[i])] = 'B-MOUNTAIN'
                    else:
                        tags[words.index(mount_names[i])] = 'I-MOUNTAIN'
            print(words)
            print(tags)
    return words, tags

In [4]:
tokens_list = []
ner_tags_list = []

###### Iterate through each row in the DataFrame and process each sentence to extract words and tags

In [5]:
for index, row in df.iterrows():
    words, tags = tag_sentence_corrected(row['text'], row['marker'])
    tokens_list.append(words)
    ner_tags_list.append(tags)

A visit to a science museum for hands-on learning.
Voice surface coach set democratic time year. Evidence movie successful Congress this various into. Name around hair especially.
Parent according maybe activity activity finish. Within skill raise surface discuss. Series kitchen Congress rather he.
A visit to a sculpture garden with intriguing artworks.
The Julian Alps in Slovenia offer pristine lakes and picturesque landscapes.
['The', 'Julian', 'Alps', 'in', 'Slovenia', 'offer', 'pristine', 'lakes', 'and', 'picturesque', 'landscapes', '.']
['O', 'O', 'B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
The referee blows the whistle, signaling the end of the first half. The score is tied at 1-1.
Again eat owner drop. Stay recognize none size effort agency.
Set large fight speech local. Individual if nation buy movie road crime.
Important nearly themselves particular sort close case. Positive success total fund.
Wonder behind everybody dream. Owner much analysis common various al

###### Create a new DataFrame to store the processed tokens and NER tags and save the new DataFrame to a CSV file

In [21]:
new_df = pd.DataFrame({'tokens': tokens_list, 'ner_tags': ner_tags_list})
new_df.to_csv('datas/processed_mountain_dataset.csv', index=False)