In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.51.2-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.51.2-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.7/383.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)
[2K   [90m━━

In [None]:
import asyncio
import os
import random
import re
import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from openai import OpenAI

In [None]:
# Read the API key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook. The old placeholder is kept as the fallback value
# so the cell still runs (and fails at request time) when the variable is unset.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")
client = OpenAI(api_key=OPENAI_KEY)

In [None]:
def get_response_sync(prompt, model="gpt-4o-mini"):
    """
    Sends one synchronous chat-completion request and returns the reply text.

    Args:
        prompt (str): The input prompt, sent as a single "user" message.
        model (str): Name of the chat model to query. Defaults to "gpt-4o-mini",
            the model this function used before the parameter existed.

    Returns:
        str: The content of the first choice's message, or a string of the form
        "Error: <exception text>" if the request raised.
        NOTE(review): the client may presumably return ``None`` as content for
        some replies; confirm before relying on the result always being a str.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: one failed request must not abort a whole
        # batch, so the failure is reported in-band as an "Error: ..." string.
        return f"Error: {e}"

In [None]:
async def generate_multiple_responses(prompts, max_requests_per_minute):
    """
    Asynchronously generates responses for a list of prompts while honouring a
    requests-per-minute budget.

    Each prompt is handled by `get_response_sync` in a worker thread. Two limits
    apply at the same time:

    * at most ``min(max_requests_per_minute, 10)`` requests are in flight, and
    * consecutive requests are started at least ``60 / max_requests_per_minute``
      seconds apart, so N prompts take at least
      ``(N - 1) * 60 / max_requests_per_minute`` seconds.

    Args:
        prompts (list): A list of input prompts to send to the model.
        max_requests_per_minute (int): The maximum number of requests allowed per minute.

    Returns:
        list: A list of responses, in the same order as `prompts`.

    Raises:
        ValueError: If `max_requests_per_minute` is not positive.
    """
    if max_requests_per_minute <= 0:
        raise ValueError("max_requests_per_minute must be a positive number")

    # Inside a coroutine the running loop is the one that must schedule the work.
    loop = asyncio.get_running_loop()
    max_concurrent_tasks = min(max_requests_per_minute, 10)  # Cap concurrency at 10
    semaphore = asyncio.Semaphore(max_concurrent_tasks)
    executor = ThreadPoolExecutor(max_workers=max_concurrent_tasks)

    # Pacing state: the earliest loop time at which the next request may start.
    min_interval = 60.0 / max_requests_per_minute
    pacing_lock = asyncio.Lock()
    next_start = loop.time()

    async def sem_task(prompt):
        """Waits for a concurrency slot and a pacing slot, then runs one request."""
        nonlocal next_start
        async with semaphore:
            # Pace *after* taking the semaphore so that requests delayed by slow
            # predecessors cannot later be released in a burst above the budget.
            async with pacing_lock:
                delay = next_start - loop.time()
                if delay > 0:
                    await asyncio.sleep(delay)
                next_start = loop.time() + min_interval
            # get_response_sync blocks, so it runs in the thread pool.
            return await loop.run_in_executor(executor, get_response_sync, prompt)

    tasks = [asyncio.ensure_future(sem_task(prompt)) for prompt in prompts]
    try:
        # gather() preserves the order of `tasks`, hence of `prompts`.
        return await asyncio.gather(*tasks)
    finally:
        # Release the worker threads; wait=False avoids blocking the event loop.
        executor.shutdown(wait=False)

In [None]:
# Seed list of mountain names; the generation cell samples from it with
# random.choices (i.e. with replacement) to fill the "{mountain}" slot of each prompt.
# NOTE(review): "Mount Olympus" and "Mount Olympus (Greece)" are both listed, and
# "Mount Denali" / "Mount McKinley" presumably refer to the same peak, so these
# mountains are sampled more often than the rest — confirm this is intended.
mountain_names = [
    "Mount Kilimanjaro", "K2", "Mount Denali", "Mount Fuji", "Mont Blanc",
    "Mount Elbrus", "Mount Rainier", "Mount Aconcagua", "Matterhorn",
    "Mount Kosciuszko", "Mount Cook", "Mount Olympus", "Mount Whitney",
    "Mount Vinson", "Annapurna", "Mount St. Helens", "Table Mountain",
    "Mount McKinley", "Mount Kenya", "Mount Kinabalu", "Ben Nevis",
    "Mount Ararat", "Mount Sinai", "Mount Shasta", "Mount Hood",
    "Mount Mitchell", "Mount Rushmore", "Mount Etna", "Mount Vesuvius",
    "Mount Gede", "Mount Rinjani", "Mount Ngauruhoe", "Mount Toubkal",
    "Pico de Orizaba", "Mount Robson", "Mount Damavand", "Mount Olympus (Greece)"
]

In [None]:
# Wall-clock start of the generation cell; the print at the end of this cell
# subtracts it from the current time to report the elapsed seconds.
start_time = time.time()

def create_custom_prompt(mountain):
    """
    Builds the instruction prompt asking the model for one NER training example.

    The prompt asks for a sentence containing `mountain`, its tokenization, POS
    tags and BIO-style NER tags, and ends with a worked example in the exact
    "Sentence / Word / POS / Tag" layout the response is expected to follow.

    Args:
        mountain (str): Mountain name to embed in the requested sentence.

    Returns:
        str: The full prompt text.
    """
    # The example's Tag line must itself be a valid BIO sequence: an I-MOUNTAIN
    # directly after an O teaches the model to emit orphan I- tags, so the
    # parenthesised "( Greece )" tokens are tagged O here.
    prompt = f"""
        Instruction:

        Please generate a new example for a Named Entity Recognition (NER) dataset focused on mountain names. Follow these steps:

        1. **Create a Sentence**: Write a natural English sentence that includes the mountain name "{mountain}".

        2. **Tokenization**: Split the sentence into individual words or tokens. **Ensure that punctuation marks (e.g., periods, commas, parentheses) are treated as separate tokens and are not attached to words.**

        3. **Part-of-Speech (POS) Tagging**: Assign the correct POS tag to each word in the sentence, including punctuation marks. **Use appropriate POS tags for punctuation (e.g., use `.` for periods, `,` for commas, `-LRB-` for '(', `-RRB-` for ')').**

        4. **NER Tagging**: Assign NER tags to each word using the following scheme:
          - Use `B-MOUNTAIN` for the first word of the mountain name.
          - Use `I-MOUNTAIN` for subsequent words in a multi-word mountain name.
          - Use `O` for tokens that are not part of a named entity.

        5. **Formatting**: Present the information in the following structure using plain text without any special formatting:

        Sentence: [Your sentence here]
        Word: [List of words]
        POS: [List of POS tags]
        Tag: [List of NER tags]

        **Ensure that:**
        - **The `Word`, `POS`, and `Tag` lists are enclosed in square brackets `[]`.**
        - **Each word and its corresponding POS tag and NER tag are properly aligned in their respective lists.**
        - **Punctuation marks are included as separate tokens with appropriate POS tags and NER tags.**

        **Example Output:**

        Sentence: I planned to hike up to Mount Olympus (Greece) this summer.
        Word: [I, planned, to, hike, up, to, Mount, Olympus, (, Greece, ), this, summer, .]
        POS: [PRP, VBD, TO, VB, RP, TO, NNP, NNP, -LRB-, NNP, -RRB-, DT, NN, .]
        Tag: [O, O, O, O, O, O, B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O]
    """
    return prompt

# Run configuration for the generation step.
num_responses = 2000  # Number of prompts to generate
max_requests_per_minute = 60  # Passed to generate_multiple_responses as its request limit
# Alternative limit kept from an earlier run:
# max_requests_per_minute = 400

# Build one prompt per sampled mountain. random.choices samples WITH replacement
# and no seed is set, so every run draws a different mix of mountains.
prompts = [create_custom_prompt(mountain) for mountain in random.choices(mountain_names, k=num_responses)]

# Top-level `await` only works where an event loop is already running
# (e.g. in an IPython/Jupyter kernel); the result is one response per prompt.
responses = await generate_multiple_responses(prompts, max_requests_per_minute)

# Elapsed wall-clock time since start_time was taken at the top of this cell.
print("--- %s seconds ---" % (time.time() - start_time))

--- 630.0263381004333 seconds ---


In [None]:
# Preview only the first few raw responses; rendering the whole list
# (one long string per prompt) floods the notebook output.
responses[:5]

['Sentence: The breathtaking view from the summit of Mount Robson is unforgettable.  \nWord: [The, breathtaking, view, from, the, summit, of, Mount, Robson, is, unforgettable, .]  \nPOS: [DT, JJ, NN, IN, DT, NN, IN, NNP, NNP, VBZ, JJ, .]  \nTag: [O, O, O, O, O, O, O, B-MOUNTAIN, I-MOUNTAIN, O, O, O]',
 'Sentence: Many tourists visit Table Mountain every year for its breathtaking views.  \nWord: [Many, tourists, visit, Table, Mountain, every, year, for, its, breathtaking, views, .]  \nPOS: [JJ, NNS, VB, NNP, NNP, DT, NN, IN, PRP$, JJ, NNS, .]  \nTag: [O, O, O, B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O, O]  ',
 'Sentence: Many climbers dream of reaching the summit of Pico de Orizaba in Mexico.  \nWord: [Many, climbers, dream, of, reaching, the, summit, of, Pico, de, Orizaba, in, Mexico, .]  \nPOS: [JJ, NNS, VBP, IN, VBG, DT, NN, IN, NNP, IN, NNP, IN, NNP, .]  \nTag: [O, O, O, O, O, O, O, O, B-MOUNTAIN, I-MOUNTAIN, I-MOUNTAIN, O, I-MOUNTAIN, O]  ',
 'Sentence: Climbing Ben Nevis is a chall

In [None]:
def parse_list(s):
    """
    Parses a string representation of a list into a Python list.

    Items are separated by commas and stripped of surrounding whitespace. A comma
    that is itself a list item (the prompt asks for ``,`` as a token and as its
    POS tag, e.g. "[Fuji, ,, which]") is recognised and returned as "," instead
    of being split into two empty strings.

    Args:
        s (str): The string representation of a list (e.g., "[item1, item2, item3]").
            The first and last non-whitespace characters are assumed to be the
            brackets and are dropped without being checked.

    Returns:
        list: A list of strings where each item is an element from the original
        string; an empty list for "[]".
    """
    inner = s.strip()[1:-1]
    if not inner.strip():
        return []  # "[]" holds no items (previously parsed as [''])

    pieces = [piece.strip() for piece in inner.split(',')]

    items = []
    index = 0
    while index < len(pieces):
        # A literal comma item splits into two adjacent empty pieces (the text
        # before and after the comma character); fold each such pair back into ','.
        if pieces[index] == '' and index + 1 < len(pieces) and pieces[index + 1] == '':
            items.append(',')
            index += 2
        else:
            items.append(pieces[index])
            index += 1
    return items

In [None]:
def parse_response(response):
    """
    Splits one model response into its sentence, word list, POS list and NER list.

    All whitespace runs (including newlines) are collapsed to single spaces first,
    then each field is located by its "Sentence:", "Word:", "POS:" and "Tag:" label.

    Args:
        response (str): Raw response text.

    Returns:
        tuple: (sentence, words, pos_tags, ner_tags). A field whose label pattern
        is not found comes back as '' (sentence) or [] (lists).
    """
    flattened = re.sub(r'\s+', ' ', response.strip())

    def _bracketed(pattern):
        """Returns the parsed bracketed list captured by `pattern`, or [] if absent."""
        found = re.search(pattern, flattened)
        if found is None:
            return []
        return parse_list(found.group(1))

    # The sentence is the free text between the "Sentence:" and "Word:" labels.
    sentence_found = re.search(r'Sentence:\s*(.*?)\s*Word:', flattened)
    if sentence_found is None:
        sentence = ''
    else:
        sentence = sentence_found.group(1).strip()

    # Each list is anchored on the label that follows it; the Tag list is last.
    words = _bracketed(r'Word:\s*(\[.*?\])\s*POS:')
    pos_tags = _bracketed(r'POS:\s*(\[.*?\])\s*Tag:')
    ner_tags = _bracketed(r'Tag:\s*(\[.*?\])')

    return sentence, words, pos_tags, ner_tags

In [None]:
data_rows = []  # One dict per token; each becomes one row of the DataFrame
allowed_tags = {'B-MOUNTAIN', 'I-MOUNTAIN', 'O'}  # Allowed NER tags

# Process each response; sentence_id is the 1-based position of the response.
for sentence_id, response in enumerate(responses, start=1):
    if not isinstance(response, str):
        # A non-text response cannot be parsed; skip it instead of crashing.
        print(f"Warning: Response {sentence_id} is not text")
        continue

    sentence, words, pos_tags, ner_tags = parse_response(response)

    if not (len(words) == len(pos_tags) == len(ner_tags)):
        print(f"Warning: Mismatched lengths in sentence {sentence_id}")
        continue

    if not words:
        # Three empty lists also have "equal lengths": this is what an
        # "Error: ..." string or an off-format reply parses to. Report it
        # rather than dropping the response silently.
        print(f"Warning: No tokens parsed from response {sentence_id}")
        continue

    for word, pos_tag, ner_tag in zip(words, pos_tags, ner_tags):
        data_rows.append({
            'Sentence': sentence,  # Original sentence
            'Word': word,          # Each word in the sentence
            'POS': pos_tag,        # POS tag for the word
            # Any tag outside the allowed scheme is replaced with 'O'
            'Tag': ner_tag if ner_tag in allowed_tags else 'O',
        })

# Create a DataFrame from the data_rows list. Naming the columns keeps them
# present even when no response could be parsed.
df = pd.DataFrame(data_rows, columns=['Sentence', 'Word', 'POS', 'Tag'])



In [None]:
# Entity type without the positional prefix: "B-PER" -> "PER"; a tag with no "-" is kept whole
df["TagGeneral"] = df["Tag"].map(lambda tag: tag.rpartition("-")[2])
# Positional prefix of the tag: "B-PER" -> "B"; a tag with no "-" is kept whole
df["TagPos"] = df["Tag"].map(lambda tag: tag.partition("-")[0])

# Replace missing words with the literal string "None"
df["Word"] = df["Word"].fillna("None")

In [None]:
# Show the token-level frame (one row per token, with its sentence, POS and tag columns).
df

Unnamed: 0,Sentence,Word,POS,Tag,TagGeneral,TagPos
0,The breathtaking view from the summit of Mount...,The,DT,O,O,O
1,The breathtaking view from the summit of Mount...,breathtaking,JJ,O,O,O
2,The breathtaking view from the summit of Mount...,view,NN,O,O,O
3,The breathtaking view from the summit of Mount...,from,IN,O,O,O
4,The breathtaking view from the summit of Mount...,the,DT,O,O,O
...,...,...,...,...,...,...
20729,The view from Pico de Orizaba is absolutely br...,absolutely,RB,O,O,O
20730,The view from Pico de Orizaba is absolutely br...,breathtaking,JJ,O,O,O
20731,The view from Pico de Orizaba is absolutely br...,at,IN,O,O,O
20732,The view from Pico de Orizaba is absolutely br...,sunrise,NN,O,O,O


In [None]:
# Fixed label order so every run assigns the same ids. Deriving the order from
# df["Tag"].unique() makes the ids depend on which tag happens to appear first
# in the generated data.
CANONICAL_TAG_ORDER = ["O", "B-MOUNTAIN", "I-MOUNTAIN"]

# Any tag outside the canonical scheme is appended in sorted order, so the
# mapping stays total and deterministic.
unexpected_tags = sorted(set(df["Tag"].unique()) - set(CANONICAL_TAG_ORDER))

# Tag vocabulary, kept as an object ndarray like the result of Series.unique()
tag_list = pd.Series(CANONICAL_TAG_ORDER + unexpected_tags, dtype=object).to_numpy()

# Create a Mapping Between Tags and IDs
tags2ids = {tag: i for i, tag in enumerate(tag_list)}

# Map Tags to Their Corresponding IDs
df["TagId"] = df["Tag"].map(tags2ids)

In [None]:
# Show the tag vocabulary built in the previous cell.
tag_list

array(['O', 'B-MOUNTAIN', 'I-MOUNTAIN'], dtype=object)

In [None]:
# Show the tag -> integer id mapping built from tag_list.
tags2ids

{'O': 0, 'B-MOUNTAIN': 1, 'I-MOUNTAIN': 2}

In [None]:
# The same sentence can be generated more than once. Grouping on the sentence
# text alone would concatenate the tokens of every copy into one over-long
# record, so only the first occurrence of each sentence is kept.
# Rows of one response are contiguous in df, so a change in "Sentence" marks
# the start of a new response. (Two identical sentences that are directly
# adjacent cannot be told apart this way and are still merged.)
sentence_run = df["Sentence"].ne(df["Sentence"].shift()).cumsum()
first_run = sentence_run.groupby(df["Sentence"]).transform("min")
df_first_occurrence = df[sentence_run == first_run]

# Create a dataframe with columns where each value is a list of "Word", "POS", and "Tag" corresponding to each sentence.
df_1 = df_first_occurrence.groupby("Sentence").agg(list).reset_index().drop("Sentence", axis=1)
print(f"Number of records: {len(df_1)}")
df_1.head(5)

Number of records: 1454


Unnamed: 0,Word,POS,Tag,TagGeneral,TagPos,TagId
0,"[A, group, of, climbers, is, preparing, to, as...","[DT, NN, IN, NNS, VBZ, VBG, TO, VB, NNP, NNP, ...","[O, O, O, O, O, O, O, O, B-MOUNTAIN, I-MOUNTAI...","[O, O, O, O, O, O, O, O, MOUNTAIN, MOUNTAIN, O...","[O, O, O, O, O, O, O, O, B, I, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0]"
1,"[Adventurous, climbers, often, target, Mount, ...","[JJ, NNS, RB, VB, NNP, NNP, IN, PRP$, JJ, NNS, .]","[O, O, O, O, B-MOUNTAIN, I-MOUNTAIN, O, O, O, ...","[O, O, O, O, MOUNTAIN, MOUNTAIN, O, O, O, O, O]","[O, O, O, O, B, I, O, O, O, O, O]","[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0]"
2,"[Ascending, Mount, Damavand, is, a, rewarding,...","[VBG, NNP, NNP, VBZ, DT, JJ, NN, IN, NNS, .]","[O, B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O, O]","[O, MOUNTAIN, MOUNTAIN, O, O, O, O, O, O, O]","[O, B, I, O, O, O, O, O, O, O]","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0]"
3,"[Ben, Nevis, is, the, highest, mountain, in, t...","[NNP, NNP, VBZ, DT, JJS, NN, IN, DT, NNP, NNP, .]","[B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O, O, ...","[MOUNTAIN, MOUNTAIN, O, O, O, O, O, O, O, O, O]","[B, I, O, O, O, O, O, O, O, O, O]","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[Climbers, are, preparing, to, summit, Mount, ...","[NNS, VBP, VBG, TO, VB, NNP, NNP, IN, NNP, JJ,...","[O, O, O, O, O, B-MOUNTAIN, I-MOUNTAIN, O, O, ...","[O, O, O, O, O, MOUNTAIN, MOUNTAIN, O, O, O, O...","[O, O, O, O, O, B, I, O, O, O, O, O]","[0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0]"


In [None]:
# ANSI escape sequences ('\33' is the octal escape for the ESC character) that
# switch the terminal background colour for the text that follows.
MOUNTAIN_TAG_COLOR = '\33[104m'  # SGR 104: bright blue background
O_TAG_COLOR  = '\33[100m'  # SGR 100: bright black (gray) background

# Keys are the prefix-free tag values stored in the "TagGeneral" column.
color_mapping = {'O': O_TAG_COLOR, 'MOUNTAIN': MOUNTAIN_TAG_COLOR}

In [None]:
def display_formatted_text(words, tags):
    """
    Prints the words on one line, each prefixed with the background colour of its tag.

    Args:
        words (list): A list of words to display.
        tags (list): A list of tags corresponding to the words. Each tag is looked
                     up in `color_mapping` to pick the colour for its word.

    Returns:
        None

    Raises:
        KeyError: If a tag is not a key of `color_mapping`.
    """
    # SGR 0 resets all attributes. Ending with a background colour code instead
    # would leave that colour switched on for whatever is printed next.
    ansi_reset = '\33[0m'
    colored_words = [color_mapping[tag] + word for word, tag in zip(words, tags)]
    print(" ".join(colored_words) + ansi_reset)


# Show two sentence-level examples. `df_1` is used because it holds one row per
# sentence with list-valued "Word"/"TagGeneral" cells; `df` holds a single token
# per row, so its cells are plain strings that would be zipped character by character.

# Words and tags of the row labelled 0
text = df_1.loc[0, "Word"]
tags = df_1.loc[0, "TagGeneral"]
display_formatted_text(text, tags)

# Words and tags of the row labelled 42
text = df_1.loc[42, "Word"]
tags = df_1.loc[42, "TagGeneral"]
display_formatted_text(text, tags)

[100mA [100mgroup [100mof [100mclimbers [100mis [100mpreparing [100mto [100mascend [104mMount [104mNgauruhoe [100mnext [100mweek [100m.[100m
[100mClimbing [104mMount [104mEtna [100mis [100ma [100mbreathtaking [100mexperience [100mthat [100mattracts [100mmany [100madventurers [100m.[100m


In [None]:
# Save the sentence-level frame; index=False leaves the row index out of the file.
# NOTE(review): the columns hold Python lists, which CSV presumably stores as
# their string form — confirm that whoever reads the file parses them back.
df_1.to_csv('MountainsDataset.csv', index=False)