In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import openai
from openai import OpenAI
import random
from dotenv import load_dotenv
import os

In [None]:
# Read the OpenAI credential from the environment (after giving python-dotenv a
# chance to populate it) and fail fast when it is missing, so the problem
# surfaces here rather than in the middle of the generation loop.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file"
    )

In [None]:
#Create a list of mountain names
from openai import OpenAI
def split_mountain_name(raw_name):
    """Split one table-cell label into the individual mountain names it holds.

    Recognised label shapes:

    * ``"Primary (Alt1 / Alt2)"`` or ``"Primary (Alt1; Alt2)"`` -> the primary
      name followed by every alternative inside the first parenthetical.
    * ``"A / B / C"`` -> every slash-separated alternative.
    * ``"Name"`` -> the name itself.

    Bracketed markers such as ``"[1]"`` are removed first, every returned name
    is stripped of surrounding whitespace, and empty fragments are dropped.

    Args:
        raw_name: Text of the name cell, possibly padded with whitespace.

    Returns:
        list[str]: The names found, in order of appearance (may be empty).
    """
    # Drop bracketed footnote-style markers before looking at punctuation.
    text = re.sub(r'\[[^\]]*\]', '', raw_name).strip()

    primary, open_paren, remainder = text.partition('(')
    if open_paren:
        # Only the text up to the matching ')' is the alias list; anything
        # after it is ignored instead of being glued onto the last alias.
        inner = remainder.partition(')')[0]
        candidates = [primary] + re.split(r'[/;]', inner)
    else:
        candidates = primary.split('/')

    return [name.strip() for name in candidates if name.strip()]


def scrape_mountain_names():
    """Collect mountain names from Wikipedia's "List of mountains by elevation".

    Every ``wikitable`` on the page is scanned; the header row is skipped and
    the first ``<td>`` of each remaining row is parsed with
    ``split_mountain_name``. Rows with fewer than two ``<td>`` cells are
    ignored.

    Returns:
        list[str]: Mountain names and their alternative names, in page order.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    url = "https://en.wikipedia.org/wiki/List_of_mountains_by_elevation"
    # A timeout keeps the notebook from hanging forever; raise_for_status()
    # stops an HTTP error page from being parsed into an empty result.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    mountains = []
    for table in soup.find_all('table', {'class': 'wikitable'}):
        for row in table.find_all('tr')[1:]:  # Skip header row
            cells = row.find_all('td')
            if len(cells) > 1:
                mountains.extend(split_mountain_name(cells[0].text))

    return mountains

In [None]:
# Scrape once and keep the result for the cells below.
mountain_names = scrape_mountain_names()
# Stop here if nothing was scraped: the next cell would otherwise loop over an
# empty list and the notebook would end up writing an empty dataset.
assert mountain_names, "No mountain names were scraped; check the page request and table parsing"
mountain_names[:10]

In [None]:
# Generate one sentence per mountain name with the OpenAI chat completions API.
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY']
)

# Each entry is a (sentence, mountain) pair; the next cell turns this list
# into a DataFrame with those two columns.
sentences = []
for mountain in mountain_names:
    # The two prompt sentences are separated by a space so the name is not
    # glued to the preceding period.
    prompt = (
        f"Generate a sentence about {mountain} without mentioning that it's a mountain. "
        f"{mountain} can be in any part of the sentence."
    )

    # NOTE(review): max_tokens caps the completion length, so a long sentence
    # may come back cut off — confirm 30 is enough for the intended output.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=30
    )

    sentence = response.choices[0].message.content
    # The SDK types `content` as optional; skip missing/blank completions
    # instead of storing a None or whitespace-only sentence.
    if not sentence or not sentence.strip():
        continue
    sentences.append((sentence.strip(), mountain))
sentences[:5]

In [None]:
# Tabulate the (sentence, mountain) pairs: one row per generated sentence.
column_names = ['sentence', 'mountain']
df = pd.DataFrame(data=sentences, columns=column_names)
df.head(5)

In [None]:
# Write the dataset to disk, creating the target directory first because
# to_csv does not create missing parent directories.
output_path = r'./data/mountain_sentences.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Dataset created and saved to '{output_path}'")