We'll get our data by scraping for sentences with mountain names

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import random

We'll get the mountain list from Wikipedia, as it is simple and reliable.

In [2]:
def scrape_mountain_list():
    """Scrape mountain names from Wikipedia's "List of mountains by elevation" page.

    Every table carrying the ``wikitable`` class is scanned. The first row of
    each table is treated as the header and skipped; for each remaining row
    that has at least three ``<td>`` cells, the stripped text of the first
    cell is taken as the mountain name.

    Returns:
        list[str]: Mountain names in page order. An empty list is returned
        (after printing a message) when the request fails, the response
        status is not 200, or no ``wikitable`` table is found.
    """
    url = "https://en.wikipedia.org/wiki/List_of_mountains_by_elevation"

    try:
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # The page splits the list over several tables, so collect all of them
    tables = soup.find_all('table', {'class': 'wikitable'})

    if not tables:
        print("Could not find the mountain list table")
        return []

    mountains = []

    for table in tables:
        rows = table.find_all('tr')[1:]  # Skip the header row
        for row in rows:
            cols = row.find_all('td')
            # Rows with fewer than three data cells are not mountain entries
            if len(cols) >= 3:
                mountain = cols[0].text.strip()
                # A blank first cell would only add an empty, unusable name
                if mountain:
                    mountains.append(mountain)

    return mountains

def save_list_to_csv(data_list, filename):
    """Write each item of ``data_list`` as its own single-column CSV row.

    Args:
        data_list: Iterable of values; each one becomes one row.
        filename: Path of the CSV file to create or overwrite (UTF-8).
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        for entry in data_list:
            # Wrap the entry so a string is written as one cell, not split into characters
            csv_out.writerow([entry])


mountain_list = scrape_mountain_list()

# Report how many names came back and preview a few of them, so an empty or
# truncated scrape is noticed here rather than after the slow sentence scraping.
print(f"Scraped {len(mountain_list)} mountains")
print("First few mountains:")
for preview_name in mountain_list[:5]:
    print(preview_name)

Scraped 1620 mountains
First few mountains:
Mount Everest
K2
Kangchenjunga
Lhotse
Makalu
Mountain list saved to mountains.csv


In [3]:
# Print the complete scraped list so the names can be checked by eye
print(mountain_list)

['Mount Everest', 'K2', 'Kangchenjunga', 'Lhotse', 'Makalu', 'Cho Oyu', 'Dhaulagiri', 'Manaslu', 'Nanga Parbat', 'Annapurna', 'Gasherbrum I (Hidden peak; K5)', 'Broad Peak', 'Gasherbrum II (K4)', 'Shishapangma', 'Gasherbrum III', 'Gyachung Kang', 'Annapurna II', 'Gasherbrum IV (K3)', 'Himalchuli', 'Distaghil Sar', 'Ngadi Chuli', 'Nuptse', 'Khunyang Chhish', 'Masherbrum (K1)', 'Nanda Devi', 'Chomo Lonzo', 'Batura Sar', 'Kanjut Sar', 'Rakaposhi', 'Namcha Barwa', 'Batura II', 'Kamet', 'Saltoro Kangri', 'Batura III', 'Jannu', 'Tirich Mir', 'Molamenqing', 'Gurla Mandhata', 'Saser Kangri', 'Chogolisa', 'Kongur Tagh', 'Shispare', 'Silberzacken', 'Changtse', 'Trivor', 'Gangkhar Puensum', 'Gongga Shan', 'Annapurna III', 'Kula Kangri', 'Skyang Kangri', 'Liankang Kangri', 'Yukshin Gardan Sar', 'Annapurna IV', 'Saser Kangri II', 'Mamostong Kangri', 'Muztagh Ata', 'Ismoil Somoni Peak', 'Saser Kangri III', 'Noshaq', 'Pumari Chhish', 'Passu Sar', 'Jongsong Peak', 'Malubiting', 'Gangapurna', 'Muchu Ch

We'll delete K2 and the names tagged "(New York)" or "(British Columbia)", along with any other names that the sentence-lookup service is likely to mistake for something other than a mountain.

In [7]:
# Substrings marking the names we do not want to look up
to_remove = ["K2", "(New York)", "(British Columbia)"]
# Keep a name only when none of the markers occurs anywhere inside it
mountain_list = list(
    filter(
        lambda name: all(marker not in name for marker in to_remove),
        mountain_list,
    )
)
print(mountain_list)
print(len(mountain_list))

['Mount Everest', 'Kangchenjunga', 'Lhotse', 'Makalu', 'Cho Oyu', 'Dhaulagiri', 'Manaslu', 'Nanga Parbat', 'Annapurna', 'Gasherbrum I (Hidden peak; K5)', 'Broad Peak', 'Gasherbrum II (K4)', 'Shishapangma', 'Gasherbrum III', 'Gyachung Kang', 'Annapurna II', 'Gasherbrum IV (K3)', 'Himalchuli', 'Distaghil Sar', 'Ngadi Chuli', 'Nuptse', 'Khunyang Chhish', 'Masherbrum (K1)', 'Nanda Devi', 'Chomo Lonzo', 'Batura Sar', 'Kanjut Sar', 'Rakaposhi', 'Namcha Barwa', 'Batura II', 'Kamet', 'Saltoro Kangri', 'Batura III', 'Jannu', 'Tirich Mir', 'Molamenqing', 'Gurla Mandhata', 'Saser Kangri', 'Chogolisa', 'Kongur Tagh', 'Shispare', 'Silberzacken', 'Changtse', 'Trivor', 'Gangkhar Puensum', 'Gongga Shan', 'Annapurna III', 'Kula Kangri', 'Skyang Kangri', 'Liankang Kangri', 'Yukshin Gardan Sar', 'Annapurna IV', 'Saser Kangri II', 'Mamostong Kangri', 'Muztagh Ata', 'Ismoil Somoni Peak', 'Saser Kangri III', 'Noshaq', 'Pumari Chhish', 'Passu Sar', 'Jongsong Peak', 'Malubiting', 'Gangapurna', 'Muchu Chhish (

Now, using the mountain names, we can scrape for sentences. There are quite a few services that return example sentences for a given word, but I chose Reverso Context because it does not require an account and it is what I use personally.

We'll get up to 3 sentences for each mountain, which gives us a nice list of around 4500 sentences.

In [8]:
# Reverso is a translation site, so it requires a language to translate to; it does not affect the scraping process
def get_reverso_sentences(word, source_lang='english', target_lang='french', num_sentences=3):
    """Fetch example sentences containing ``word`` from Reverso Context.

    Args:
        word: Word or phrase to look up (here, a mountain name).
        source_lang: Language of the sentences to collect.
        target_lang: Translation language required by the site's URL scheme.
        num_sentences: Maximum number of sentences to return. When the page
            offers more, a random sample of this size is returned.

    Returns:
        list[str]: Source-language sentences, at most ``num_sentences`` of
        them. An empty list is returned (after printing a message) when the
        request fails or the response status is not 200.
    """
    from urllib.parse import quote

    # Escape the word so characters such as '/', '?', '#' or '%' in a name
    # stay part of the looked-up phrase instead of changing the URL structure.
    url = f"https://context.reverso.net/translation/{source_lang}-{target_lang}/{quote(word, safe='')}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # A timeout stops one stalled request from blocking the whole scraping loop
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Report and move on, in the same way as a non-200 response
        print(f"Failed to retrieve data for '{word}'. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve data for '{word}'. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Finding all sentence pairs
    sentence_pairs = soup.find_all('div', class_='example')

    # Extracting source language sentences
    sentences = []
    for pair in sentence_pairs:
        source_div = pair.find('div', class_='src ltr')
        # Some example blocks may lack the source-language div; skip those
        # instead of raising AttributeError on None.
        if source_div is None:
            continue
        text = source_div.text.strip()
        if text:
            sentences.append(text)

    # Randomly selecting num_sentences if we have more than that
    if len(sentences) > num_sentences:
        sentences = random.sample(sentences, num_sentences)

    return sentences

Now that we have the mountain names and a way to fetch sentences for them, we can collect the sentences and save both lists for use with our model.

In [None]:
# Gather the example sentences of every mountain into one flat list
all_sentences = []

for mountain in mountain_list:
    # Mountains whose lookup failed contribute an empty list
    all_sentences += get_reverso_sentences(mountain)

# Persist the filtered mountain names and the collected sentences
save_list_to_csv(mountain_list, 'mountains.csv')
save_list_to_csv(all_sentences, 'all_sentences.csv')

Failed to retrieve data for 'Silberzacken'. Status code: 404
Failed to retrieve data for 'Malubiting'. Status code: 404
Failed to retrieve data for 'Kangpenqing'. Status code: 404
Failed to retrieve data for 'Khartaphu'. Status code: 404
Failed to retrieve data for 'Tongshanjiabu'. Status code: 404
Failed to retrieve data for 'Pauhunri'. Status code: 404
Failed to retrieve data for 'Salasungo'. Status code: 404
Failed to retrieve data for 'Panchchuli'. Status code: 404
Failed to retrieve data for 'Parinaquta'. Status code: 404
Failed to retrieve data for 'Palcaraju'. Status code: 404
Failed to retrieve data for 'Mianzimu'. Status code: 404
Failed to retrieve data for 'Tebulosmta'. Status code: 404
Failed to retrieve data for 'Diklosmta'. Status code: 404
