In [36]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

import re
import pandas as pd
import time 
from typing import *
from tqdm import tqdm
import os
import datetime
import tkinter as tk
from tkinter import ttk, messagebox
import threading

# Merriam-Webster scraper

All words are scraped from https://www.merriam-webster.com

In [38]:
class scraper:
    """Collect fixed-length words from the Merriam-Webster word finder.

    For every letter in ``LETTERS`` the scraper requests the paginated
    "contains <letter>" listing for words of ``target_letter_len`` letters and
    gathers the words shown on every page, the first page included.

    Constructing an instance launches Chrome and immediately runs the whole
    scrape (see ``start``). The browser is quit when the scrape ends.

    Args:
        wait: Seconds to sleep before each request for page 2 and later.
        target_letter_len: Word length placed in the word-finder URL.
        criteria: Word-finder list to query, ``"all"`` or ``"common"``.
        savepath: CSV path the words are written to when the scrape finishes;
            an empty string disables saving.

    Attributes:
        scrraped_words: Set of collected words (original spelling kept so
            existing callers keep working; ``scraped_words`` is an alias).
        driver: The live Selenium driver, or ``None`` once it has been closed.
    """

    BASE_URL = "https://www.merriam-webster.com/wordfinder/classic/contains"
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    # Only these letters are queried, so a word containing none of them
    # (for example one whose only vowel sound is "y") is never requested.
    LETTERS = ('a', 'e', 'i', 'o', 'u')
    PAGINATION_XPATH = './/div[@class="wordfinder-pagination mt-5"]/div/div/ul/li'
    WORD_PANEL_XPATH = './/div[@class="panel-body"]'
    WORD_ITEM_XPATH = './/li[@class="pb-4 d-flex"]'

    def __init__(self, wait: float = 2.0, target_letter_len: int = 5, criteria: Literal["all", "common"] = "all", savepath: str= ""):
        self.scrraped_words = set()
        self.wait = wait
        self.target_letter_len = target_letter_len
        self.criteria = criteria
        self.savepath = savepath

        self.driver = self._build_driver()
        self.start()

    @property
    def scraped_words(self) -> Set[str]:
        """Correctly spelled, read-only alias for ``scrraped_words``."""
        return self.scrraped_words

    def _build_driver(self):
        """Create a Chrome driver that sends the class's custom user agent."""
        options = Options()
        options.add_argument(f'user-agent={self.USER_AGENT}')
        return webdriver.Chrome(options=options)

    def close(self) -> None:
        """Quit the browser if one is open; safe to call more than once."""
        driver = getattr(self, "driver", None)
        self.driver = None
        if driver is not None:
            driver.quit()

    def get_words_one_page(self) -> List[str]:
        """Return the non-empty, stripped words listed on the current page.

        Raises whatever the driver raises when the word panel is not present
        on the page.
        """
        panel = self.driver.find_element(By.XPATH, self.WORD_PANEL_XPATH)
        items = panel.find_elements(By.XPATH, self.WORD_ITEM_XPATH)
        stripped = (item.text.strip() for item in items)
        # Blank list items would otherwise end up as empty "words" in the CSV.
        return [word for word in stripped if word]

    def _page_count(self) -> int:
        """Return the highest page number shown in the pagination bar.

        A page without a pagination bar (or without any numeric entry in it)
        is treated as a single-page result instead of raising.
        """
        items = self.driver.find_elements(By.XPATH, self.PAGINATION_XPATH)
        labels = (item.text.strip() for item in items)
        numbers = [int(label) for label in labels if label.isdigit()]
        return max(numbers, default=1)

    def _scrape_letter(self, url: str, letter: str) -> None:
        """Collect the words from every result page for one letter."""
        self.driver.get(f'{url}/{letter}/1')
        # Page 1 must be harvested too; it is also the only page for short lists.
        self.scrraped_words.update(self.get_words_one_page())
        last_page = self._page_count()

        for page in range(2, last_page + 1):
            time.sleep(self.wait)
            self.driver.get(f'{url}/{letter}/{page}')
            self.scrraped_words.update(self.get_words_one_page())

    def start(self):
        """Run the scrape for every letter, then save if ``savepath`` is set.

        A failure while scraping one letter is reported and the remaining
        letters are still attempted. ``KeyboardInterrupt`` is not swallowed.
        The browser is always quit on exit; calling ``start`` again launches a
        fresh one.
        """
        if getattr(self, "driver", None) is None:
            self.driver = self._build_driver()

        url = f"{self.BASE_URL}/{self.criteria}/{self.target_letter_len}"
        try:
            for letter in self.LETTERS:
                try:
                    self._scrape_letter(url, letter)
                except Exception as exc:
                    # Best effort: keep what was collected and move on.
                    print(f"Skipping remaining pages for '{letter}': {exc!r}")
        finally:
            self.close()

        if self.savepath:
            self.save_words(self.savepath)

    def save_words(self, savepath: str):
        """Write the collected words, sorted, to ``savepath`` as a one-column CSV."""
        df = pd.DataFrame(sorted(self.scrraped_words), columns=['word'])
        df.to_csv(savepath, index=False)

In [None]:
# Constructing the scraper runs the whole scrape: five-letter words,
# wait=0.5 between paginated requests, result written to scraped_5.csv.
run = scraper(
    wait=0.5,
    target_letter_len=5,
    savepath="scraped_5.csv",
)