### Working of the code

- Opens the UFV dictionary website using Chrome WebDriver.
- Loads and highlights categories one by one for manual selection.
- Extracts words from the carousel, identifying when it starts repeating.
- Highlights the "Next" button and waits for the user to click it manually, moving through all pages in the category.
- Extracts metadata for each word:
	- Example sentence in Portuguese
	- Example sentence in Libras (Brazilian Sign Language)
	- Video URL showing the sign
	- Hand Gesture Image (if available)
	- Category the word belongs to
- Saves the data in a dictionary and writes to a CSV file after each word.
- Returns to the category page and continues the process for the next category.

In [None]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Human-in-the-loop scraper for the UFV dictionary site: the script highlights
# the element to click, waits for the user to click it and press Enter, then
# visits each word page of the visible carousel page, collects its metadata and
# rewrites the CSV file after every word.

START_URL = "https://sistemas.cead.ufv.br/capes/dicionario/"
OUTPUT_CSV = "UVF_Metadata.csv"
NOT_AVAILABLE = "Not Available"  # Placeholder stored for any missing field
HIGHLIGHT_JS = "arguments[0].style.border='3px solid red'"
CATEGORY_LOCATOR = (By.CSS_SELECTOR, ".categories a")
WORD_LINK_LOCATOR = (By.CSS_SELECTOR, ".carousel-inner .item.active a")
# Inline style this script uses to recognise the category link of a word page.
SELECTED_CATEGORY_STYLE = "background-color: rgb(0, 146, 167); color: rgb(255, 255, 255);"


def _text_or_placeholder(wait, locator):
    """Return the stripped text of the element at ``locator``.

    Falls back to ``NOT_AVAILABLE`` when the element does not show up before
    ``wait`` times out (or another WebDriver error occurs), so that one
    missing field does not discard the whole word.
    """
    try:
        return wait.until(EC.presence_of_element_located(locator)).text.strip()
    except WebDriverException:
        return NOT_AVAILABLE


def _word_category(wait):
    """Return the text of the ``a.result`` link carrying the highlight style.

    Returns ``NOT_AVAILABLE`` when no such link is found.
    """
    try:
        result_links = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.result")))
        for result_link in result_links:
            # get_attribute() returns None when there is no style attribute.
            style_attr = result_link.get_attribute("style") or ""
            if SELECTED_CATEGORY_STYLE in style_attr:
                return result_link.text.strip()
    except WebDriverException:
        pass  # Best effort: fall through to the placeholder
    return NOT_AVAILABLE


def _scrape_word_page(driver, wait):
    """Collect the metadata fields from the word page loaded in ``driver``.

    Returns a dict with the keys ``category``, ``example_pt``,
    ``example_libras``, ``video_url`` and ``hand_gesture``; fields that cannot
    be located are set to ``NOT_AVAILABLE``.
    """
    category_name = _word_category(wait)

    example_pt = _text_or_placeholder(
        wait, (By.XPATH, "//h3[contains(text(),'Exemplo em português')]/following-sibling::h4"))
    example_libras = _text_or_placeholder(
        wait, (By.XPATH, "//h3[contains(text(),'Exemplo em libras')]/following-sibling::h4"))

    video_url = NOT_AVAILABLE
    try:
        video_element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "video")))
        video_url = video_element.get_attribute("src")
    except WebDriverException:
        pass

    # The image is optional, so look it up once without waiting for it.
    hand_gesture_url = NOT_AVAILABLE
    try:
        hand_gesture_element = driver.find_element(By.CSS_SELECTOR, ".pull-right img")
        hand_gesture_url = hand_gesture_element.get_attribute("src")
    except WebDriverException:
        pass

    return {
        "category": category_name,
        "example_pt": example_pt,
        "example_libras": example_libras,
        "video_url": video_url,
        "hand_gesture": hand_gesture_url,
    }


# Open Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)  # Keep browser open
driver = webdriver.Chrome(options=options)
driver.get(START_URL)  # Open dictionary site

# Wait object (max 10 sec for efficiency)
wait = WebDriverWait(driver, 10)

# Only the number of categories is taken here; the elements themselves are
# located again for every category because references obtained before a page
# navigation go stale.
category_elements = wait.until(EC.presence_of_all_elements_located(CATEGORY_LOCATOR))
category_count = len(category_elements)

# Metadata for every word scraped so far, across all categories, so that the
# CSV rewritten after each word keeps the rows of earlier categories.
# Keyed by word: a word seen again replaces its earlier entry.
word_metadata = {}

# Iterate through each category
for category_index in range(category_count):
    if category_index > 0:
        # Go back to the start page explicitly; after visiting many word pages
        # a single history step back would not reach it.
        driver.get(START_URL)

    try:
        category_elements = wait.until(EC.presence_of_all_elements_located(CATEGORY_LOCATOR))
    except WebDriverException as e:
        print("Could not locate the category list:", e)
        break
    if category_index >= len(category_elements):
        break  # Fewer categories than on the first load
    category = category_elements[category_index]

    # Highlight category and ask for manual selection
    driver.execute_script(HIGHLIGHT_JS, category)
    input("Click the highlighted category in the browser, then press Enter to continue...")

    words_list = []  # Words of this category, in the order they were seen

    while True:
        try:
            # Read text and target URL of every link on the current carousel
            # page before navigating anywhere, so nothing has to be looked up
            # again on a different page.
            word_elements = wait.until(EC.presence_of_all_elements_located(WORD_LINK_LOCATOR))
            page_links = []
            for word_element in word_elements:
                link_text = word_element.text.strip()
                if link_text:
                    page_links.append((link_text, word_element.get_attribute("href")))
            current_page_words = [link_text for link_text, _ in page_links]

            # Stop if words start repeating (carousel loop detected)
            if any(word in words_list for word in current_page_words):
                print("Loop detected. Returning to category selection.")
                break  # Exit loop to return to category selection

            words_list.extend(current_page_words)

            print(f"Processing words: {current_page_words}")

            # Extract metadata for each word
            for word, word_url in page_links:
                if not word_url:
                    print(f"Skipping word '{word}' due to error: link has no href")
                    continue
                try:
                    time.sleep(2)  # Short pause before each page load
                    driver.get(word_url)  # Open word page
                    word_metadata[word] = _scrape_word_page(driver, wait)

                    # Save progress to CSV after each word
                    df = pd.DataFrame.from_dict(word_metadata, orient='index')
                    df.to_csv(OUTPUT_CSV, encoding="utf-8", index_label="Word")
                except Exception as e:
                    # One failing word must not stop the rest of the page.
                    print(f"Skipping word '{word}' due to error:", e)

            # Highlight "Next" button and ask the user to manually click it
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, ".right[role='button']")
                driver.execute_script(HIGHLIGHT_JS, next_button)
            except WebDriverException:
                print("No more words in this category. Returning to category selection.")
                break  # Exit loop to return to category selection
            input("Click the highlighted 'Next' button in the browser, then press Enter to continue...")

        except Exception as e:
            print("Error or no more words:", e)
            break  # Stop if any issue occurs

# Close browser after all categories are processed
driver.quit()
