In [None]:
# ActionChains Documentation: 
# https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.action_chains.html#selenium.webdriver.common.action_chains.ActionChains.release

# Beautiful Soup Docs:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd

# Set up Selenium
options = webdriver.ChromeOptions()
# The `headless` property was deprecated in Selenium 4.8 and later removed, after
# which assigning to it no longer has any effect. Passing the Chrome flag
# directly is the supported way to run without a GUI.
options.add_argument("--headless=new")
# NOTE(review): headless Chrome starts with a small default viewport; a desktop
# size is requested on the assumption that the filter sidebar needs it - confirm.
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    # Visit Vivino's page
    url = "https://www.vivino.com/explore"
    driver.get(url)

    # Explicit wait shared by every step below (up to 10 s per condition)
    wait = WebDriverWait(driver, 10)

    # Step 1: click the dropdown menu
    dropdown = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "simpleLabel-module__selectedKey--3ngzL")))
    dropdown.click()
    time.sleep(2)

    # Step 2: select "United States" from the list
    country_option = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@data-value='us']")))
    country_option.click()
    time.sleep(3)

    # Step 3: unselect the "Red" wine filter.
    # The click is dispatched through JavaScript rather than WebElement.click().
    red_wine_label = wait.until(EC.element_to_be_clickable((By.XPATH, "//span[contains(@class, 'MuiButtonBase-root') and @data-testid='wineTypes_1']")))
    driver.execute_script("arguments[0].click();", red_wine_label)
    time.sleep(3)

    # Step 4: drag both slider handles outwards to select all prices.
    # Wait for each handle instead of assuming it is already in the DOM.
    slider_1 = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='rc-slider-handle rc-slider-handle-1']")))
    actions = ActionChains(driver)
    actions.click_and_hold(slider_1).move_by_offset(-10, 0).release().perform()

    slider_2 = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='rc-slider-handle rc-slider-handle-2']")))
    actions = ActionChains(driver)
    actions.click_and_hold(slider_2).move_by_offset(150, 0).release().perform()

    # Step 5: Select Any Rating option
    any_rating_label = wait.until(EC.element_to_be_clickable((By.XPATH, "//label[.//input[@name='rating' and @value='1']]")))
    driver.execute_script("arguments[0].click();", any_rating_label)

    # Step 6: Select Countries
    # 6.1: expand the "Countries" filter section
    countries_dropdown = wait.until(EC.presence_of_element_located((By.XPATH, "//legend[@data-testid='filter-toggle-button'][.//h5[.='Countries']]")))
    driver.execute_script("arguments[0].click();", countries_dropdown)
    time.sleep(2)

    # 6.2: tick the checkbox whose data-testid is "countries_ar"
    countries_ar = wait.until(EC.element_to_be_clickable((By.XPATH, "//span[contains(@class, 'MuiButtonBase-root')]/parent::label[@data-testid='countries_ar']")))
    driver.execute_script("arguments[0].click();", countries_ar)

    # 6.3: type "Brazil" in the country search box, then tick "countries_br"
    countries_search = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='filter-checkboxes-search' and @placeholder='Search countries']")))
    countries_search.click()
    countries_search.send_keys("Brazil")
    time.sleep(1)
    countries_br = wait.until(EC.element_to_be_clickable((By.XPATH, "//span[contains(@class, 'MuiButtonBase-root')]/parent::label[@data-testid='countries_br']")))
    driver.execute_script("arguments[0].click()", countries_br)

    # Give the filtered result list time to refresh before it is scraped
    time.sleep(5)
except Exception:
    # A failed step would otherwise leave an orphaned Chrome process behind.
    driver.quit()
    raise

def getPageLinks(list):
    """Collect the wine links of the current results page and move to the next page.

    Uses the module-level ``driver`` (Selenium WebDriver) and ``wait``
    (WebDriverWait). Every ``<a data-testid="vintagePageLink">`` found in the
    current page source is turned into an absolute vivino.com URL and appended
    to ``list``.

    Args:
        list: Mutable list that is extended in place with the links found.
            The name shadows the builtin; it is kept so existing callers that
            pass it by keyword keep working.

    Returns:
        bool: True when the "Next" button was clicked, i.e. another page is
        ready to be scraped. False when there is no "Next" button, it is marked
        ``aria-disabled="true"``, the click failed, or the page contributed no
        link that was not already collected (which guards against looping
        forever on a page that does not advance).
    """
    # Parse the page source as currently rendered by the browser
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anchors = soup.find_all('a', {"data-testid": "vintagePageLink"})
    # Anchors without an href cannot be followed later, so they are skipped
    page_links = ["https://www.vivino.com" + a["href"] for a in anchors if a.get("href")]

    already_seen = set(list)
    found_new_link = any(link not in already_seen for link in page_links)
    list.extend(page_links)
    if not found_new_link:
        return False

    try:
        next_page_btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@data-trackingid='buttonDefault'][.//span[.='Next']]")))
        if next_page_btn.get_attribute("aria-disabled") == "true":
            # Last page reached
            return False
        driver.execute_script("arguments[0].click()", next_page_btn)
    except Exception:
        # No "Next" button within the wait timeout, or the click failed:
        # treat it as the end of the pagination.
        return False

    # Let the next page of results load before the caller scrapes it
    time.sleep(5)
    return True

import os  # imported here so the cell does not depend on another cell's imports

link_list = []
next_page_available = True

route_save = "../src/data/raw/links/"

try:
    # getPageLinks() appends the links of the current page to link_list and
    # returns a truthy value while there is another page to scrape.
    while next_page_available:
        next_page_available = getPageLinks(link_list)
finally:
    # Close the browser even if scraping a page raised.
    driver.quit()

link_df = pd.DataFrame(link_list, columns=["wine_link"])

# to_csv() does not create missing folders, so make sure the target exists.
os.makedirs(route_save, exist_ok=True)
link_df.to_csv(route_save + "wine_links.csv", index=False)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd

# Folder holding the CSV files of wine links
route_import = "../src/data/raw/links/"

# Load the links to scrape and keep each distinct row once
csv = pd.read_csv(route_import + "rest-links3.csv")
wine_df = pd.DataFrame(csv, columns=["wine_link"]).drop_duplicates()

# Add one empty (None) column per attribute that will be scraped for each wine
for column in (
    "name",
    "year",
    "winery",
    "rating",
    "rating_qty",
    "price",
    "body",
    "tannins",
    "sweetness",
    "acidity",
    "notes",
    "pairings",
    "grapes",
    "region",
    "style",
    "image",
):
    wine_df[column] = None

import os  # imported here so the cell does not depend on another cell's imports


def _safe(extract, default=None):
    """Return ``extract()``, or ``default`` if it raises any ``Exception``.

    Only ``Exception`` is caught, so Ctrl-C (KeyboardInterrupt) still stops
    the scraping loop.
    """
    try:
        return extract()
    except Exception:
        return default


def _parse_taste(soup):
    """Read the taste-structure rows of a wine page.

    Returns a dict with the keys ``body``, ``tannins``, ``sweetness`` and
    ``acidity``. Each value is the second CSS declaration value of the row's
    indicator bar ``style`` attribute, or None when that row is missing. If any
    row cannot be parsed, all four values are None.
    """
    # Each row is identified by the text of its first cell.
    label_to_field = {
        "Light": "body",        # Light/Bold
        "Smooth": "tannins",    # Smooth/Tannic
        "Dry": "sweetness",     # Dry/Sweet
        "Soft": "acidity",      # Soft/Acidic
    }
    taste = dict.fromkeys(label_to_field.values())
    try:
        for taste_row in soup.find_all("tr", {"class": "tasteStructure__tasteCharacteristic--jLtsE"}):
            label = taste_row.find("td").get_text(strip=True)
            # attrs must be a dict; the previous set literal {"class", "..."}
            # only matched because bs4 treats a non-dict as a list of classes.
            bar_style = taste_row.find("span", {"class": "indicatorBar__progress--3aXLX"})["style"]
            value = bar_style.split(";")[1].split(":")[1]
            if label in label_to_field:
                taste[label_to_field[label]] = value
    except Exception:
        return dict.fromkeys(label_to_field.values())
    return taste


def parse_wine_page(soup):
    """Extract every scraped attribute of one wine page.

    Args:
        soup: BeautifulSoup document built from the wine page source.

    Returns:
        dict: One entry per ``wine_df`` column (``name``, ``year``, ``winery``,
        ``rating``, ``rating_qty``, ``price``, ``body``, ``tannins``,
        ``sweetness``, ``acidity``, ``notes``, ``pairings``, ``grapes``,
        ``region``, ``style``, ``image``). A field that cannot be found or
        parsed is None; one failing field never affects the others.
    """
    record = {}

    # DATA 1: NAME
    name_tag = _safe(lambda: soup.find("a", {"data-cartitemsource": "wine-page-master-link"}))
    wine_name = _safe(lambda: name_tag.get_text(strip=True)) if name_tag is not None else None
    record["name"] = wine_name

    # DATA 2: YEAR - what remains of the enclosing <div> text once the name is removed
    def _year():
        if name_tag is None:
            return None
        container = name_tag.find_parent("div")
        if container is None:
            return None
        return container.get_text(strip=True).replace(wine_name, "").strip()

    record["year"] = _safe(_year)

    # DATA 3: WINERY
    record["winery"] = _safe(
        lambda: soup.find("h1").find("div").find("a").find("div").get_text(strip=True)
    )

    # DATA 4: RATING - average value and first word of the ratings-count text
    def _rating():
        rating_link = soup.find("a", {"href": "#all_reviews"})
        value = rating_link.find("div").find("div").get_text(strip=True)
        quantity = rating_link.find_all("div")[-1].get_text(strip=True).split(" ")[0]
        return value, quantity

    record["rating"], record["rating_qty"] = _safe(_rating, default=(None, None))

    # DATA 5: PRICE - text of the current-price element without the "$" sign
    record["price"] = _safe(
        lambda: soup.find("span", {"class": "purchaseAvailability__currentPrice--3mO4u"})
        .get_text(strip=True)
        .replace("$", "")
    )

    # DATA 6: TASTE
    record.update(_parse_taste(soup))

    # DATA 7: NOTE MENTIONS - {note: number of mentions}
    record["notes"] = _safe(
        lambda: {
            str(mention.find("span").get_text(strip=True)): int(mention.get_text().split(" ")[0])
            for mention in soup.find_all("div", {"data-testid": "mentions"})
        }
    )

    # DATA 8: PAIRINGS - lower-cased aria-label of each pairing image.
    # None on failure, so a wine never inherits the previous wine's pairings.
    def _pairings():
        container = soup.find("div", {"class": "foodPairing__foodContainer--1bvxM"})
        return [
            anchor.find("div", {"role": "img"})["aria-label"].lower()
            for anchor in container.find_all("a", recursive=False)
        ]

    record["pairings"] = _safe(_pairings)

    # DATA 9-11: GRAPES, REGION, STYLE - read by position from the wine-fact rows
    wine_facts = _safe(lambda: soup.find_all("tr", {"data-testid": "wineFactRow"}))
    record["grapes"] = _safe(lambda: wine_facts[1].find("td").get_text(strip=True).split(","))
    record["region"] = _safe(lambda: wine_facts[2].find("td").get_text(strip=True).split("/"))
    record["style"] = _safe(lambda: wine_facts[3].find("td").get_text())

    # DATA 12: WINE IMAGE - the src is used with an "https:" prefix
    record["image"] = _safe(
        lambda: "https:" + soup.find("img", {"class": "wineLabel-module__image--3HOnd"})["src"]
    )

    return record


options = webdriver.ChromeOptions()
# No headless flag is passed, so Chrome runs with a visible window. The former
# `options.headless = False` relied on a property Selenium has since removed.
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

route_save = "../src/data/raw/scraped_wines/"

first_link = True

try:
    # Iterate over the rows of the DataFrame, one wine page per row
    for index, row in wine_df.iterrows():
        link = row["wine_link"]
        try:
            driver.get(link)
        except Exception as exc:
            # Leave the row empty and carry on with the next link.
            print(f"Skipping {link!r}: {type(exc).__name__}")
            continue

        if first_link:
            wait = WebDriverWait(driver, 30)

            # Step 1: click the dropdown menu
            dropdown = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "simpleLabel-module__selectedKey--3ngzL")))
            dropdown.click()
            time.sleep(5)

            # Step 2: select "United States" from the list
            country_option = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@data-value='us']")))
            country_option.click()
            time.sleep(5)

            first_link = False

        # Give the page time to finish rendering before reading its source
        time.sleep(6)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # LAST STEP: store the parsed values in the DataFrame
        for column, value in parse_wine_page(soup).items():
            wine_df.at[index, column] = value
finally:
    # Runs on success, on error and on manual interruption: close the browser
    # and keep whatever was scraped so far.
    try:
        driver.quit()
    finally:
        os.makedirs(route_save, exist_ok=True)
        # Save the updated CSV
        wine_df.to_csv(route_save + "wines1056-2026.csv", index=False)

wine_df

In [None]:
# Keep only the wines whose page was actually scraped (a name was found).
scraped_wines = wine_df.loc[wine_df["name"].notna()]

# Write the batch into the scraped_wines folder, which is where the merge cell
# reads "wines1056-2026.csv" from; a bare file name would land in the current
# working directory instead.
scraped_wines.to_csv("../src/data/raw/scraped_wines/" + "wines1056-2026.csv", index=False)

In [None]:
# Folder holding the CSV files of wine links
route_import = "../src/data/raw/links/"

# Links of the rows that still have no name, selected in a single .loc call
not_scraped = wine_df["name"].isna()
wines_left = wine_df.loc[not_scraped, "wine_link"]

wines_left.to_csv(route_import + "rest-links3.csv", index=False)

In [None]:
import pandas as pd

import os  # imported here so the cell also runs on its own in a fresh kernel

# Read the four scraped batches
route_import = "../src/data/raw/scraped_wines/"
wines_1 = pd.read_csv(route_import + "wines1-155.csv")
wines_2 = pd.read_csv(route_import + "wines156-866.csv")
wines_3 = pd.read_csv(route_import + "wines867-1055.csv")
wines_4 = pd.read_csv(route_import + "wines1056-2026.csv")

df_wines_list = [wines_1, wines_2, wines_3, wines_4]

# Stack the batches into one frame with a fresh 0..n-1 index
all_wines = pd.concat(df_wines_list, ignore_index=True, sort=False)

route_save = "../src/data/processed/"

# route_save was previously defined but never used, so the merged file ended up
# in the current working directory. to_csv() does not create missing folders.
os.makedirs(route_save, exist_ok=True)
all_wines.to_csv(route_save + "all_wines.csv", index=False)