# Firefox version - Working. Tested

In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import openpyxl
import re
import time
import os
import glob

def rename_last_modified_file(folder_path, desired_filename):
    """Rename the most recently modified file in a folder.

    Picks the newest non-directory entry directly inside ``folder_path``
    (by modification time) and moves it to ``folder_path/desired_filename``.
    The outcome is printed; nothing is returned.

    Args:
        folder_path: Directory to search (not recursive).
        desired_filename: New base name for the selected file.
    """
    # Browsers keep in-progress downloads under these suffixes; renaming one
    # would leave a truncated file under the final name.
    partial_suffixes = ('.part', '.crdownload')

    candidates = [
        path
        for path in glob.glob(os.path.join(folder_path, '*'))
        if not os.path.isdir(path)
        and not path.lower().endswith(partial_suffixes)
    ]
    if not candidates:
        print("No files found in the folder")
        return

    newest = max(candidates, key=os.path.getmtime)
    new_file_path = os.path.join(folder_path, desired_filename)
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename raises FileExistsError on Windows when the name is taken.
    os.replace(newest, new_file_path)
    print(f"Renamed '{os.path.basename(newest)}' to '{desired_filename}'")

# Configure Selenium WebDriver (you need to download the appropriate driver for your browser)
geckodriver_path = '../webdriver/geckodriver.exe'  # Update with the correct file name and extension

# Set Firefox profile with download directory preference
firefox_profile = webdriver.FirefoxProfile()
download_directory_path = os.path.abspath('../data/votes_download')
# Make sure the download target exists before the browser is pointed at it
os.makedirs(download_directory_path, exist_ok=True)
firefox_profile.set_preference("browser.download.folderList", 2)  # 2 = use the directory given in browser.download.dir
firefox_profile.set_preference("browser.download.dir", download_directory_path)
firefox_profile.set_preference("browser.download.manager.showWhenStarting", False)
firefox_profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")

# Create a FirefoxOptions instance with the configured profile
firefox_options = Options()
firefox_options.profile = firefox_profile

# Create a Firefox WebDriver instance with the configured options.
# The driver path goes through a Service object: the `executable_path`
# keyword is deprecated in Selenium 4 and no longer accepted from 4.10 on.
driver = webdriver.Firefox(service=Service(geckodriver_path), options=firefox_options)

# Define directories and create them if required
directory_path = '../data'
os.makedirs(directory_path, exist_ok=True)

# File for input and outputs (the same workbook is read and written)
outfile_file = os.path.join(directory_path, 'data.xlsx')

# Load the data file
data_workbook = openpyxl.load_workbook(outfile_file)
Input_sheet = data_workbook['Snapshot Summary']

# Add a new sheet to the workbook for the scraped details
Output_sheet = data_workbook.create_sheet(title='Snapshot Details')

# Write the header row
titles = ['Title', 'Status', 'Proposer', 'Proposal motivation', 'Start date', 'End date','Downloaded Filename']
for column, title in enumerate(titles, start=1):
    Output_sheet.cell(row=1, column=column).value = title


def _text_or_none(element):
    """Return ``element.text``, or None when the lookup found no element."""
    return None if element is None else element.text


def scrape_posts(link, title, sheet, driver, download_directory_path):
    """Scrape one proposal page, download its file and record a sheet row.

    Loads ``link`` in ``driver``, reads status, proposer, proposal text and
    start/end dates from the rendered HTML, clicks the download button,
    renames the newest file in ``download_directory_path`` to a name derived
    from ``title``, and writes all values to the next free row of ``sheet``.
    A field whose element is not found on the page is written as an empty
    cell instead of aborting the run.

    Args:
        link: URL of the page to open.
        title: Proposal title; also used to build the downloaded file's name.
        sheet: openpyxl worksheet that receives the new row.
        driver: Selenium WebDriver used to load the page and click.
        download_directory_path: Folder the browser downloads into.
    """
    # Open the webpage
    driver.get(link)
    # NOTE(review): fixed wait; WebDriverWait/EC are imported in this file
    # but unused - an explicit wait could replace this sleep.
    time.sleep(15)  # wait for elements to load

    # Parse the rendered page with bs4
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    row = sheet.max_row + 1
    # soup.find returns None when nothing matches, so every lookup goes
    # through _text_or_none rather than dereferencing .text directly.
    status = _text_or_none(soup.find('span', class_='bg-violet-600 State text-white mr-2'))
    proposer = _text_or_none(soup.find('span', class_='w-full cursor-pointer truncate text-skin-link'))
    proposal_motivation = _text_or_none(soup.find('div', class_='markdown-body break-words'))

    # Start and end dates are the 4th and 5th <div> inside the first
    # 'space-y-1' container; look the container up once and bounds-check.
    details = soup.find('div', class_='space-y-1')
    detail_divs = details.find_all('div') if details is not None else []
    start_date = detail_divs[3].text if len(detail_divs) > 3 else None
    end_date = detail_divs[4].text if len(detail_divs) > 4 else None

    download_button = driver.find_element(By.CSS_SELECTOR, 'button.flex:nth-child(3)')
    download_button.click()
    print('Clicked on Download button')

    time.sleep(15)  # Waits 15 seconds for file to be downloaded
    # str() keeps non-text cell values (e.g. a numeric title) from breaking re.sub
    desired_filename = re.sub(r'\W+', '_', str(title)) + '.csv'
    rename_last_modified_file(download_directory_path, desired_filename)

    values = (title, status, proposer, proposal_motivation,
              start_date, end_date, desired_filename)
    for column, value in enumerate(values, start=1):
        sheet.cell(row=row, column=column).value = value

    print(f"Topic {title} completed. Moving to next")


# Iterate through the links in the data file
try:
    for row in Input_sheet.iter_rows(min_row=2, max_col=9, values_only=True):
        title, link = row[1], row[5]
        if not link:
            # Blank or partially filled rows carry no URL to open
            print(f'Skipping row without a link (title: {title})')
            continue
        print(f'Title - {title} started')
        print(link)
        scrape_posts(link, title, Output_sheet, driver,download_directory_path)
        # Save after every row so finished work survives a later failure
        data_workbook.save(outfile_file)
        print(f'Title - {title} finished')
        print("*" * 50)

    # Save the workbook
    data_workbook.save(outfile_file)
finally:
    # Close the browser even when a page fails to scrape
    driver.quit()


# ChatGPT-generated Chrome version
### Before running, update the chromedriver path and the data/output file paths

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import openpyxl
import re
import time
import os
import glob

def rename_last_modified_file(folder_path, desired_filename):
    """Rename the most recently modified file in a folder.

    Picks the newest non-directory entry directly inside ``folder_path``
    (by modification time) and moves it to ``folder_path/desired_filename``.
    The outcome is printed; nothing is returned.

    Args:
        folder_path: Directory to search (not recursive).
        desired_filename: New base name for the selected file.
    """
    # Browsers keep in-progress downloads under these suffixes; renaming one
    # would leave a truncated file under the final name.
    partial_suffixes = ('.part', '.crdownload')

    candidates = [
        path
        for path in glob.glob(os.path.join(folder_path, '*'))
        if not os.path.isdir(path)
        and not path.lower().endswith(partial_suffixes)
    ]
    if not candidates:
        print("No files found in the folder")
        return

    newest = max(candidates, key=os.path.getmtime)
    new_file_path = os.path.join(folder_path, desired_filename)
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename raises FileExistsError on Windows when the name is taken.
    os.replace(newest, new_file_path)
    print(f"Renamed '{os.path.basename(newest)}' to '{desired_filename}'")

# Configure Selenium WebDriver (you need to download the appropriate driver for your browser)
# Imported here under an alias so it cannot be confused with the Firefox
# Service class imported elsewhere in this notebook.
from selenium.webdriver.chrome.service import Service as ChromeService

chromedriver_path = '../webdriver/chromedriver.exe'  # Update with the correct file name and extension

# Set Chrome options with download directory preference
chrome_options = Options()
download_directory_path = os.path.abspath('../data/votes_download')
# Make sure the download target exists before the browser is pointed at it
os.makedirs(download_directory_path, exist_ok=True)
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_directory_path,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
})

# Create a Chrome WebDriver instance with the configured options.
# The driver path goes through a Service object: the `executable_path`
# keyword is deprecated in Selenium 4 and no longer accepted from 4.10 on.
driver = webdriver.Chrome(service=ChromeService(chromedriver_path), options=chrome_options)

# Define directories and create them if required
directory_path = '../data'
os.makedirs(directory_path, exist_ok=True)

# File for input and outputs (the same workbook is read and written)
outfile_file = os.path.join(directory_path, 'data.xlsx')

# Load the data file
data_workbook = openpyxl.load_workbook(outfile_file)
Input_sheet = data_workbook['Snapshot Summary']

# Add a new sheet to the workbook for the scraped details
Output_sheet = data_workbook.create_sheet(title='Snapshot Details')

# Write the header row
titles = ['Title', 'Status', 'Proposer', 'Proposal motivation', 'Start date', 'End date','Downloaded Filename']
for column, title in enumerate(titles, start=1):
    Output_sheet.cell(row=1, column=column).value = title


def _text_or_none(element):
    """Return ``element.text``, or None when the lookup found no element."""
    return None if element is None else element.text


def scrape_posts(link, title, sheet, driver, download_directory_path):
    """Scrape one proposal page, download its file and record a sheet row.

    Loads ``link`` in ``driver``, reads status, proposer, proposal text and
    start/end dates from the rendered HTML, clicks the download button,
    renames the newest file in ``download_directory_path`` to a name derived
    from ``title``, and writes all values to the next free row of ``sheet``.
    A field whose element is not found on the page is written as an empty
    cell instead of aborting the run.

    Args:
        link: URL of the page to open.
        title: Proposal title; also used to build the downloaded file's name.
        sheet: openpyxl worksheet that receives the new row.
        driver: Selenium WebDriver used to load the page and click.
        download_directory_path: Folder the browser downloads into.
    """
    # Open the webpage
    driver.get(link)
    # NOTE(review): fixed wait; WebDriverWait/EC are imported in this file
    # but unused - an explicit wait could replace this sleep.
    time.sleep(10)  # wait for elements to load

    # Parse the rendered page with bs4
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    row = sheet.max_row + 1
    # soup.find returns None when nothing matches, so every lookup goes
    # through _text_or_none rather than dereferencing .text directly.
    status = _text_or_none(soup.find('span', class_='bg-violet-600 State text-white mr-2'))
    proposer = _text_or_none(soup.find('span', class_='w-full cursor-pointer truncate text-skin-link'))
    proposal_motivation = _text_or_none(soup.find('div', class_='markdown-body break-words'))

    # Start and end dates are the 4th and 5th <div> inside the first
    # 'space-y-1' container; look the container up once and bounds-check.
    details = soup.find('div', class_='space-y-1')
    detail_divs = details.find_all('div') if details is not None else []
    start_date = detail_divs[3].text if len(detail_divs) > 3 else None
    end_date = detail_divs[4].text if len(detail_divs) > 4 else None

    download_button = driver.find_element(By.CSS_SELECTOR, 'button.flex:nth-child(3)')
    download_button.click()
    print('Clicked on Download button')

    time.sleep(15)  # Waits 15 seconds for file to be downloaded
    # str() keeps non-text cell values (e.g. a numeric title) from breaking re.sub
    desired_filename = re.sub(r'\W+', '_', str(title)) + '.csv'
    rename_last_modified_file(download_directory_path, desired_filename)

    values = (title, status, proposer, proposal_motivation,
              start_date, end_date, desired_filename)
    for column, value in enumerate(values, start=1):
        sheet.cell(row=row, column=column).value = value

    print(f"Topic {title} completed. Moving to next")


# Iterate through the links in the data file
try:
    for row in Input_sheet.iter_rows(min_row=2, max_col=9, values_only=True):
        title, link = row[1], row[5]
        if not link:
            # Blank or partially filled rows carry no URL to open
            print(f'Skipping row without a link (title: {title})')
            continue
        print(f'Title - {title} started')
        print(link)
        scrape_posts(link, title, Output_sheet, driver, download_directory_path)
        # Save after every row so finished work survives a later failure
        data_workbook.save(outfile_file)
        print(f'Title - {title} finished')
        print("*" * 50)

    # Save the workbook
    data_workbook.save(outfile_file)
finally:
    # Close the browser even when a page fails to scrape
    driver.quit()
