In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [None]:
# Search-results page used as the starting point for the discipline scrape
# (judging by the URL: master's programmes in the United States).
URL = "https://www.mastersportal.com/search/master/united-states"

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options

import time

In [None]:
def extractTag(URL, tagType, tagClass, timeout=10):
    """
    Open a page in Firefox and return the text of the first matching element.

    Args:
        URL (str): Address of the page to open.
        tagType (str): Tag name of the target element (e.g. 'section', 'ul').
        tagClass (str): Value the element's ``class`` attribute must equal exactly.
        timeout (int, optional): Seconds to wait for the element to be present
            in the DOM (default is 10).

    Returns:
        str: The text of the matched element. The text is also printed.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element is not
            present within ``timeout`` seconds.
    """
    options = Options()
    # Firefox preferences intended to keep pop-ups from interfering with the page.
    options.set_preference("dom.popup_maximum", 0)
    options.set_preference("privacy.popups.disable_from_plugins", 3)

    driver = webdriver.Firefox(options=options)

    try:
        # Navigate inside the try block so that a failed page load still
        # reaches driver.quit() and does not leave a browser process behind.
        driver.get(URL)

        # Wait for the dynamic content to load (adjust the timeout as needed)
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, f"//{tagType}[@class='{tagClass}']"))
        )

        # Read the text while the browser is still open
        scraped_data = element.text
        print(scraped_data)
        return scraped_data

    finally:
        # Close the browser window
        driver.quit()

## Basic exploration

In [None]:
# Scrape the text of the <section class="DisciplineFilterWrapper"> element on the search page.
DISCIPLINES = extractTag(URL, "section", "DisciplineFilterWrapper")

In [None]:
# Display the raw scraped text to inspect its line layout.
DISCIPLINES

In [None]:
import json

In [None]:
# Keep every second line of the scraped text, starting from the second line,
# and save the result as a JSON array.
# NOTE(review): presumably the skipped lines are headings/counts — confirm against the raw text above.
cleanedUpDisciplines = DISCIPLINES.split('\n')[1::2]
with open('ALL_DISCIPLINES.json', 'w') as f:
    json.dump(cleanedUpDisciplines, f)

In [None]:
# Search-results page narrowed to one discipline (business-management, per the URL path).
TargetURL = "https://www.mastersportal.com/search/master/business-management/united-states"

In [None]:
# Scrape the same discipline-filter section, this time from the narrowed page.
businessDISCIPLINES = extractTag(TargetURL, "section", "DisciplineFilterWrapper")

In [None]:
# Display the raw scraped text to inspect its line layout.
businessDISCIPLINES

In [None]:
# Keep every second line of the scraped text, starting from the sixth line,
# and save the result as a JSON array.
# NOTE(review): presumably the first five lines are the parent-discipline header — confirm against the raw text above.
cleanedUpSubDisciplines = businessDISCIPLINES.split('\n')[5::2]
with open('BUSINESS_DISCIPLINES.json', 'w') as f:
    json.dump(cleanedUpSubDisciplines, f)

## Running for business disciplines

In [None]:
# Search-results page for a single sub-discipline (advertising, per the URL path).
advertisingURL = "https://www.mastersportal.com/search/master/advertising/united-states"

In [None]:
# Scrape the text of the <ul class="SearchResultsList"> element, with a longer 15 s wait.
allProgramsInfo = extractTag(advertisingURL, "ul", "SearchResultsList", timeout=15)

In [None]:
def _extractSinglePrograms(allProgramInfoList):
    programInfoCleaned = []
    currProgram = []

    for item in allProgramInfoList:
        if "Add to compare" in item:
            if currProgram:
                programInfoCleaned.append(currProgram)
                currProgram = []
        else:
           if item != 'Check match': currProgram.append(item) 

    if currProgram: programInfoCleaned.append(currProgram)
    return programInfoCleaned

def processProgramInfo(rawProgramInfo):
    """
    Split raw scraped results text into lines and group them per program.

    Args:
        rawProgramInfo (str): Newline-separated text of the results list.

    Returns:
        list[list[str]]: One list of lines per program, as produced by
        ``_extractSinglePrograms``.
    """
    return _extractSinglePrograms(rawProgramInfo.split('\n'))


In [None]:
# Group the first results page into one list of lines per program.
x = processProgramInfo(allProgramsInfo)

In [None]:
# Keep at most the first seven lines of each program; anything after that is dropped.
relevantProgramInfo = [sublist[:7] for sublist in x]

In [None]:
# Tabulate the trimmed programs: one row per program, one column per kept line.
accountingPageOneDF = pd.DataFrame(relevantProgramInfo)

In [None]:
# Name the seven columns.
# NOTE(review): assumes every program lists its fields in exactly this order — confirm against the data.
accountingPageOneDF.columns=["Program", "Tuition", "Duration", "Description", 
                             "Program Type", "University", "Location"]

In [None]:
# Display the resulting table.
accountingPageOneDF

In [None]:
# Export the table as JSON (one object per row) and as an Excel sheet without the index.
# NOTE(review): the variable and the JSON file name say "account(ing)", but the rows
# were scraped from advertisingURL — confirm the intended names.
accountingPageOneDF.to_json('account_1.json', orient='records')
accountingPageOneDF.to_excel('advertising_pageOne.xlsx', index=False)

In [None]:
# The pagination label is expected to end with the total number of pages.
# Read the whole trailing run of digits rather than only the last character,
# so that a count of 10 or more is not truncated to its final digit.
pageLabel = extractTag(advertisingURL, "p", "SeeMoreLabelVar1", timeout=5)
pageCount = int(pageLabel[len(pageLabel.rstrip("0123456789")):])

In [None]:
# Collect the raw text of every results page, one string per page.
allProgramsInfoRawList = []
for pageIndex in range(pageCount):
    # Page 1 lives at the base URL; later pages carry an explicit page suffix.
    currPageURL = f"{advertisingURL}/page={pageIndex + 1}" if pageIndex > 0 else advertisingURL
    # append, not extend: extractTag returns a str, and extending a list with a
    # str would add it one character at a time.
    allProgramsInfoRawList.append(extractTag(currPageURL, "ul", "SearchResultsList", timeout=10))

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import TimeoutException

import time

def extractTagWithPagination(URL, tagType, tagClass, next_button_xpath, timeout=10, max_pages=None):
    """
    Scrape the text of one element from each page of a paginated listing.

    Args:
        URL (str): Address of the first page.
        tagType (str): Tag name of the element holding the data (e.g. 'ul').
        tagClass (str): Value the element's ``class`` attribute must equal exactly.
        next_button_xpath (str): XPath locating the 'next page' button.
        timeout (int, optional): Seconds to wait for each element (default is 10).
        max_pages (int, optional): Maximum number of pages to scrape.
            None scrapes until no usable next button is found.

    Returns:
        str: Text of every scraped page, each followed by a newline.

    Raises:
        selenium.common.exceptions.TimeoutException: If the data element does
            not appear on a page within ``timeout`` seconds.
    """
    scraped_data = ''  # accumulates the text of every page

    options = Options()
    # Firefox preferences intended to keep pop-ups from interfering with the page.
    options.set_preference("dom.popup_maximum", 0)
    options.set_preference("privacy.popups.disable_from_plugins", 3)
    # Firefox takes a plain headless flag; "--headless=new" is a Chrome option.
    options.add_argument("--headless")

    driver = webdriver.Firefox(options=options)

    try:
        # Navigate inside the try block so a failed load still closes the browser.
        driver.get(URL)
        print(driver.current_url)

        page_number = 1
        while True:
            # Scrape first, so the last page is captured even though it has
            # no next button to wait for.
            element = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, f"//{tagType}[@class='{tagClass}']"))
            )
            scraped_data += element.text + '\n'
            print(f"Scraped Page {page_number} \n")

            # Stop before clicking once the requested number of pages is reached.
            if max_pages is not None and page_number >= max_pages:
                break

            try:
                next_button = WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((By.XPATH, next_button_xpath))
                )
            except TimeoutException:
                # No next button appeared: this was the last page.
                print("EOF")
                break

            if not next_button.is_enabled():
                print("Next button is disabled. End of pagination.")
                break

            next_button.click()
            page_number += 1

            # Give the next page time to render before looking up its content.
            time.sleep(2)

    finally:
        # Close the browser window
        driver.quit()

    return scraped_data

# Example usage: scrape the advertising results list, following the next-page button.
next_button_xpath = "//button[@class='NextButton']"  # XPath of the next-page button; update if the site's markup differs
max_pages = 2  # Upper bound on pages to scrape; set to None to scrape all pages

a = extractTagWithPagination(advertisingURL, "ul", "SearchResultsList", next_button_xpath, max_pages=max_pages)


In [None]:
# Display the text returned by the paginated scrape.
a

In [None]:
# Group the scraped text into programs, then keep at most the first seven lines of each.
# NOTE(review): this processes allProgramsInfo (the single-page scrape), not the
# paginated result `a` — confirm which input is intended.
allProgramsInfoRawList = processProgramInfo(allProgramsInfo)
allProgramsInfoRawList = [sublist[:7] for sublist in allProgramsInfoRawList]

In [None]:
# Build the table from the rows prepared in the previous cell
# (allProgramsInfoRawList) rather than from the older page-one variable, and
# pass the column names to the constructor so an empty result still yields a
# correctly labelled, empty table.
accountingDF = pd.DataFrame(
    allProgramsInfoRawList,
    columns=["Program", "Tuition", "Duration", "Description",
             "Program Type", "University", "Location"],
)

In [None]:
# Display the resulting table.
accountingDF

## Other experiments: geolocation override and pagination helpers

In [None]:
def changeGeoLoc(URL, tagType, tagClass, timeout=10):
    """
    Scrape one element's text from a page using a Firefox profile whose
    geolocation provider is pointed at fixed coordinates.

    Args:
        URL (str): Address of the page to open.
        tagType (str): Tag name of the target element (e.g. 'section', 'ul').
        tagClass (str): Value the element's ``class`` attribute must equal exactly.
        timeout (int, optional): Seconds to wait for the element to be present
            in the DOM (default is 10).

    Returns:
        str: The text of the matched element. The text is also printed.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element is not
            present within ``timeout`` seconds.
    """
    # Latitude and longitude reported as the browser's location
    location = {'latitude': 40.7128, 'longitude': -74.0060}

    firefox_options = webdriver.FirefoxOptions()

    # Enable geolocation
    firefox_options.set_preference("geo.enabled", True)

    # Point the network geolocation provider at a data: URL that carries the
    # coordinates as JSON, so no external location service is consulted.
    firefox_options.set_preference("geo.provider.network.url", 
        f"data:application/json, {{\"location\": {{\"lat\": {location['latitude']}, \"lng\": {location['longitude']}}}}}")

    # Create a Firefox WebDriver with the configured options
    driver = webdriver.Firefox(options=firefox_options)

    try:
        # Navigate inside the try block so that a failed page load still
        # reaches driver.quit() and does not leave a browser process behind.
        driver.get(URL)

        # Wait for the dynamic content to load (adjust the timeout as needed)
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, f"//{tagType}[@class='{tagClass}']"))
        )

        # Read the text while the browser is still open
        scraped_data = element.text
        print(scraped_data)
        return scraped_data

    finally:
        # Close the browser window
        driver.quit()

In [None]:
# Search-results page used to try out the pagination helper (actuarial-science, per the URL path).
YRL = "https://www.mastersportal.com/search/master/actuarial-science"

In [None]:
def t2(URL, tagType, tagClass, nextButtonClass, timeout=10, isHeadless=False):
    """
    Extracts data from a paginated web page with dynamic content using Selenium.

    Args:
        URL (str): The URL of the web page to scrape.
        tagType (str): The HTML tag type (e.g., 'ul', 'div') containing the data.
        tagClass (str): The exact class attribute value of the HTML tag containing the data.
        nextButtonClass (str): A class name of the 'Next' button for pagination.
        timeout (int, optional): Maximum time to wait for elements to load in seconds (default is 10).
        isHeadless (bool, optional): Run Firefox in headless mode. False by default,
            i.e. a visible browser window is opened.

    Returns:
        str: Text of every scraped page, each followed by a newline.

    Raises:
        selenium.common.exceptions.TimeoutException: If the data element does
            not appear on a page within 'timeout' seconds.

    Notes:
        - Pagination stops when no 'Next' button appears within 'timeout'
          seconds or when the button is disabled.
        - A fixed 2-second pause follows each click to let the next page load.
    """
    scraped_data = ''  # accumulates the text of every page

    options = Options()
    # Firefox preferences intended to keep pop-ups from interfering with the page.
    options.set_preference("dom.popup_maximum", 0)
    options.set_preference("privacy.popups.disable_from_plugins", 3)

    if isHeadless:
        options.add_argument("--headless")  # Setting browser to headless mode

    driver = webdriver.Firefox(options=options)

    try:
        # Navigate inside the try block so that a failed page load still
        # reaches driver.quit() and does not leave a browser process behind.
        driver.get(URL)

        page_number = 1
        while True:
            # Wait for the dynamic content to load (adjust the timeout as needed)
            disciplines = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, f"//{tagType}[@class='{tagClass}']"))
            )

            # Extract the data
            scraped_data += disciplines.text + '\n'
            print(f"Scraped Page {page_number} \n")

            # Only the lookup can time out, so only the lookup is guarded.
            try:
                next_button = WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((By.CLASS_NAME, nextButtonClass))
                )
            except TimeoutException:
                # No next button appeared: this was the last page.
                print("EOF")
                break

            if not next_button.is_enabled():
                print("Next button is disabled. End of pagination.")
                break

            next_button.click()
            page_number += 1

            # Give the next page time to render before looking up its content.
            time.sleep(2)

    finally:
        # Close the browser window
        driver.quit()

    return scraped_data


In [None]:
# Scrape every results page of the YRL listing; the returned text is shown as the cell output.
t2(YRL, "ul", "SearchResultsList", "NextButton")

In [None]:
def extractTagWithPagination(URL, tagType, tagClass, nextButtonClass, timeout=10, max_pages=None, isHeadless=False):
    """
    Extracts data from a paginated web page with dynamic content using Selenium.

    Args:
        URL (str): The URL of the web page to scrape.
        tagType (str): The HTML tag type (e.g., 'ul', 'div') containing the data.
        tagClass (str): The exact class attribute value of the HTML tag containing the data.
        nextButtonClass (str): A class name of the 'Next' button for pagination.
        timeout (int, optional): Maximum time to wait for elements to load in seconds (default is 10).
        max_pages (int, optional): Maximum number of pages to scrape. Set to None to scrape all pages.
        isHeadless (bool, optional): Run Firefox in headless mode. False by default,
            i.e. a visible browser window is opened.

    Returns:
        str: Text of every scraped page, each followed by a newline.

    Raises:
        selenium.common.exceptions.TimeoutException: If the data element does
            not appear on a page within 'timeout' seconds.

    Notes:
        - Pagination stops when 'max_pages' pages have been scraped, when no
          'Next' button appears within 'timeout' seconds, or when the button
          is disabled.
        - A fixed 2-second pause follows each click to let the next page load.
    """
    scraped_data = ''  # accumulates the text of every page

    options = Options()
    # Firefox preferences intended to keep pop-ups from interfering with the page.
    options.set_preference("dom.popup_maximum", 0)
    options.set_preference("privacy.popups.disable_from_plugins", 3)

    if isHeadless:
        options.add_argument("--headless")  # Setting browser to headless mode

    driver = webdriver.Firefox(options=options)

    try:
        # Navigate inside the try block so that a failed page load still
        # reaches driver.quit() and does not leave a browser process behind.
        driver.get(URL)

        page_number = 1
        while True:
            # Scrape first, so the last page is captured even though it has
            # no next button to wait for.
            disciplines = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, f"//{tagType}[@class='{tagClass}']"))
            )

            # Extract the data
            scraped_data += disciplines.text + '\n'
            print(f"Scraped Page {page_number} \n")

            # Stop before clicking once the requested number of pages is reached.
            if max_pages is not None and page_number >= max_pages:
                break

            # Only the lookup can time out, so only the lookup is guarded.
            try:
                next_button = WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((By.CLASS_NAME, nextButtonClass))
                )
            except TimeoutException:
                # No next button appeared: this was the last page.
                print("EOF")
                break

            if not next_button.is_enabled():
                print("Next button is disabled. End of pagination.")
                break

            next_button.click()
            page_number += 1

            # Give the next page time to render before looking up its content.
            time.sleep(2)

    finally:
        # Close the browser window
        driver.quit()

    return scraped_data