In [3]:
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import (
    StaleElementReferenceException, 
    TimeoutException, 
    NoSuchElementException
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os




# --------------------------------------------------------------------------------------------------------
# Part 1: Function to scrape links from a single URL with additional filtering
# --------------------------------------------------------------------------------------------------------

def scrape_links(driver, url, retries=3):
    """Load ``url`` and collect the hrefs of anchors that mention 'call' or '2024'.

    An anchor is kept when its href is non-empty and either the href or the
    anchor's visible text contains 'call' (case-insensitive) or '2024'.

    Args:
        driver: Selenium WebDriver (any object exposing ``get`` and
            ``find_elements`` works) used to load and query the page.
        url: Address of the page to scan.
        retries: Number of extra attempts made when a
            StaleElementReferenceException escapes the page scan. Negative
            values are treated as 0 so the page is always tried at least once.

    Returns:
        list: Matching hrefs in the order the anchors were found, duplicates
        included. An empty list is returned when scraping fails; the error is
        printed instead of raised.
    """
    keywords = ('call', '2024')

    def mentions_keyword(value):
        # Case-insensitive substring test that tolerates None / empty values.
        lowered = (value or '').lower()
        return any(keyword in lowered for keyword in keywords)

    def perform_scrape():
        driver.get(url)
        matched_hrefs = []

        for link in driver.find_elements(By.TAG_NAME, 'a'):
            try:
                href = link.get_attribute('href')
                text_content = link.text  # visible text of the anchor
            except StaleElementReferenceException:
                # The element left the DOM while we were iterating; skip it.
                continue

            # Only the href and the visible text are inspected.
            if href and (mentions_keyword(href) or mentions_keyword(text_content)):
                matched_hrefs.append(href)

        return matched_hrefs

    # Always make at least one attempt, even if a caller passes retries < 0.
    attempts = max(retries, 0) + 1

    for attempt in range(attempts):
        try:
            return perform_scrape()
        except StaleElementReferenceException:
            # Per-link stale references are already absorbed inside
            # perform_scrape, so this only fires if one escapes elsewhere.
            if attempt < attempts - 1:
                print(f"Retrying {url} due to stale element reference...")
                time.sleep(2)
            else:
                print(f"Error scraping links from {url} : StaleElementReferenceException")
                return []
        except Exception as e:
            # Best effort: report the failure and let the caller carry on.
            print(f"Error scraping links from {url} : {e}")
            return []
        

# --------------------------------------------------------------------------------------------------------
# Part 3: Scraping the links from the initial links
# --------------------------------------------------------------------------------------------------------
 
    
# Path to the Chrome WebDriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override the machine-specific default without editing this cell.
webdriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r'C:\Users\Lee Ming Jia\Desktop\driver\chromedriver-win64\chromedriver.exe',
)

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode (without opening browser window)

# Create a new Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(executable_path=webdriver_path), options=chrome_options)

# try/finally guarantees the browser process is shut down even if a step below fails.
try:
    # Open the target webpage
    url = "https://csrankings.org/#/fromyear/2024/toyear/2024/index?all&us"
    driver.get(url)

    # Allow some time for the page to load
    time.sleep(5)

    # Locate the div with the specified class
    div_element = driver.find_element(By.CSS_SELECTOR, "div.col-centered.col-xs-6.col-sm-push-6.col-sm-6.col-md-6.col-lg-6.text-center")

    # Find all <p> tags with the specified class within the div
    p_elements = div_element.find_elements(By.CSS_SELECTOR, "p.text-muted[style='font-variant:small-caps;']")

    link_1 = set()

    # Extract the links from the <p> tags
    for p in p_elements:
        for link in p.find_elements(By.TAG_NAME, "a"):
            href = link.get_attribute("href")
            if href:  # anchors without an href cannot be visited later
                link_1.add(href)

    print(link_1)

    # --------------------------------------------------------------------------------------------------------
    # Part 4: Scraping the rest of the links from the individual websites
    # --------------------------------------------------------------------------------------------------------

    # First round: collect candidate links from every seed site
    link_call = set()

    for url in link_1:
        print(f"Processing URL: {url}")
        link_call.update(scrape_links(driver, url))

    num_links_1 = len(link_call)
    print(f"Number of links in link_call: {num_links_1}")

    print("First part done")

    # Second round: visit every first-round link once and sort its links into
    # three progressively looser buckets.
    link_call_all = set()    # 'call' and '2024' in the link
    link_call_all_1 = set()  # 'call' and '24' in the link
    link_call_all_2 = set()  # 'call' in the link, no year requirement

    print(f"Second round: scanning {len(link_call)} pages")
    for url in link_call:
        # scrape_links loads the page itself and reports failures by returning
        # an empty list, so no separate driver.get(url) is needed here.
        for link in scrape_links(driver, url):
            if 'call' not in link.lower():  # every bucket requires case-insensitive 'call'
                continue
            link_call_all_2.add(link)
            if '24' in link:
                link_call_all_1.add(link)
            if '2024' in link:
                link_call_all.add(link)

    num_links_2 = len(link_call_all)
    print(f"Number of links in link_call_all: {num_links_2}")
    num_links_3 = len(link_call_all_1)
    print(f"Number of links in link_call_all when 2024 is 24: {num_links_3}")
    num_links_4 = len(link_call_all_2)
    print(f"Number of links in link_call_all when 2024 is not included: {num_links_4}")
finally:
    # Close the WebDriver after scraping is done (or has failed)
    driver.quit()

print('Finish')

{'https://www.computer.org/communities/technical-committees/tcmf', 'https://sigecom.org/', 'http://sigplan.org/', 'https://sigda.org/', 'https://sigcse.org/', 'https://www.kdd.org/', 'https://www.ieee-security.org/', 'https://sigmod.org/', 'http://sigir.org/', 'https://www.aclweb.org/portal/', 'https://sigact.org/', 'http://sigsac.org/', 'https://iclr.cc/', 'https://www.sigarch.org/', 'http://sigmetrics.org/', 'https://www.sighpc.org/', 'https://sigchi.org/', 'https://tc.computer.org/vgtc/', 'https://siggraph.org/', 'http://www.machinelearning.org/', 'https://siglog.org/', 'https://www.usenix.org/', 'https://aaai.org/', 'https://www.sigsoft.org/index.html', 'https://neurips.cc/', 'http://sigai.acm.org/index.html', 'https://sigmobile.org/', 'http://sigcomm.org/', 'https://sigbed.org/', 'https://www.iacr.org/', 'https://www.cv-foundation.org/', 'https://sigops.org/', 'http://www.sigbio.org/', 'https://www.ieee-ras.org/'}
Processing URL: https://www.computer.org/communities/technical-comm

In [4]:
    
# Path to the Chrome WebDriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override the machine-specific default without editing this cell.
webdriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r'C:\Users\Lee Ming Jia\Desktop\driver\chromedriver-win64\chromedriver.exe',
)

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode (without opening browser window)

# Create a new Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(executable_path=webdriver_path), options=chrome_options)

link_call_all_3 = set()

print('link_call_all_3')
# try/finally guarantees the browser process is shut down even if scraping fails.
try:
    # link_call is the set of first-round links built by the previous cell.
    for url in link_call:
        # scrape_links loads the page itself and reports failures by returning
        # an empty list, so no separate driver.get(url) is needed here.
        for link in scrape_links(driver, url):
            if '2024' in link:  # only the year is required here; 'call' is not checked
                link_call_all_3.add(link)
finally:
    # Close the WebDriver after scraping is done (or has failed)
    driver.quit()

num_links_5 = len(link_call_all_3)
print(f"Number of links in link_call_all when call is not included but 2024 is: {num_links_5}")

print('Finish')

link_call_all_3
Number of links in link_call_all when call is not included but 2024 is: 1090
Finish
