In [9]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import time
# from gsc_crawler import get_google_scholar_url


In [8]:
!pip install -r /Users/paniz/Documents/GitHub/SE390/ReSearch/requirements.txt

Collecting openai==1.47.0 (from -r /Users/paniz/Documents/GitHub/SE390/ReSearch/requirements.txt (line 2))
  Using cached openai-1.47.0-py3-none-any.whl.metadata (24 kB)
Collecting pandas==2.2.3 (from -r /Users/paniz/Documents/GitHub/SE390/ReSearch/requirements.txt (line 3))
  Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting PyYAML==6.0.2 (from -r /Users/paniz/Documents/GitHub/SE390/ReSearch/requirements.txt (line 4))
  Using cached PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting requests==2.32.3 (from -r /Users/paniz/Documents/GitHub/SE390/ReSearch/requirements.txt (line 5))
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting scholarly==1.7.11 (from -r /Users/paniz/Documents/GitHub/SE390/ReSearch/requirements.txt (line 6))
  Using cached scholarly-1.7.11-py3-none-any.whl.metadata (7.4 kB)
Collecting wordcloud==1.9.3 (from -r /Users/paniz/Documents/GitHub/SE390/ReSearch/requirements.txt (line 7)

In [10]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import time

from gsc_crawler import get_google_scholar_url

# Function to crawl profile data from the award profile URL
def profile_crawler(name, profile_url):
    response = requests.get(profile_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    last_name, first_name = name.split(", ")
    full_name = soup.find('h1').text.strip()  # Ensure no trailing spaces

    awards_info = soup.find_all('section', {'class': 'awards-winners__citation'})
    acm_award = next((award for award in awards_info if award.find('h2').a.text == 'ACM A. M. Turing Award'), None)

    if acm_award:
        location, year = acm_award.find('h3', {'class': 'awards-winners__location'}).text.split(' - ')
        citation = ' '.join(acm_award.find('p', {'class': "awards-winners__citation-short"}).text.split('\n')).strip()
    else:
        location, year, citation = '', '', ''

    # Extract Google Scholar data
    gsc_data = get_google_scholar_url(full_name)
    if not gsc_data and len(full_name.split()) >= 3:
        first_last_name = f'{full_name.split()[0]} {full_name.split()[-1]}'
        gsc_data = get_google_scholar_url(first_last_name)
    
    if gsc_data:
        gsc_url = f'https://scholar.google.com/citations?user={gsc_data["scholar_id"]}'
        affiliation = gsc_data.get("affiliation", "")
        interests = " ".join(gsc_data.get('interests', []))
    else:
        gsc_url, affiliation, interests = '', '', '[]'

    return [last_name, first_name, year, location, citation, profile_url, gsc_url, affiliation, interests]


# Scraping ACM Turing Award page
url = 'https://awards.acm.org/turing/award-recipients'
session = requests.Session()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the table and check if it exists
table = soup.find('table', class_='awards-tables--fullWidth')
if not table:
    print("Table not found!")
    exit(1)

# Extract table headers (optional step)
headers = [th.text.strip() for th in table.find('thead').find_all('th')]

# Extract table rows
rows = []
for tr in table.find('tbody').find_all('tr'):
    # Extract Name
    name_element = tr.find_all('td')[0].find('a')
    recipient_name = name_element.text.strip()
    profile_url = f'https://awards.acm.org{name_element["href"]}'
    
    # Extract Award
    award = tr.find_all('td')[1].text.strip()
    
    # Extract Year
    year = tr.find_all('td')[2].text.strip()
    
    # Extract Region
    region = tr.find_all('td')[3].text.strip()
    
    # Extract DL Link (if available)
    dl_element = tr.find_all('td')[4].find('a')
    dl_link = dl_element['href'] if dl_element else 'N/A'
    
    rows.append([recipient_name, award, year, region, dl_link])

# Print the extracted data for verification
print("Headers:", headers)
for row in rows:
    print("Row:", row)

# Sort rows by year (column index 2)
rows.sort(key=lambda row: int(row[2]), reverse=True)

# Handling file and checkpoint for resuming
it = 0
checkpoint = 'last_iteration.txt'
fileName = 'acm_turings.csv'
fileExist = os.path.isfile(fileName) and os.path.isfile(checkpoint)

with open(fileName, 'a' if fileExist else 'w', newline='') as file:
    writer = csv.writer(file)
    # Write the header row if the file is new
    if not fileExist:
        writer.writerow(['Index', 'Last Name', 'Given Name', 'Year', 'Region', 'Award', 'Profile URL', 'Digital Library Link', 'Google Scholar Profile', 'Affiliation', 'Interests'])
    else:
        with open(checkpoint, 'r') as f:
            index = int(f.readline().split(':')[-1])
            rows = rows[index:]
            it = index
    
    for row in rows:
        try:
            # Separate last name and first name from the recipient name
            name = row[0]
            profile_url = row[4]  # Profile URL is the last column in the row

            # Clean the name to remove non-ASCII characters
            name_clean = ''.join([i if ord(i) < 128 else ' ' for i in name])
            
            # Crawl profile data
            data = profile_crawler(name_clean, profile_url)
            it += 1

            data.insert(0, it)  # Add index at the start
            writer.writerow(data)

            if it % 20 == 0:
                print(f"Finished {it} iterations...")
            time.sleep(1)

        except KeyboardInterrupt:
            print("Process interrupted manually.")
            with open(checkpoint, 'w') as f:
                f.write(f'Last completed iteration: {it}')
            break

        except Exception as e:
            print(f"Exception occurred: {e}")
            with open(checkpoint, 'w') as f:
                f.write(f'Failed at iteration: {it}')
            break


Headers: ['Name', 'Award', 'Year', 'Region', 'DL']
Row: ['Milner,\xa0A\xa0J', 'ACM A. M. Turing Award', '1991', 'Europe', 'https://dl.acm.org/author_page.cfm?id=81332515695']
Row: ['Perlis,\xa0A.\xa0J.', 'ACM A. M. Turing Award', '1966', 'North America', 'https://dl.acm.org/author_page.cfm?id=81100086771']
Row: ['Shamir,\xa0Adi', 'ACM A. M. Turing Award', '2002', 'Europe', 'https://dl.acm.org/author_page.cfm?id=81100081898']
Row: ['Kay,\xa0Alan', 'ACM A. M. Turing Award', '2003', 'North America', 'https://dl.acm.org/author_page.cfm?id=81100544599']
Row: ['Aho,\xa0Alfred\xa0V', 'ACM A. M. Turing Award', '2020', 'North America', 'https://dl.acm.org/author_page.cfm?id=81100024612']
Row: ['Newell,\xa0Allen', 'ACM A. M. Turing Award', '1975', 'North America', 'https://dl.acm.org/author_page.cfm?id=81100393604']
Row: ['Pnueli,\xa0Amir', 'ACM A. M. Turing Award', '1996', 'North America', 'https://dl.acm.org/author_page.cfm?id=81100648459']
Row: ['Yao,\xa0Andrew\xa0C', 'ACM A. M. Turing Award'

In [14]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
import random
import logging

# User-Agent list for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
]

# Setup logging
logging.basicConfig(filename='scraper.log', level=logging.INFO, format='%(asctime)s - %(message)s')

# Optional proxies
PROXIES = [
    # Example proxy format
    # 'http://user:password@proxyserver:port',
    # Add your proxies here or leave it as an empty list for no proxy usage
]

# Function to get a random User-Agent
def get_random_user_agent():
    return random.choice(USER_AGENTS)

# Function to get a random proxy (optional)
def get_random_proxy():
    if PROXIES:
        return {"http": random.choice(PROXIES), "https": random.choice(PROXIES)}
    return None

# Function to scrape a single researcher's additional details
def scrape_dl_profile(dl_url):
    profile_info = {}
    try:
        headers = {'User-Agent': get_random_user_agent()}
        proxy = get_random_proxy()
        dl_response = requests.get(dl_url, headers=headers, proxies=proxy, timeout=10)
        
        if dl_response.status_code == 200:
            dl_soup = BeautifulSoup(dl_response.content, 'html.parser')
            
            # Find all elements with classes containing double underscores
            double_underscore_elements = dl_soup.find_all(class_=re.compile(r'\w+__\w+'))
            
            # Collecting text from these elements
            double_underscore_text = [element.get_text(strip=True) for element in double_underscore_elements]
            
            # Example fields to store:
            profile_info['double_underscore_content'] = " | ".join(double_underscore_text) if double_underscore_text else 'N/A'
            
        else:
            logging.error(f"Failed to fetch DL profile: {dl_url}")
    except Exception as e:
        logging.error(f"Error scraping DL profile {dl_url}: {e}")
    
    return profile_info

# Function to scrape the ACM award recipients page
def scrape_acm_award_recipients():
    url = "https://awards.acm.org/award-recipients"
    headers = {'User-Agent': get_random_user_agent()}
    proxy = get_random_proxy()

    try:
        response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    except Exception as e:
        logging.error(f"Failed to fetch the ACM page: {e}")
        return

    if response.status_code != 200:
        logging.error("Failed to retrieve the ACM awards page.")
        return
    
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Try to find the table using a more generic approach
    table = soup.find('table')
    
    if table is None:
        logging.error("No table found on the page. Check the structure or class name.")
        return
    
    # Now try to find all rows in the table
    rows = table.find_all('tr')[1:]  # Skip the header row
    
    if not rows:
        logging.error("No rows found in the table. Verify the page content.")
        return
    
    # List to store scraped data
    recipients_data = []
    
    for idx, row in enumerate(rows):
        cols = row.find_all('td')
        name = cols[0].text.strip()
        award = cols[1].text.strip()
        year = cols[2].text.strip()
        region = cols[3].text.strip()
        dl_link = cols[4].find('a')['href'] if cols[4].find('a') else None
        
        if dl_link:
            dl_url = f"https://dl.acm.org{dl_link}" if dl_link.startswith('/') else dl_link
        else:
            dl_url = 'N/A'

        # Scrape additional details from the DL profile
        profile_details = scrape_dl_profile(dl_url) if dl_url != 'N/A' else {'double_underscore_content': 'N/A'}
        
        # Append all the data together
        recipient = {
            'name': name,
            'award': award,
            'year': year,
            'region': region,
            'dl_profile': dl_url,
            **profile_details
        }
        recipients_data.append(recipient)
        
        # Logging progress
        logging.info(f"Processed {idx+1}/{len(rows)}: {name} ({dl_url})")

        # Progress output
        print(f"Progress: {idx+1}/{len(rows)} | Scraped {name} - {dl_url}")
        
        # Wait before making the next request to avoid blocking
        time.sleep(random.uniform(1, 3))  # Wait 1-3 seconds randomly between requests
    
    # Write data to CSV
    with open('acm_award_recipients.csv', 'w', newline='') as csvfile:
        fieldnames = ['name', 'award', 'year', 'region', 'dl_profile', 'double_underscore_content']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()
        for data in recipients_data:
            writer.writerow(data)
    
    logging.info("Data saved to acm_award_recipients.csv")
    print("Data saved to acm_award_recipients.csv")

if __name__ == "__main__":
    scrape_acm_award_recipients()


Progress: 1/1393 | Scraped Milner, A J - https://dl.acm.org/author_page.cfm?id=81332515695
Progress: 2/1393 | Scraped Sreejith, A V - https://dl.acm.org/author_page.cfm?id=81479663157
Progress: 3/1393 | Scraped Malossi, A. Cristiano I. - N/A
Progress: 4/1393 | Scraped Perlis, A. J. - https://dl.acm.org/author_page.cfm?id=81100086771
Progress: 5/1393 | Scraped Turner, A. Joe - https://dl.acm.org/author_page.cfm?id=81408600192


KeyboardInterrupt: 

In [5]:
import requests
import random
import time

# User-Agent list for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
]

# Function to get a random User-Agent
def get_random_user_agent():
    return random.choice(USER_AGENTS)

# Function to fetch and save HTML page to local file
def fetch_and_save_html(url, save_as):
    headers = {'User-Agent': get_random_user_agent()}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            with open(save_as, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"HTML content saved as {save_as}")
        else:
            print(f"Failed to fetch the page: {url}")
    except requests.exceptions.Timeout:
        print(f"Request timed out for {url}")
    except Exception as e:
        print(f"Error fetching the page: {e}")

# Function to fetch and save DL profiles for each recipient
def scrape_and_save_html():
    url = "https://awards.acm.org/award-recipients"
    fetch_and_save_html(url, "acm_award_recipients.html")  # Save the main page locally

    # Simulating the DL profile URLs for demo purposes
    dl_profiles = [
        "https://dl.acm.org/author_page.cfm?id=81332515695"
       
    ]
    
    for dl_url in dl_profiles:
        # Save each DL profile HTML as a local file
        profile_id = dl_url.split("=")[-1]
        save_as = f"profile_{profile_id}.html"
        fetch_and_save_html(dl_url, save_as)

        # Sleep between requests to avoid overwhelming the server
        time.sleep(random.uniform(3, 6))

if __name__ == "__main__":
    scrape_and_save_html()  # Fetch and save the pages locally


HTML content saved as acm_award_recipients.html
HTML content saved as profile_81332515695.html


In [6]:
from bs4 import BeautifulSoup

# Function to load and parse a local HTML file
def parse_local_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # Example: Extracting URLs from the saved file
    for link in soup.find_all('a', href=True):
        print(f"  Found URL: {link['href']}")

# Function to parse local award recipient page and DL profile pages
def parse_saved_html():
    # Parse the award recipient page
    print("Parsing ACM award recipients page...")
    parse_local_html("acm_award_recipients.html")

    # Simulating saved DL profiles
    dl_profile_files = ["profile_81332515695.html"]
    
    for profile_file in dl_profile_files:
        print(f"\nParsing DL profile: {profile_file}")
        parse_local_html(profile_file)

if __name__ == "__main__":
    parse_saved_html()  # Parse the saved HTML files


Parsing ACM award recipients page...
  Found URL: #SkipTarget
  Found URL: https://www.acm.org
  Found URL: http://amturing.acm.org
  Found URL: http://www.acm.org/turing-award-50
  Found URL: http://dl.acm.org
  Found URL: http://cacm.acm.org
  Found URL: http://queue.acm.org
  Found URL: http://technews.acm.org
  Found URL: https://www.acm.org
  Found URL: 
  Found URL: /
  Found URL: /award-recipients
  Found URL: /contact-us
  Found URL: #
  Found URL: /
  Found URL: /award-recipients
  Found URL: /contact-us
  Found URL: #
  Found URL: /
  Found URL: /about/turing-laureates-spotlight
  Found URL: /turing
  Found URL: /acm-prize
  Found URL: /thacker
  Found URL: /allen
  Found URL: /barroso
  Found URL: /distinguished-service
  Found URL: /doctoral-dissertation
  Found URL: /eckert-mauchly
  Found URL: /hopper
  Found URL: /bell
  Found URL: /bell-climate
  Found URL: international-science-and-engineering-fair
  Found URL: /kanellakis
  Found URL: /karlstrom
  Found URL: /kennedy


In [10]:
from bs4 import BeautifulSoup

def extract_author_profile(html_file):
    with open(html_file, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    
    author_data = {}

    # Extracting Author's Name
    author_name_tag = soup.find('h1', class_='title')
    if author_name_tag:
        author_data['Name'] = author_name_tag.text.strip()

    # Extracting Bibliometrics
    bibliometrics = {}
    bibliometrics_section = soup.find('div', class_='bibliometrics equal-height-slides')
    if bibliometrics_section:
        metrics = bibliometrics_section.find_all('div', class_='slide-item')
        for metric in metrics:
            title = metric.find('div', class_='bibliometrics__title').text.strip()
            value = metric.find('div', class_='bibliometrics__count').text.strip()
            bibliometrics[title] = value
    author_data['Bibliometrics'] = bibliometrics
    
    # Extracting Author's Image URL
    image_tag = soup.find('img', alt=True, class_='image-lazy-loaded')
    if image_tag:
        author_data['Image_URL'] = image_tag['src']
    
    # Extracting Co-Authors and Affiliations
    co_authors = []
    co_author_section = soup.find_all('div', class_='colored-block shadow contrib-metrics__multi-items')
    for co_author in co_author_section:
        co_author_data = {}
        title_tag = co_author.find('h5')
        if title_tag and title_tag.text.strip() in ['Most frequent co-Author', 'Most cited colleague', 'Most frequent Affiliation']:
            co_author_data['Title'] = title_tag.text.strip()

            author_tag = co_author.find('div', class_='box-item')
            if author_tag:
                co_author_data['Details'] = author_tag.get_text(separator=' ').strip()

            co_authors.append(co_author_data)

    author_data['Co_Authors'] = co_authors
    
    # Extracting Top Subjects and Keywords
    # top_subjects = []
    # subject_section = soup.find_all('div', class_='colored-block shadow')
    # for subject_block in subject_section:
    #     title_tag = subject_block.find('h3', class_='title-header')
    #     if title_tag and title_tag.text.strip() == 'Top subject':
    #         top_subjects.append(subject_block.find('div', class_='top-rated-text').text.strip())

    # author_data['Top_Subjects'] = top_subjects
    
    # Extracting Keywords
    keywords = []
    keyword_section = soup.find('div', class_='colored-block__content')
    if keyword_section:
        keyword_tags = keyword_section.find_all('div', class_='tag-cloud')
        for keyword_tag in keyword_tags:
            keyword_text = keyword_tag.get_text(separator=' ').strip()
            keywords.append(keyword_text)
    author_data['Keywords'] = keywords

    # Extracting Publications
    publications = []
    pub_list_section = soup.find_all('li', class_='grid-item')
    for pub in pub_list_section:
        pub_data = {}
        title_tag = pub.find('h3', class_='issue-item__title')
        if title_tag:
            pub_data['Title'] = title_tag.text.strip()

        date_tag = pub.find('div', class_='issue-item__detail')
        if date_tag:
            pub_data['Details'] = date_tag.text.strip()
        
        doi_tag = pub.find('a', class_='issue-item__doi')
        if doi_tag:
            pub_data['DOI'] = doi_tag.text.strip()

        publications.append(pub_data)
    
    author_data['Publications'] = publications

    return author_data


# Usage Example
html_file = 'profile_81332515695.html'  # Replace with the actual file path
author_profile_data = extract_author_profile(html_file)

# Print the extracted data
print("Author Profile Data:")
print(author_profile_data)


Author Profile Data:
{'Name': 'Robin Milner', 'Bibliometrics': {'Average Citation per Article': '95', 'Citation count': '9,201', 'Publication counts': '97', 'Publication Years': '1971 - 2013', 'Available for Download': '19', 'Average Downloads per Article': '1,706', 'Downloads (6 weeks)': '403', 'Downloads (12 months)': '9,237', 'Downloads (cumulative)': '32,421'}, 'Image_URL': '/do/10.1145/contrib-81332515695/full/81332515695-1588898507063.jpg', 'Co_Authors': [], 'Keywords': [], 'Publications': [{'Title': 'An inductive characterization of matching in binding bigraphs', 'Details': 'March 2013Formal Aspects of Computing, Volume 25, Issue 2https://doi.org/10.1007/s00165-011-0184-5', 'DOI': 'https://doi.org/10.1007/s00165-011-0184-5'}, {'Title': 'Bigraphical Categories', 'Details': 'September 2009CONCUR 2009: Proceedings of the 20th International Conference on Concurrency Theoryhttps://doi.org/10.1007/978-3-642-04081-8_3', 'DOI': 'https://doi.org/10.1007/978-3-642-04081-8_3'}, {'Title': '

In [12]:
import requests
from bs4 import BeautifulSoup
import re
import random
import time
import pandas as pd

# User-Agent list for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
]

# Function to get a random User-Agent
def get_random_user_agent():
    return random.choice(USER_AGENTS)

# Function to extract author profile details from DL profile page
def extract_author_profile(dl_url):
    try:
        headers = {'User-Agent': get_random_user_agent()}
        print(f"Fetching DL profile: {dl_url}")
        dl_response = requests.get(dl_url, headers=headers, timeout=10)
        time.sleep(random.uniform(2, 4))

        if dl_response.status_code == 200:
            soup = BeautifulSoup(dl_response.content, 'html.parser')
            
            author_data = {}
            
            # Extracting Author's Name
            author_name_tag = soup.find('h1', class_='title')
            if author_name_tag:
                author_data['Name'] = author_name_tag.text.strip()

            # Extracting Bibliometrics
            bibliometrics = {}
            bibliometrics_section = soup.find('div', class_='bibliometrics equal-height-slides')
            if bibliometrics_section:
                metrics = bibliometrics_section.find_all('div', class_='slide-item')
                for metric in metrics:
                    title = metric.find('div', class_='bibliometrics__title').text.strip()
                    value = metric.find('div', class_='bibliometrics__count').text.strip()
                    bibliometrics[title] = value
            author_data['Bibliometrics'] = bibliometrics
            
            # Extracting Author's Image URL
            image_tag = soup.find('img', alt=True, class_='image-lazy-loaded')
            if image_tag:
                author_data['Image_URL'] = image_tag['src']
            
            # Extracting Co-Authors and Affiliations
            co_authors = []
            co_author_section = soup.find_all('div', class_='colored-block shadow contrib-metrics__multi-items')
            for co_author in co_author_section:
                co_author_data = {}
                title_tag = co_author.find('h5')
                if title_tag and title_tag.text.strip() in ['Most frequent co-Author', 'Most cited colleague', 'Most frequent Affiliation']:
                    co_author_data['Title'] = title_tag.text.strip()
                    author_tag = co_author.find('div', class_='box-item')
                    if author_tag:
                        co_author_data['Details'] = author_tag.get_text(separator=' ').strip()
                    co_authors.append(co_author_data)
            author_data['Co_Authors'] = co_authors
            
            # Extracting Keywords
            keywords = []
            keyword_section = soup.find('div', class_='colored-block__content')
            if keyword_section:
                keyword_tags = keyword_section.find_all('div', class_='tag-cloud')
                for keyword_tag in keyword_tags:
                    keyword_text = keyword_tag.get_text(separator=' ').strip()
                    keywords.append(keyword_text)
            author_data['Keywords'] = keywords

            # Extracting Publications
            publications = []
            pub_list_section = soup.find_all('li', class_='grid-item')
            for pub in pub_list_section:
                pub_data = {}
                title_tag = pub.find('h3', class_='issue-item__title')
                if title_tag:
                    pub_data['Title'] = title_tag.text.strip()

                date_tag = pub.find('div', class_='issue-item__detail')
                if date_tag:
                    pub_data['Details'] = date_tag.text.strip()
                
                doi_tag = pub.find('a', class_='issue-item__doi')
                if doi_tag:
                    pub_data['DOI'] = doi_tag.text.strip()

                publications.append(pub_data)
            
            author_data['Publications'] = publications
            
            return author_data
        else:
            print(f"Failed to fetch DL profile: {dl_url}")
            return None
    except requests.exceptions.Timeout:
        print(f"Request timed out for {dl_url}")
        return None
    except Exception as e:
        print(f"Error scraping DL profile {dl_url}: {e}")
        return None

# Function to scrape ACM award recipients and extract DL profile data
def scrape_and_explore_dl_links_and_save():
    url = "https://awards.acm.org/award-recipients"
    headers = {'User-Agent': get_random_user_agent()}
    
    # Send request to the ACM award page
    try:
        print(f"Fetching ACM award page: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        time.sleep(random.uniform(2, 5))  # Random sleep between 2-5 seconds
        
    except requests.exceptions.Timeout:
        print("Request timed out for ACM page")
        return
    except Exception as e:
        print(f"Failed to fetch the ACM page: {e}")
        return

    if response.status_code != 200:
        print("Failed to retrieve the ACM awards page.")
        return
    
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    table = soup.find('table')
    
    if table is None:
        print("No table found on the page.")
        return
    
    rows = table.find_all('tr')[1:]  # Skip the header row
    
    if not rows:
        print("No rows found in the table.")
        return
    
    rows_to_scrape = rows[:3]  # Limit to first 3 recipients for testing
    
    all_profiles = []
    
    for idx, row in enumerate(rows_to_scrape):
        cols = row.find_all('td')
        name = cols[0].text.strip()
        award = cols[1].text.strip()
        year = cols[2].text.strip()
        region = cols[3].text.strip()
        dl_link = cols[4].find('a')['href'] if cols[4].find('a') else None
        
        if dl_link:
            dl_url = f"https://dl.acm.org{dl_link}" if dl_link.startswith('/') else dl_link
        else:
            dl_url = 'N/A'

        # Print the data for verification
        print(f"\nRecipient {idx+1}:")
        print(f"  Name: {name}")
        print(f"  Award: {award}")
        print(f"  Year: {year}")
        print(f"  Region: {region}")
        print(f"  DL Profile: {dl_url}")
        
        # Explore and extract DL profile for specific information
        if dl_url != 'N/A':
            author_profile = extract_author_profile(dl_url)
            if author_profile:
                author_profile['Award'] = award
                author_profile['Year'] = year
                author_profile['Region'] = region
                all_profiles.append(author_profile)

        time.sleep(random.uniform(3, 6))  # Sleep between rows

    # Save to CSV using pandas
    save_profiles_to_csv(all_profiles, 'acm_author_profiles.csv')

# Save the data to a CSV file
def save_profiles_to_csv(profiles, output_file):
    flat_profiles = []
    
    for profile in profiles:
        flat_profile = {
            'Name': profile.get('Name'),
            'Award': profile.get('Award'),
            'Year': profile.get('Year'),
            'Region': profile.get('Region'),
            'Bibliometrics': str(profile.get('Bibliometrics', {})),  # Convert dict to string
            'Image_URL': profile.get('Image_URL'),
            'Co_Authors': ', '.join([co_author['Details'] for co_author in profile.get('Co_Authors', [])]),
            'Keywords': ', '.join(profile.get('Keywords', [])),
            'Publications': ', '.join([pub['Title'] for pub in profile.get('Publications', [])])
        }
        flat_profiles.append(flat_profile)
    
    df = pd.DataFrame(flat_profiles)
    df.to_csv(output_file, index=False)
    print(f"Saved data to {output_file}")

if __name__ == "__main__":
    scrape_and_explore_dl_links_and_save()


Fetching ACM award page: https://awards.acm.org/award-recipients

Recipient 1:
  Name: Milner, A J
  Award: ACM A. M. Turing Award
  Year: 1991
  Region: Europe
  DL Profile: https://dl.acm.org/author_page.cfm?id=81332515695
Fetching DL profile: https://dl.acm.org/author_page.cfm?id=81332515695

Recipient 2:
  Name: Sreejith, A V
  Award: ACM India Doctoral Dissertation Award
  Year: 2014
  Region: Asia
  DL Profile: https://dl.acm.org/author_page.cfm?id=81479663157
Fetching DL profile: https://dl.acm.org/author_page.cfm?id=81479663157

Recipient 3:
  Name: Malossi, A. Cristiano I.
  Award: ACM Gordon Bell Prize
  Year: 2015
  Region: Europe
  DL Profile: N/A
Saved data to acm_author_profiles.csv


In [15]:
import requests
from bs4 import BeautifulSoup
import re
import random
import time
import pandas as pd

# User-Agent list for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
]

# Function to get a random User-Agent
def get_random_user_agent():
    return random.choice(USER_AGENTS)

# Function to extract author profile details from DL profile page
def extract_author_profile(dl_url):
    try:
        headers = {'User-Agent': get_random_user_agent()}
        print(f"Fetching DL profile: {dl_url}")
        dl_response = requests.get(dl_url, headers=headers, timeout=10)
        time.sleep(random.uniform(2, 4))

        if dl_response.status_code == 200:
            soup = BeautifulSoup(dl_response.content, 'html.parser')
            
            author_data = {}
            
            # Extracting Author's Name
            author_name_tag = soup.find('h1', class_='title')
            if author_name_tag:
                author_data['Name'] = author_name_tag.text.strip()

            # Extracting Bibliometrics
            bibliometrics = {}
            bibliometrics_section = soup.find('div', class_='bibliometrics equal-height-slides')
            if bibliometrics_section:
                metrics = bibliometrics_section.find_all('div', class_='slide-item')
                for metric in metrics:
                    title = metric.find('div', class_='bibliometrics__title').text.strip()
                    value = metric.find('div', class_='bibliometrics__count').text.strip()
                    bibliometrics[title] = value
            author_data['Bibliometrics'] = bibliometrics
            
            # Extracting Author's Image URL
            image_tag = soup.find('img', alt=True, class_='image-lazy-loaded')
            if image_tag:
                author_data['Image_URL'] = image_tag['src']
            
            # Extracting Co-Authors and Affiliations
            co_authors = []
            co_author_section = soup.find_all('div', class_='colored-block shadow contrib-metrics__multi-items')
            for co_author in co_author_section:
                co_author_data = {}
                title_tag = co_author.find('h5')
                if title_tag and title_tag.text.strip() in ['Most frequent co-Author', 'Most cited colleague', 'Most frequent Affiliation']:
                    co_author_data['Title'] = title_tag.text.strip()
                    author_tag = co_author.find('div', class_='box-item')
                    if author_tag:
                        co_author_data['Details'] = author_tag.get_text(separator=' ').strip()
                    co_authors.append(co_author_data)
            author_data['Co_Authors'] = co_authors
            
            # Extracting Keywords
            keywords = []
            keyword_section = soup.find('div', class_='colored-block__content')
            if keyword_section:
                keyword_tags = keyword_section.find_all('div', class_='tag-cloud')
                for keyword_tag in keyword_tags:
                    keyword_text = keyword_tag.get_text(separator=' ').strip()
                    keywords.append(keyword_text)
            author_data['Keywords'] = keywords

            # Extracting Publications
            publications = []
            pub_list_section = soup.find_all('li', class_='grid-item')
            for pub in pub_list_section:
                pub_data = {}
                title_tag = pub.find('h3', class_='issue-item__title')
                if title_tag:
                    pub_data['Title'] = title_tag.text.strip()

                date_tag = pub.find('div', class_='issue-item__detail')
                if date_tag:
                    pub_data['Details'] = date_tag.text.strip()
                
                doi_tag = pub.find('a', class_='issue-item__doi')
                if doi_tag:
                    pub_data['DOI'] = doi_tag.text.strip()

                publications.append(pub_data)
            
            author_data['Publications'] = publications
            
            return author_data
        else:
            print(f"Failed to fetch DL profile: {dl_url}")
            return None
    except requests.exceptions.Timeout:
        print(f"Request timed out for {dl_url}")
        return None
    except Exception as e:
        print(f"Error scraping DL profile {dl_url}: {e}")
        return None

# Function to scrape ACM award recipients and extract DL profile data
def scrape_and_explore_dl_links_and_save():
    url = "https://awards.acm.org/award-recipients"
    headers = {'User-Agent': get_random_user_agent()}
    
    # Send request to the ACM award page
    try:
        print(f"Fetching ACM award page: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        time.sleep(random.uniform(2, 5))  # Random sleep between 2-5 seconds
        
    except requests.exceptions.Timeout:
        print("Request timed out for ACM page")
        return
    except Exception as e:
        print(f"Failed to fetch the ACM page: {e}")
        return

    if response.status_code != 200:
        print("Failed to retrieve the ACM awards page.")
        return
    
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    table = soup.find('table')
    
    if table is None:
        print("No table found on the page.")
        return
    
    rows = table.find_all('tr')[1:]  # Skip the header row
    
    if not rows:
        print("No rows found in the table.")
        return
    
    rows_to_scrape = rows[:3]  # Limit to first 3 recipients for testing
    
    all_profiles = []
    
    for idx, row in enumerate(rows_to_scrape):
        cols = row.find_all('td')
        name = cols[0].text.strip()
        award = cols[1].text.strip()
        year = cols[2].text.strip()
        region = cols[3].text.strip()
        dl_link = cols[4].find('a')['href'] if cols[4].find('a') else None
        
        if dl_link:
            dl_url = f"https://dl.acm.org{dl_link}" if dl_link.startswith('/') else dl_link
        else:
            dl_url = 'N/A'

        # Print the data for verification
        print(f"\nRecipient {idx+1}:")
        print(f"  Name: {name}")
        print(f"  Award: {award}")
        print(f"  Year: {year}")
        print(f"  Region: {region}")
        print(f"  DL Profile: {dl_url}")
        
        # Explore and extract DL profile for specific information
        if dl_url != 'N/A':
            author_profile = extract_author_profile(dl_url)
            if author_profile:
                author_profile['Award'] = award
                author_profile['Year'] = year
                author_profile['Region'] = region
                all_profiles.append(author_profile)

        time.sleep(random.uniform(3, 6))  # Sleep between rows

    # Save to CSV using pandas
    save_profiles_to_csv(all_profiles, 'acm_author_profiles.csv')

# Save the data to a CSV file
def save_profiles_to_csv(profiles, output_file):
    flat_profiles = []
    
    for profile in profiles:
        flat_profile = {
            'Name': profile.get('Name'),
            'Award': profile.get('Award'),
            'Year': profile.get('Year'),
            'Region': profile.get('Region'),
            # Flatten the bibliometrics
            'Average Citation per Article': profile['Bibliometrics'].get('Average Citation per Article', 'N/A'),
            'Citation Count': profile['Bibliometrics'].get('Citation count', 'N/A'),
            'Publication Count': profile['Bibliometrics'].get('Publication counts', 'N/A'),
            'Publication Years': profile['Bibliometrics'].get('Publication Years', 'N/A'),
            'Downloads (12 months)': profile['Bibliometrics'].get('Downloads (12 months)', 'N/A'),
            'Image_URL': profile.get('Image_URL'),
            'Co_Authors': ', '.join([co_author['Details'] for co_author in profile.get('Co_Authors', [])]),
            'Keywords': ', '.join(profile.get('Keywords', [])),
            'Publications': ', '.join([pub['Title'] for pub in profile.get('Publications', [])])
        }
        flat_profiles.append(flat_profile)
    
    df = pd.DataFrame(flat_profiles)
    df.to_csv(output_file, index=False)
    print(f"Saved data to {output_file}")

if __name__ == "__main__":
    scrape_and_explore_dl_links_and_save()


Fetching ACM award page: https://awards.acm.org/award-recipients

Recipient 1:
  Name: Milner, A J
  Award: ACM A. M. Turing Award
  Year: 1991
  Region: Europe
  DL Profile: https://dl.acm.org/author_page.cfm?id=81332515695
Fetching DL profile: https://dl.acm.org/author_page.cfm?id=81332515695

Recipient 2:
  Name: Sreejith, A V
  Award: ACM India Doctoral Dissertation Award
  Year: 2014
  Region: Asia
  DL Profile: https://dl.acm.org/author_page.cfm?id=81479663157
Fetching DL profile: https://dl.acm.org/author_page.cfm?id=81479663157

Recipient 3:
  Name: Malossi, A. Cristiano I.
  Award: ACM Gordon Bell Prize
  Year: 2015
  Region: Europe
  DL Profile: N/A
Saved data to acm_author_profiles.csv


In [15]:
import os
import json
import requests
import time
from bs4 import BeautifulSoup
import random

# User-Agent list for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
]

# Function to get a random User-Agent
def get_random_user_agent():
    return random.choice(USER_AGENTS)

# Function to scrape the ACM award recipients and gather their DL profile URLs
def scrape_acm_award_recipients():
    url = "https://awards.acm.org/award-recipients"
    headers = {'User-Agent': get_random_user_agent()}
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to retrieve ACM awards page. Status code: {response.status_code}")
            return None
        
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table')
        if not table:
            print("No table found on the page.")
            return None

        rows = table.find_all('tr')[1:]  # Skip the header row
        recipient_urls = []
        
        for row in rows:
            cols = row.find_all('td')
            dl_link = cols[4].find('a')['href'] if cols[4].find('a') else None
            
            if dl_link:
                dl_url = f"https://dl.acm.org{dl_link}" if dl_link.startswith('/') else dl_link
                recipient_urls.append(dl_url)

        return recipient_urls
    except Exception as e:
        print(f"An error occurred while scraping ACM recipients: {e}")
        return None

# Function to extract author profile from DL profile URL
def extract_author_profile_from_url(dl_url):
    try:
        headers = {'User-Agent': get_random_user_agent()}
        response = requests.get(dl_url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to retrieve profile page for {dl_url}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        return extract_author_profile_from_soup(soup)
    except Exception as e:
        print(f"Error fetching profile for {dl_url}: {e}")
        return None

# Function to extract author profile details from the soup object
def extract_author_profile_from_soup(soup):
    author_data = {}

    # Extracting Author's Name
    author_name_tag = soup.find('h1', class_='title')
    if author_name_tag:
        author_data['Name'] = author_name_tag.text.strip()

    # Extracting Bibliometrics
    bibliometrics = {}
    bibliometrics_section = soup.find('div', class_='bibliometrics equal-height-slides')
    if bibliometrics_section:
        metrics = bibliometrics_section.find_all('div', class_='slide-item')
        for metric in metrics:
            title = metric.find('div', class_='bibliometrics__title').text.strip()
            value = metric.find('div', class_='bibliometrics__count').text.strip()
            bibliometrics[title] = value
    author_data['Bibliometrics'] = bibliometrics
    
    # Extracting Author's Image URL
    image_tag = soup.find('img', alt=True, class_='image-lazy-loaded')
    if image_tag:
        author_data['Image_URL'] = image_tag['src']
    
    # Extracting Co-Authors and Affiliations
    co_authors = []
    co_author_section = soup.find_all('div', class_='colored-block shadow contrib-metrics__multi-items')
    for co_author in co_author_section:
        co_author_data = {}
        title_tag = co_author.find('h5')
        if title_tag and title_tag.text.strip() in ['Most frequent co-Author', 'Most cited colleague', 'Most frequent Affiliation']:
            co_author_data['Title'] = title_tag.text.strip()

            author_tag = co_author.find('div', class_='box-item')
            if author_tag:
                co_author_data['Details'] = author_tag.get_text(separator=' ').strip()

            co_authors.append(co_author_data)

    author_data['Co_Authors'] = co_authors

    # Extracting Keywords
    keywords = []
    keyword_section = soup.find('div', class_='colored-block__content')
    if keyword_section:
        keyword_tags = keyword_section.find_all('div', class_='tag-cloud')
        for keyword_tag in keyword_tags:
            keyword_text = keyword_tag.get_text(separator=' ').strip()
            keywords.append(keyword_text)
    author_data['Keywords'] = keywords

    # Extracting Publications
    publications = []
    pub_list_section = soup.find_all('li', class_='grid-item')
    for pub in pub_list_section:
        pub_data = {}
        title_tag = pub.find('h3', class_='issue-item__title')
        if title_tag:
            pub_data['Title'] = title_tag.text.strip()

        date_tag = pub.find('div', class_='issue-item__detail')
        if date_tag:
            pub_data['Details'] = date_tag.text.strip()
        
        doi_tag = pub.find('a', class_='issue-item__doi')
        if doi_tag:
            pub_data['DOI'] = doi_tag.text.strip()

        publications.append(pub_data)
    
    author_data['Publications'] = publications

    return author_data

# Main function to extract profiles for all recipients
def extract_all_acm_profiles():
    recipient_urls = scrape_acm_award_recipients()
    if not recipient_urls:
        print("No recipients found.")
        return

    all_profiles = []
    for url in recipient_urls:
        print(f"Extracting profile from: {url}")
        profile = extract_author_profile_from_url(url)
        if profile:
            all_profiles.append(profile)
        time.sleep(random.uniform(2, 5))  # Avoid overloading the server with requests

    return all_profiles

# Save the profiles to a JSON file
def save_profiles_to_json(profiles, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(profiles, f, indent=4, ensure_ascii=False)
    print(f"Profiles saved to {output_file}")

if __name__ == "__main__":
    profiles = extract_all_acm_profiles()
    if profiles:
        save_profiles_to_json(profiles, 'acm_recipient_profiles.json')


Extracting profile from: https://dl.acm.org/author_page.cfm?id=81332515695
Extracting profile from: https://dl.acm.org/author_page.cfm?id=81479663157
Extracting profile from: https://dl.acm.org/author_page.cfm?id=81100086771
Extracting profile from: https://dl.acm.org/author_page.cfm?id=81408600192
Extracting profile from: https://dl.acm.org/author_page.cfm?id=81100604913
Extracting profile from: https://dl.acm.org/author_page.cfm?id=81758701057


KeyboardInterrupt: 

In [21]:
import os
import json
import requests
import time
import random
from bs4 import BeautifulSoup


In [22]:
# User-Agent list for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
]

# Retry configuration
MAX_RETRIES = 5
RETRY_BACKOFF = 2

In [23]:
# Function to get a random User-Agent
def get_random_user_agent():
    return random.choice(USER_AGENTS)

In [24]:

# Retry function with backoff strategy
def retry_request(url, headers, max_retries=MAX_RETRIES, timeout=10):
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response
            else:
                print(f"Error: {response.status_code} - Retrying...")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e} - Retrying...")
        retries += 1
        time.sleep(RETRY_BACKOFF * retries)  # Exponential backoff
    return None


In [29]:
# Get Google Scholar profile data with retries
def get_google_scholar_url(full_name):
    base_url = f"https://api.scholarlydata.com/get_profile?name={full_name}"
    headers = {'User-Agent': get_random_user_agent()}
    response = retry_request(base_url, headers)
    
    if response:
        try:
            data = response.json()
            if 'scholar_id' in data:
                return data
            else:
                return {}
        except json.JSONDecodeError:
            print("Failed to parse the JSON response.")
            return {}
    else:
        print(f"Failed to retrieve Google Scholar data for {full_name}")
        return {}


In [30]:
# Function to scrape ACM award recipients
def scrape_acm_award_recipients():
    url = "https://awards.acm.org/award-recipients"
    headers = {'User-Agent': get_random_user_agent()}
    response = retry_request(url, headers)
    
    if not response:
        return None
    
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if not table:
        print("No table found on the page.")
        return None
    
    rows = table.find_all('tr')[1:]  # Skip the header row
    recipient_urls = []
    recepient_data = []
    selected_range = 3  # Limit to first 3 recipients for testing
    for row in rows[:selected_range]:
        cols = row.find_all('td')
        name = cols[0].text.strip()
        award = cols[1].text.strip()
        year = cols[2].text.strip()
        region = cols[3].text.strip()
        dl_link = cols[4].find('a')['href'] if cols[4].find('a') else None

        # save the corresponding details 
        recipient_profile = {
            'Name': name,
            'Award': award,
            'Year': year,
            'Region': region,
            'DL_Link': dl_link
        }
        
        recepient_data.append(recipient_profile)

        if dl_link:
            dl_url = f"https://dl.acm.org{dl_link}" if dl_link.startswith('/') else dl_link
            recipient_urls.append(dl_url)

    return recipient_urls, recepient_data

In [5]:

# Extract author profile from URL with retries
def extract_author_profile_from_url(dl_url):
    headers = {'User-Agent': get_random_user_agent()}
    response = retry_request(dl_url, headers)
    
    if not response:
        print(f"Failed to retrieve profile page for {dl_url}")
        return None
    
    soup = BeautifulSoup(response.content, 'html.parser')
    return extract_author_profile_from_soup(soup)


In [6]:
# Function to extract author profile from DL profile URL
def extract_author_profile_from_url(dl_url):
    try:
        headers = {'User-Agent': get_random_user_agent()}
        response = requests.get(dl_url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to retrieve profile page for {dl_url}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        return extract_author_profile_from_soup(soup)
    except Exception as e:
        print(f"Error fetching profile for {dl_url}: {e}")
        return None

# Function to extract author profile details from the soup object
def extract_author_profile_from_soup(soup):
    author_data = {}

    # Extracting Author's Name
    author_name_tag = soup.find('h1', class_='title')
    if author_name_tag:
        author_data['Name'] = author_name_tag.text.strip()

    # Extracting Bibliometrics
    bibliometrics = {}
    bibliometrics_section = soup.find('div', class_='bibliometrics equal-height-slides')
    if bibliometrics_section:
        metrics = bibliometrics_section.find_all('div', class_='slide-item')
        for metric in metrics:
            title = metric.find('div', class_='bibliometrics__title').text.strip()
            value = metric.find('div', class_='bibliometrics__count').text.strip()
            bibliometrics[title] = value
    author_data['Bibliometrics'] = bibliometrics
    
    # Extracting Author's Image URL
    image_tag = soup.find('img', alt=True, class_='image-lazy-loaded')
    if image_tag:
        author_data['Image_URL'] = image_tag['src']
    
    # Extracting Co-Authors and Affiliations
    co_authors = []
    co_author_section = soup.find_all('div', class_='colored-block shadow contrib-metrics__multi-items')
    for co_author in co_author_section:
        co_author_data = {}
        title_tag = co_author.find('h5')
        if title_tag and title_tag.text.strip() in ['Most frequent co-Author', 'Most cited colleague', 'Most frequent Affiliation']:
            co_author_data['Title'] = title_tag.text.strip()

            author_tag = co_author.find('div', class_='box-item')
            if author_tag:
                co_author_data['Details'] = author_tag.get_text(separator=' ').strip()

            co_authors.append(co_author_data)

    author_data['Co_Authors'] = co_authors

    # Extracting Keywords
    keywords = []
    keyword_section = soup.find('div', class_='colored-block__content')
    if keyword_section:
        keyword_tags = keyword_section.find_all('div', class_='tag-cloud')
        for keyword_tag in keyword_tags:
            keyword_text = keyword_tag.get_text(separator=' ').strip()
            keywords.append(keyword_text)
    author_data['Keywords'] = keywords

    # Extracting Publications
    publications = []
    pub_list_section = soup.find_all('li', class_='grid-item')
    for pub in pub_list_section:
        pub_data = {}
        title_tag = pub.find('h3', class_='issue-item__title')
        if title_tag:
            pub_data['Title'] = title_tag.text.strip()

        date_tag = pub.find('div', class_='issue-item__detail')
        if date_tag:
            pub_data['Details'] = date_tag.text.strip()
        
        doi_tag = pub.find('a', class_='issue-item__doi')
        if doi_tag:
            pub_data['DOI'] = doi_tag.text.strip()

        publications.append(pub_data)
    
    author_data['Publications'] = publications

    return author_data

In [39]:

# Save profiles to JSON (save in batches to avoid data loss)
def save_profiles_to_json(profiles, output_file):
    if not profiles:
        print("No profiles to save.")
        return
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(profiles, f, indent=4, ensure_ascii=False)
    print(f"Profiles saved to {output_file}")


In [70]:
import json
from bs4 import BeautifulSoup

def get_random_user_agent():
    # Function to get a random user agent (placeholder for real user agent rotation logic)
    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

def retry_request(url, headers):
    # Placeholder for the retry_request function, which should handle retries and backoff
    import requests
    return requests.get(url, headers=headers)

def extract_author_profile():
    url = "https://dl.acm.org/author_page.cfm?id=81332515695"
    headers = {'User-Agent': get_random_user_agent()}
    response = retry_request(url, headers)
    
    html_content = response.content

    # Parsing the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the div with class 'tag-cloud'
    tag_cloud_div = soup.find('div', class_='tag-cloud')

    # Find the parent of parent and extract h4 text
    if tag_cloud_div:
        parent_of_parent = tag_cloud_div.find_parent().find_parent()
        h4_text = parent_of_parent.find('h4').text.strip() if parent_of_parent.find('h4') else "No h4 found"

    # Extract the JSON-like data from the 'data-tags' attribute
    keywords = []
    if tag_cloud_div and tag_cloud_div.has_attr('data-tags'):
        data_tags = tag_cloud_div['data-tags']
        data_tags = data_tags.replace('&quot;', '"')  # Convert HTML entities to normal characters
        
        # Parse the JSON string into Python objects
        try:
            tags_data = json.loads(data_tags)

            # Extract relevant information (term, label, count)
            for tag in tags_data:
                keyword_info = {
                    'term': tag.get('term'),
                    'label': tag.get('label'),
                    'count': tag.get('count'),
                    'link': tag.get('link')
                }
                keywords.append(keyword_info)

        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
    else:
        print("No tag cloud data found.")
    
    # Return a dictionary mapping the h4 text to the list of keywords
    return {h4_text: keywords}

# Run the extraction and print the results
result = extract_author_profile()
print(result)


{'Subject Areas': [{'term': '111', 'label': 'Logic', 'count': 16, 'link': None}, {'term': '460', 'label': 'Program semantics', 'count': 12, 'link': None}, {'term': '108', 'label': 'Models of computation', 'count': 7, 'link': None}, {'term': '1199', 'label': 'Parallel computing models', 'count': 7, 'link': None}, {'term': '115', 'label': 'Semantics and reasoning', 'count': 7, 'link': None}, {'term': '1719', 'label': 'Semantics', 'count': 6, 'link': None}, {'term': '109', 'label': 'Formal languages and automata theory', 'count': 5, 'link': None}, {'term': '770', 'label': 'Lambda calculus', 'count': 5, 'link': None}, {'term': '397', 'label': 'Concurrency', 'count': 4, 'link': None}, {'term': '774', 'label': 'Lambda calculus', 'count': 4, 'link': None}, {'term': '680', 'label': 'Formal language definitions', 'count': 3, 'link': None}, {'term': '1716', 'label': 'Language types', 'count': 3, 'link': None}, {'term': '68', 'label': 'Document types', 'count': 2, 'link': None}, {'term': '714', '

In [73]:
import json
from bs4 import BeautifulSoup

def get_random_user_agent():
    # Function to get a random user agent (placeholder for real user agent rotation logic)
    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

def retry_request(url, headers):
    # Placeholder for the retry_request function, which should handle retries and backoff
    import requests
    return requests.get(url, headers=headers)

def extract_author_profile():
    url = "https://dl.acm.org/author_page.cfm?id=81332515695"
    headers = {'User-Agent': get_random_user_agent()}
    response = retry_request(url, headers)
    
    html_content = response.content

    # Parsing the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all div elements with class 'tag-cloud'
    tag_cloud_divs = soup.find_all('div', class_='tag-cloud')
    h4_to_keywords_map = {}

    bar_count = soup.find_all('svg', class_='d3-bar-chart')  

    data_chart_data = []

    if bar_count.has_attr("data-chart-data"):
        data_chart_data = bar_count['data-chart-data']
        data_chart_data = data_chart_data.replace('&quot;', '"')
        try:
            chart_data = json.loads(data_chart_data)
            for data in chart_data:
                data_chart_data.append(data)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")

    data_contrib = soup.find_all('div', class_="contrib-metrics__multi-items ajax-done")
    if data_contrib.has_attr("data-component"):
        data_component = data_contrib['data-component']
        data_component = data_component.replace('&quot;', '"')
        try:
            component_data = json.loads(data_component)
            for data in component_data:
                data_contrib.append(data)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
    


    # Iterate over each 'tag-cloud' div
    for tag_cloud_div in tag_cloud_divs:
        # Find the grandparent of the 'tag-cloud' div and extract h4 text
        parent_of_parent = tag_cloud_div.find_parent().find_parent()
        h4_text = parent_of_parent.find('h4').text.strip() if parent_of_parent.find('h4') else "No h4 found"

        # Extract the JSON-like data from the 'data-tags' attribute
        keywords = []
        if tag_cloud_div and tag_cloud_div.has_attr('data-tags'):
            data_tags = tag_cloud_div['data-tags']
            data_tags = data_tags.replace('&quot;', '"')  # Convert HTML entities to normal characters

            # Parse the JSON string into Python objects
            try:
                tags_data = json.loads(data_tags)

                # Extract relevant information (term, label, count)
                for tag in tags_data:
                    keyword_info = {
                        'label': tag.get('label'),
                        'count': tag.get('count'),
                    }
                    keywords.append(keyword_info)

            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON: {e}")
        else:
            print("No tag cloud data found.")

        # Map the h4 text to the list of keywords
        h4_to_keywords_map[h4_text] = keywords

    # Return the complete dictionary mapping h4 text to keywords
    return h4_to_keywords_map

# Run the extraction and print the results
result = extract_author_profile()
# save to json gifle 
with open('tags.json', 'w') as f:
    json.dump(result, f, indent=4)


In [2]:
import json
import requests
from bs4 import BeautifulSoup

def get_random_user_agent():
    # Function to get a random user agent (placeholder for real user agent rotation logic)
    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

def retry_request(url, headers):
    # Placeholder for retry_request logic that retries on failure
    return requests.get(url, headers=headers)

def extract_author_profile():
    url = "https://dl.acm.org/author_page.cfm?id=81332515695"
    headers = {'User-Agent': get_random_user_agent()}
    response = retry_request(url, headers)
    
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Dictionary to hold all extracted data
    author_data = {}

    # Extract keywords from the 'tag-cloud' divs
    tag_cloud_divs = soup.find_all('div', class_='tag-cloud')
    h4_to_keywords_map = {}

    for tag_cloud_div in tag_cloud_divs:
        parent_of_parent = tag_cloud_div.find_parent().find_parent()
        h4_text = parent_of_parent.find('h4').text.strip() if parent_of_parent.find('h4') else "No h4 found"

        keywords = []
        if tag_cloud_div and tag_cloud_div.has_attr('data-tags'):
            data_tags = tag_cloud_div['data-tags'].replace('&quot;', '"')  # Convert HTML entities

            try:
                tags_data = json.loads(data_tags)
                for tag in tags_data:
                    keyword_info = {
                        'label': tag.get('label'),
                        'count': tag.get('count'),
                    }
                    keywords.append(keyword_info)
            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON in tag-cloud data: {e}")
        
        h4_to_keywords_map[h4_text] = keywords
    
    author_data['Keywords'] = h4_to_keywords_map

    # Extract chart data from 'd3-bar-chart' SVG elements
    bar_charts = soup.find_all('svg', class_='d3-bar-chart')
    chart_data_list = []
    
    for bar_chart in bar_charts:
        if bar_chart.has_attr('data-chart-data'):
            chart_data_raw = bar_chart['data-chart-data'].replace('&quot;', '"')
            try:
                chart_data = json.loads(chart_data_raw)
                chart_data_list.append(chart_data)
            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON in chart data: {e}")
    
    author_data['Chart_Data'] = chart_data_list

    # Extract contribution data from 'contrib-metrics__multi-items' divs
    collab_colleagues = soup.find_all('div', class_='box-item__item')
    contrib_data = []
    for c in collab_colleagues:


        # Extract the colleague's name from 'title' div
        name_tag = c.find('div', class_='title')
        if name_tag and name_tag.find('a'):
            contrib_data['Name'] = name_tag.find('a').text.strip()
        
        # Extract the paper count from 'box-item__count'
        count_tag = c.find('div', class_='box-item__count')
        if count_tag:
            paper_count = count_tag.text.strip().split()[0]  # Extracts the number part only
            contrib_data['Paper Count'] = paper_count
        
        # Append the extracted data for this colleague to the list
        author_data['Collaborators'].append(contrib_data)
    
    # Add the colleague collaboration data to the final output
    author_data['Collaborators'] = collab_colleagues


    return author_data

# Run the extraction and save the results to JSON
result = extract_author_profile()

# Save to JSON file
with open('author_profile.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=4, ensure_ascii=False)

print("Data successfully extracted and saved to 'author_profile.json'")


KeyError: 'Collaborators'

In [89]:
import json
from bs4 import BeautifulSoup
import requests

def get_random_user_agent():
    # Function to get a random user agent
    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

def retry_request(url, headers):
    # Placeholder for retry_request logic that retries on failure
    return requests.get(url, headers=headers)

def extract_author_profile():
    url = "https://dl.acm.org/author_page.cfm?id=81332515695"
    headers = {'User-Agent': get_random_user_agent()}
    response = retry_request(url, headers)
    
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Dictionary to hold all extracted data
    author_data = {}

    # Extract keywords from the 'tag-cloud' divs
    tag_cloud_divs = soup.find_all('div', class_='tag-cloud')
    h4_to_keywords_map = {}

    for tag_cloud_div in tag_cloud_divs:
        parent_of_parent = tag_cloud_div.find_parent().find_parent()
        h4_text = parent_of_parent.find('h4').text.strip() if parent_of_parent.find('h4') else "No h4 found"

        keywords = []
        if tag_cloud_div and tag_cloud_div.has_attr('data-tags'):
            data_tags = tag_cloud_div['data-tags'].replace('&quot;', '"')  # Convert HTML entities

            try:
                tags_data = json.loads(data_tags)
                for tag in tags_data:
                    keyword_info = {
                        'label': tag.get('label'),
                        'count': tag.get('count'),
                    }
                    keywords.append(keyword_info)
            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON in tag-cloud data: {e}")
        
        h4_to_keywords_map[h4_text] = keywords
    
    author_data['Keywords'] = h4_to_keywords_map

    # Extract chart data from 'd3-bar-chart' SVG elements
    bar_charts = soup.find_all('svg', class_='d3-bar-chart')
    chart_data_list = []
    
    for bar_chart in bar_charts:
        if bar_chart.has_attr('data-chart-data'):
            chart_data_raw = bar_chart['data-chart-data'].replace('&quot;', '"')
            try:
                chart_data = json.loads(chart_data_raw)
                chart_data_list.append(chart_data)
            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON in chart data: {e}")
    
    author_data['Chart_Data'] = chart_data_list

    # Extract contribution data from 'contrib-metrics__multi-items' divs
    collab_colleagues = soup.find('div', 'data-component='contrib-metrics__multi-items')
    print(collab_colleagues)
    author_data['Collaborators'] = []  # Initialize the list for collaborators

    for c in collab_colleagues:
        contrib_data = {}  # Create a new dictionary for each colleague
        print(c)
        # Extract the colleague's name from 'title' div
        content_tag = c.find('div', class_='creative-work__content list')
        print("CONTENT TAG")
        print(content_tag)
        
        # Extract the paper count from 'box-item__count'
        count_tag = c.find('div', class_='box-item__count')
        if count_tag:
            paper_count = count_tag.text.strip().split()[0]  # Extract the number part only
            contrib_data['Paper Count'] = paper_count
        
        # Append the extracted data for this colleague to the list
        author_data['Collaborators'].append(contrib_data)

    return author_data

# Run the extraction and print the results
result = extract_author_profile()

# Save to JSON file
with open('author_profile.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=4, ensure_ascii=False)

print("Data successfully extracted and saved to 'author_profile.json'")


None


TypeError: 'NoneType' object is not iterable