# Web scraping the CoNLL research papers from https://aclanthology.org/venues/conll/

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
def download_pdf(pdf_link, title, year, timeout=30):
    """Download one PDF into a folder named after its year.

    The file is saved as ``<year>/<title>.pdf`` relative to the current
    working directory; ``title`` is used verbatim as the file name.

    Args:
        pdf_link: URL of the PDF. ``None`` or an empty string is treated as
            "nothing to download".
        title: Base name for the saved file (without the ``.pdf`` suffix).
        year: Year used as the name of the destination folder.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved PDF, or ``None`` if there was no link or the
        download/write failed (the error is printed, not raised).
    """
    if not pdf_link:
        print("Error downloading PDF: no PDF link available")
        return None

    target_dir = str(year)
    pdf_path = os.path.join(target_dir, f"{title}.pdf")

    try:
        # The context manager releases the streamed connection even when
        # the download is interrupted part-way through.
        with requests.get(pdf_link, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # Create the per-year folder only once the request succeeded
            os.makedirs(target_dir, exist_ok=True)

            with open(pdf_path, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        pdf_file.write(chunk)
    except (requests.RequestException, OSError) as e:
        # Best effort: report the failure and let the caller carry on
        print(f"Error downloading PDF: {e}")
        return None

    return pdf_path

def scrape_conll_papers(base_url, start_year, end_year, timeout=30):
    """Scrape paper metadata from the yearly listing pages under ``base_url``.

    For every year in ``start_year..end_year`` (inclusive) the page at
    ``f"{base_url}{year}/"`` is fetched and each
    ``<p class="d-sm-flex align-items-stretch">`` element is read as one
    paper entry. Entries without a title link are skipped so the four
    returned lists always stay the same length.

    Args:
        base_url: URL prefix to which the year and a trailing slash are added.
        start_year: First year to scrape.
        end_year: Last year to scrape (inclusive).
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A tuple ``(pdf_links, titles, authors, years)`` of parallel lists.
        ``pdf_links`` entries are ``None`` when no PDF badge link was found,
        ``titles`` have characters that are invalid in file names removed,
        and ``authors`` entries are comma-joined author names.
    """
    # Initialize lists to store data
    pdf_links = []
    titles = []
    authors = []
    years = []

    # Titles are later used as file names, so drop characters that are
    # not allowed in file names.
    strip_invalid = str.maketrans('', '', '\\/:*?"<>|')

    # Iterate through the years
    for year in range(start_year, end_year + 1):
        # Construct the URL with the current year
        url = f"{base_url}{year}/"

        # A network failure for one year must not abort the whole scrape
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data from {url}. Error: {exc}")
            response = None

        if response is not None and response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all elements with the specified structure
            paper_elements = soup.find_all('p', class_='d-sm-flex align-items-stretch')

            # Iterate through each paper element and extract information
            for paper_element in paper_elements:
                # Extract title; skip entries that have none, since the
                # title is needed to name the downloaded file
                strong_element = paper_element.find('strong')
                title_element = (
                    strong_element.find('a') if strong_element is not None else None
                )
                if title_element is None:
                    continue
                title = title_element.text.strip().translate(strip_invalid)

                # Extract PDF link (None when the badge or its href is missing)
                pdf_link_element = paper_element.find('a', class_='badge-primary')
                pdf_link = (
                    pdf_link_element.get('href') if pdf_link_element is not None else None
                )

                # Extract authors
                author_elements = paper_element.find_all(
                    'a', href=lambda x: x and '/people/' in x
                )
                author_names = [author.text.strip() for author in author_elements]

                # Append all four columns together so they stay aligned
                pdf_links.append(pdf_link)
                titles.append(title)
                authors.append(','.join(author_names))
                years.append(year)
        elif response is not None:
            print(f"Failed to retrieve data from {url}. Status Code: {response.status_code}")

        # Add a delay before the next iteration
        time.sleep(1)  # You can adjust the delay as needed

    return pdf_links, titles, authors, years

# Example usage
conll_base_url = 'https://aclanthology.org/events/conll-'
start_year = 2013
end_year = 2022
pdf_links, titles, authors, years = scrape_conll_papers(conll_base_url, start_year, end_year)

# Assemble the scraped columns into a DataFrame (one row per paper)
data = {
    "Title": titles,
    "year": years,
    "Authors": authors,
    "PDF Link": pdf_links,
}

df = pd.DataFrame(data)

# Download each paper and record where it was saved. download_pdf returns
# None when a download fails, so those rows get an empty "PDF Path" cell.
pdf_paths = []
for pdf_link, title, year in zip(pdf_links, titles, years):
    pdf_paths.append(download_pdf(pdf_link, title, year))
df["PDF Path"] = pdf_paths

# Save the DataFrame to a CSV file
df.to_csv("conll_papers_years.csv", index=False)

print("Data collected and saved to 'conll_papers_years.csv'.")


Data collected and saved to 'conll_papers_years.csv'.
