### Web Scraping Assessment, by Pranav Vishwanath - scraping author and article details from Semantic Scholar

In [51]:
import requests
import pandas as pd

class Article:
    """Container for the details collected about one article of an author.

    Every instance starts with the author's full name plus a fixed set of
    placeholder attributes that the scraping code fills in afterwards.
    Attributes are created in a fixed order, so ``vars(article)`` always
    yields the same key sequence.
    """

    # (attribute name, initial value) pairs, applied in this order after
    # ``full_name``. All initial values are immutable, so sharing them
    # between instances is safe.
    _DEFAULTS = (
        ("eai_url", None),
        ("link", ""),
        ("pdf_link", ""),
        ("publication_date", None),
        ("data_source", ""),
        ("publication", ""),
        ("title", ""),
        ("eai_match", False),
        ("affiliation", ""),
        ("type", ""),
        ("citations", 0),
    )

    def __init__(self, full_name: str):
        """Create an article record for the author called ``full_name``."""
        self.full_name = full_name
        for attribute, initial_value in self._DEFAULTS:
            setattr(self, attribute, initial_value)

def fetch_author_data(url, params, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        params: Query-string parameters passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook kernel on a stalled connection.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200 and a
        valid JSON body; ``None`` in every failure case (network error,
        non-200 status, undecodable body).
    """
    failure_message = "Failed to fetch data from Semantic Scholar API."

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like an HTTP failure
        # so callers only ever have to handle the ``None`` result.
        print(failure_message)
        print(f"Request error: {exc}")
        return None

    if response.status_code != 200:
        print(failure_message)
        print(f"HTTP status code: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # ``response.json()`` raises a ValueError subclass on a non-JSON body.
        print(failure_message)
        print(f"Response was not valid JSON: {exc}")
        return None

def has_affiliation(author, affiliation):
    """Return True when ``affiliation`` is one of the author's affiliations.

    ``author`` is one author record from the API response. A missing or
    null ``affiliations`` entry is treated as "no affiliations" instead of
    raising.
    """
    return affiliation in (author.get("affiliations") or [])


def process_pdf_link(link_dict):
    """Function that extracts the URL from the PDF Link dictionary.
    Accepts the link_dict as input and returns the URL
    (None when the input is not a dictionary or has no 'url' key)."""
    if isinstance(link_dict, dict):
        return link_dict.get('url', None)
    return None


def process_data_source(data_source_dict):
    """Function that extracts the first key from the data source dictionary.
    Accepts the data_source_dict as input and returns the first key
    (None when the input is not a dictionary or is an empty dictionary)."""
    if isinstance(data_source_dict, dict):
        # The default keeps an empty dictionary from raising StopIteration.
        return next(iter(data_source_dict), None)
    return None


if __name__ == "__main__":
    url = "https://api.semanticscholar.org/graph/v1/author/search"
    query = "Kenneth Church"
    offset = 0
    limit = 490
    fields = "name,affiliations,papers.title,papers.url,papers.publicationDate,papers.citationCount,papers.publicationTypes,papers.openAccessPdf,papers.venue,papers.externalIds"
    params = {
        "query": query,
        "offset": offset,
        "limit": limit,
        "fields": fields
    }
    # Only authors listing exactly this affiliation are kept.
    target_affiliation = "Northeastern University"

    author_data = fetch_author_data(url, params)
    if author_data:
        articles = []
        # "or []" guards against a missing or null list in the response.
        for author in author_data.get("data") or []:
            if not has_affiliation(author, target_affiliation):
                continue
            for paper in author.get("papers") or []:
                if not isinstance(paper, dict):
                    continue
                article = Article(author['name'])
                article.title = paper.get('title', '')
                article.link = paper.get('url')
                article.publication_date = paper.get('publicationDate', '')
                article.affiliation = author.get('affiliations', '')
                article.citations = paper.get('citationCount', 0)
                article.type = paper.get('publicationTypes', '')
                # Raw dictionaries for now; reduced to plain values below.
                article.pdf_link = paper.get('openAccessPdf', '')
                article.publication = paper.get('venue', '')
                article.eai_match = True
                article.data_source = paper.get('externalIds')
                articles.append(article)

        print(f"Total articles found: {len(articles)}")

        # Convert articles list to DataFrame. Passing the column names
        # explicitly keeps the columns present even when no article matched,
        # so the column operations below cannot raise KeyError.
        df = pd.DataFrame(
            [vars(article) for article in articles],
            columns=list(vars(Article(""))),
        )

        # Apply the function to the 'pdf_link' column
        df['pdf_link'] = df['pdf_link'].apply(process_pdf_link)

        # Apply the function to the 'data_source' column
        df['data_source'] = df['data_source'].apply(process_data_source)

        # Export DataFrame to CSV
        filename = f"{query}.csv"
        df.to_csv(filename, index=False)
        print(f"Articles exported to '{filename}'")

    else:
        print("No data found.")


Total articles found: 219
Articles exported to 'Kenneth Church.csv'


In [52]:
# Display the DataFrame of articles assembled in the cell above; as the
# cell's last expression it is rendered by the notebook.
df

Unnamed: 0,full_name,eai_url,link,pdf_link,publication_date,data_source,publication,title,eai_match,affiliation,type,citations
0,Kenneth Ward Church,,https://www.semanticscholar.org/paper/3b6d0eb3...,,2023-08-20,DOI,Interspeech,Improved Contextualized Speech Representations...,True,[Northeastern University],,0
1,Kenneth Ward Church,,https://www.semanticscholar.org/paper/46cc3955...,https://ojs.aaai.org/index.php/ICWSM/article/d...,2023-06-02,DBLP,International Conference on Web and Social Media,An Example of (Too Much) Hyper-Parameter Tunin...,True,[Northeastern University],[JournalArticle],2
2,Kenneth Ward Church,,https://www.semanticscholar.org/paper/9678a516...,https://www.cambridge.org/core/services/aop-ca...,2023-09-01,DBLP,Natural Language Engineering,Emerging trends: Smooth-talking machines,True,[Northeastern University],[JournalArticle],1
3,Kenneth Ward Church,,https://www.semanticscholar.org/paper/0aa43be1...,,,ACL,International Conference on Language Resources...,Training on Lexical Resources,True,[Northeastern University],[JournalArticle],1
4,Kenneth Ward Church,,https://www.semanticscholar.org/paper/3bc07732...,,2022-04-27,DOI,Wireless and Microwave Technology Conference,Advanced Manufacturing and Characterization of...,True,[Northeastern University],[Conference],0
...,...,...,...,...,...,...,...,...,...,...,...,...
214,Kenneth Ward Church,,https://www.semanticscholar.org/paper/bd9629da...,,1979-08-20,MAG,International Joint Conference on Artificial I...,Co-ordinate Square: Solution to Many Chess Paw...,True,[Northeastern University],"[JournalArticle, Conference]",6
215,Kenneth Ward Church,,https://www.semanticscholar.org/paper/0528e7bb...,,,MAG,,Feasibility of applying a computer program to ...,True,[Northeastern University],,0
216,Kenneth Ward Church,,https://www.semanticscholar.org/paper/216525f3...,,,CorpusId,,"Discrimination decisions for l O 0 , O 00-dime...",True,[Northeastern University],,0
217,Kenneth Ward Church,,https://www.semanticscholar.org/paper/ab1dfead...,,,CorpusId,,Practical Procedures for Dimension Reduction i...,True,[Northeastern University],,1


### arXiv PDF iterative search

In [56]:
import urllib, urllib.request
# Exploratory request: print the raw Atom feed arXiv returns for a search.
url = 'http://export.arxiv.org/api/query?search_query=Northeastern'
# The context manager closes the connection once the body has been read, and
# the timeout keeps a stalled server from hanging the kernel indefinitely.
with urllib.request.urlopen(url, timeout=30) as data:
    print(data.read().decode('utf-8'))

<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query%3DNortheastern%26id_list%3D%26start%3D0%26max_results%3D10" rel="self" type="application/atom+xml"/>
  <title type="html">ArXiv Query: search_query=Northeastern&amp;id_list=&amp;start=0&amp;max_results=10</title>
  <id>http://arxiv.org/api/PMZTZwaucAW/yPj6dJ/zs7ted3w</id>
  <updated>2024-03-05T00:00:00-05:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">602</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">10</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/abs/hep-ph/0410007v1</id>
    <updated>2004-10-01T09:52:55Z</updated>
    <published>2004-10-01T09:52:55Z</published>
    <title>MSUGRA Dark Matter and the b Quark Mass</title>


In [60]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook was run with so re-runs are reproducible.
%pip install pdfplumber==0.10.4

Collecting pdfplumber
  Downloading pdfplumber-0.10.4-py3-none-any.whl (54 kB)
Collecting pypdfium2>=4.18.0
  Downloading pypdfium2-4.27.0-py3-none-win_amd64.whl (2.7 MB)
Collecting pdfminer.six==20221105
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
Collecting cryptography>=36.0.0
  Downloading cryptography-42.0.5-cp39-abi3-win_amd64.whl (2.9 MB)
Installing collected packages: cryptography, pypdfium2, pdfminer.six, pdfplumber
  Attempting uninstall: cryptography
    Found existing installation: cryptography 3.4.8
    Uninstalling cryptography-3.4.8:
      Successfully uninstalled cryptography-3.4.8
Successfully installed cryptography-42.0.5 pdfminer.six-20221105 pdfplumber-0.10.4 pypdfium2-4.27.0
Note: you may need to restart the kernel to use updated packages.




In [139]:
import io
import re
import xml.etree.ElementTree as ET

import pdfplumber
import requests

# Define the base URL for arXiv API
base_url = 'http://export.arxiv.org/api/query?'

# Define the author name you want to search for
author_name = 'Jennifer G Dy'

# Define the query parameters.
# NOTE: no max_results is sent, so arXiv returns only its default first page
# of results; the total printed at the end counts that page, not necessarily
# every paper by the author.
query = f'search_query=au:"{author_name}"'

# First-page lines containing this text are printed for each paper
affiliation_marker = "Northeastern University"

# Namespace prefix carried by every element of the Atom feed
atom_ns = '{http://www.w3.org/2005/Atom}'

# Seconds to wait for a server before giving up instead of hanging the kernel
request_timeout = 30

# Make the HTTP request to the arXiv API and fail loudly on an HTTP error
# rather than trying to parse an error page as a feed
response = requests.get(base_url + query, timeout=request_timeout)
response.raise_for_status()

# Parse the XML response
root = ET.fromstring(response.content)

total_papers = 0

# Iterate through each entry in the XML response
for entry in root.findall(f'{atom_ns}entry'):
    # Long titles arrive wrapped over several lines; collapse the whitespace
    title = ' '.join((entry.findtext(f'{atom_ns}title') or '').split())
    authors = [author.findtext(f'{atom_ns}name') for author in entry.findall(f'{atom_ns}author')]

    # The PDF is the first <link> whose title attribute mentions "pdf";
    # None when the entry has no such link
    pdf_link = next(
        (
            link.attrib.get('href')
            for link in entry.findall(f'{atom_ns}link')
            if 'pdf' in link.attrib.get('title', '').lower()
        ),
        None,
    )

    print('Title:', title)
    print('Authors:', authors)
    print('PDF Link:', pdf_link)

    # If PDF link is available, scan the first page for the affiliation text
    if pdf_link:
        try:
            pdf_response = requests.get(pdf_link, timeout=request_timeout)
            pdf_response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable PDF should not abort the whole run
            print('Could not download PDF:', exc)
            first_page_text = ''
        else:
            # Parse the download from memory instead of leaving a temp.pdf
            # file behind in the working directory
            with pdfplumber.open(io.BytesIO(pdf_response.content)) as pdf:
                # Guard against a PDF without pages and against a page
                # for which no text could be extracted
                first_page_text = (pdf.pages[0].extract_text() or '') if pdf.pages else ''

        lines = first_page_text.split('\n')
        for i, line in enumerate(lines):
            if affiliation_marker in line:
                print(i, line)

    print('---')
    total_papers += 1

# Print the total number of papers
print("Total number of papers:", total_papers)



Title: Streaming Adaptive Nonparametric Variational Autoencoder
Authors: ['Tingting Zhao', 'Zifeng Wang', 'Aria Masoomi', 'Jennifer G. Dy']
PDF Link: http://arxiv.org/pdf/1906.03288v2
2 Northeastern University Northeastern University Northeastern University Northeastern University
---
Title: Asymptotic Analysis of Objectives based on Fisher Information in Active
  Learning
Authors: ['Jamshid Sourati', 'Murat Akcakaya', 'Todd K. Leen', 'Deniz Erdogmus', 'Jennifer G. Dy']
PDF Link: http://arxiv.org/pdf/1605.08798v2
29 ∗Department of Electrical and Computer Engineering, Northeastern University, Boston MA. E–mail:
33 §Department of Electrical and Computer Engineering, Northeastern University, Boston MA.. E–mail:
35 ¶Department of Electrical and Computer Engineering, Northeastern University, Boston MA.. E–mail:
---
Title: Iterative Spectral Method for Alternative Clustering
Authors: ['Chieh Wu', 'Stratis Ioannidis', 'Mario Sznaier', 'Xiangyu Li', 'David Kaeli', 'Jennifer G. Dy']
PDF Link: h

In [137]:
# The loop body was missing from this cell (a bare `for` header is a
# SyntaxError). It is reconstructed from the cell's recorded output, which
# shows the index and text of each line mentioning "Northeastern University".
# NOTE(review): confirm this matches the original intent.
for i, line in enumerate(lines):
    if "Northeastern University" in line:
        print(i, line)

2 Northeastern University Northeastern University Boston College
5 Northeastern University Northeastern University Northeastern University


In [141]:
import io
import re
import xml.etree.ElementTree as ET

import pdfplumber
import requests

# Define the base URL for arXiv API
base_url = 'http://export.arxiv.org/api/query?'

# Define the author name you want to search for
author_name = 'Jennifer G Dy'

# Define the query parameters.
# NOTE: no max_results is sent, so arXiv returns only its default first page
# of results; the total printed at the end counts that page, not necessarily
# every paper by the author.
query = f'search_query=au:"{author_name}"'

# Define the desired affiliation
desired_affiliation = "Northeastern University"

# Namespace prefix carried by every element of the Atom feed
atom_ns = '{http://www.w3.org/2005/Atom}'

# Seconds to wait for a server before giving up instead of hanging the kernel
request_timeout = 30

# Pattern for identifying lines that look like affiliations; compiled once
# here instead of once per line inside the loop
affiliation_pattern = re.compile(r'(affiliation|department|university|insti|uni|dept)', re.IGNORECASE)

# Make the HTTP request to the arXiv API and fail loudly on an HTTP error
# rather than trying to parse an error page as a feed
response = requests.get(base_url + query, timeout=request_timeout)
response.raise_for_status()

# Parse the XML response
root = ET.fromstring(response.content)

# Initialize counter for the number of papers
total_papers = 0

# Iterate through each entry in the XML response
for entry in root.findall(f'{atom_ns}entry'):
    # Long titles arrive wrapped over several lines; collapse the whitespace
    title = ' '.join((entry.findtext(f'{atom_ns}title') or '').split())
    authors = [author.findtext(f'{atom_ns}name') for author in entry.findall(f'{atom_ns}author')]

    # The PDF is the first <link> whose title attribute mentions "pdf";
    # None when the entry has no such link
    pdf_link = next(
        (
            link.attrib.get('href')
            for link in entry.findall(f'{atom_ns}link')
            if 'pdf' in link.attrib.get('title', '').lower()
        ),
        None,
    )

    print('Title:', title)
    print('Authors:', authors)
    print('PDF Link:', pdf_link)

    # If PDF link is available, extract potential affiliations from the first page
    if pdf_link:
        try:
            pdf_response = requests.get(pdf_link, timeout=request_timeout)
            pdf_response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable PDF should not abort the whole run
            print('Could not download PDF:', exc)
            first_page_text = ''
        else:
            # Parse the download from memory instead of leaving a temp.pdf
            # file behind in the working directory
            with pdfplumber.open(io.BytesIO(pdf_response.content)) as pdf:
                # Guard against a PDF without pages and against a page
                # for which no text could be extracted
                first_page_text = (pdf.pages[0].extract_text() or '') if pdf.pages else ''

        lines = first_page_text.split('\n')

        # Every matching line contributes itself and the line after it.
        # Collecting indices in a set keeps a line from being listed twice
        # when two neighbouring lines both match the pattern.
        candidate_indices = sorted({
            j
            for i, line in enumerate(lines)
            if affiliation_pattern.search(line)
            for j in (i, i + 1)
            if j < len(lines)
        })
        potential_affiliations = [lines[j] for j in candidate_indices]

        print('Potential Affiliations:')
        for affiliation in potential_affiliations:
            if desired_affiliation in affiliation:
                # The first-page text does not tell which author a line
                # belongs to, so the line is reported against every author
                # of the paper; treat each pairing as a candidate only.
                for author in authors:
                    print(f'Author: {author} - Affiliation: {affiliation}')
    print('---')
    total_papers += 1

# Print the total number of papers
print("Total number of papers:", total_papers)


Title: Streaming Adaptive Nonparametric Variational Autoencoder
Authors: ['Tingting Zhao', 'Zifeng Wang', 'Aria Masoomi', 'Jennifer G. Dy']
PDF Link: http://arxiv.org/pdf/1906.03288v2
Potential Affiliations:
Author: Tingting Zhao - Affiliation: Northeastern University Northeastern University Northeastern University Northeastern University
Author: Zifeng Wang - Affiliation: Northeastern University Northeastern University Northeastern University Northeastern University
Author: Aria Masoomi - Affiliation: Northeastern University Northeastern University Northeastern University Northeastern University
Author: Jennifer G. Dy - Affiliation: Northeastern University Northeastern University Northeastern University Northeastern University
---
Title: Asymptotic Analysis of Objectives based on Fisher Information in Active
  Learning
Authors: ['Jamshid Sourati', 'Murat Akcakaya', 'Todd K. Leen', 'Deniz Erdogmus', 'Jennifer G. Dy']
PDF Link: http://arxiv.org/pdf/1605.08798v2
Potential Affiliations:
A