### Web Scraping Assessment, by Pranav Vishwanath — scraping author and article details from Semantic Scholar (via its Graph API)

In [238]:
import requests
import csv
import numpy as np

class Article:
    """Container for one paper attributed to an author, as exported to CSV.

    An instance is created from the author's full name only. Every other
    attribute starts at an empty placeholder and is expected to be assigned
    by the caller after construction.
    """

    def __init__(self, full_name: str):
        # Author identity.
        self.full_name = full_name
        self.affiliation = ""

        # Where the paper can be found.
        self.eai_url = None
        self.link = ""
        self.pdf_link = ""

        # Publication metadata.
        self.title = ""
        self.type = ""
        self.publication = ""
        self.publication_date = None
        self.data_source = ""

        # Match flag and citation count start at "no match" / zero.
        self.eai_match = False
        self.citations = 0

def fetch_author_data(url, params, timeout=30):
    """Fetch author data from the Semantic Scholar author API.

    Args:
        url: Endpoint to send the GET request to.
        params: Query-string parameters, passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body when the server answers with HTTP 200 and the
        body is valid JSON; otherwise ``None`` (a failure message is printed).
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, DNS failures and timeouts follow the same
        # "print and return None" contract as a non-200 answer.
        print(f"Failed to fetch data from Semantic Scholar API. ({exc})")
        return None

    if response.status_code != 200:
        print("Failed to fetch data from Semantic Scholar API.")
        return None

    try:
        return response.json()
    except ValueError:
        # requests signals an undecodable body with a ValueError subclass.
        print("Failed to fetch data from Semantic Scholar API.")
        return None

def save_to_csv(articles, filename):
    """Write the given articles to ``filename`` as a UTF-8 CSV file.

    A header row is written first, followed by one row per article in the
    order given. The file is opened in write mode, so existing content at
    ``filename`` is replaced. Nothing is returned.
    """
    # (column header, article attribute) pairs, in output order.
    columns = (
        ("Full Name", "full_name"),
        ("EAI URL", "eai_url"),
        ("Link", "link"),
        ("PDF Link", "pdf_link"),
        ("Publication Date", "publication_date"),
        ("Data Source", "data_source"),
        ("Publication", "publication"),
        ("Title", "title"),
        ("EAI Match", "eai_match"),
        ("Affiliation", "affiliation"),
        ("Type", "type"),
        ("Citations", "citations"),
    )
    header = [label for label, _ in columns]

    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows(
            [getattr(item, attr) for _, attr in columns] for item in articles
        )

if __name__ == "__main__":
    # Query the Semantic Scholar author-search endpoint for one author name,
    # keep the papers of every matching author affiliated with Northeastern
    # University, and export them to "<query>.csv".
    url = "https://api.semanticscholar.org/graph/v1/author/search"
    query = "Kenneth Church"  # sample author name to search for
    offset = 0
    limit = 490  # page size; original note says the API caps results at 490 — TODO confirm
    # Author fields plus the nested paper fields copied onto each Article below.
    fields = "name,affiliations,papers.title,papers.url,papers.publicationDate,papers.citationCount,papers.publicationTypes,papers.openAccessPdf,papers.venue"
    params = {
        "query": query,
        "offset": offset,
        "limit": limit,
        "fields": fields
    }
    author_data = fetch_author_data(url, params)
    if author_data:  # None (failed request) or an empty payload falls through to the else branch
        articles = []
        # A payload without a "data" list is treated as "no authors" instead of raising KeyError.
        for author in author_data.get("data") or []:
            # .get returns None when the field is missing or null; normalise so
            # the membership test below never runs against None.
            affiliations = author.get("affiliations") or []
            if "Northeastern University" not in affiliations:
                continue
            # Same guard for the paper list.
            for paper in author.get("papers") or []:
                if not isinstance(paper, dict):
                    continue
                article = Article(author['name'])
                article.title = paper.get('title', '')
                article.link = paper.get('url')
                article.publication_date = paper.get('publicationDate', '')
                article.affiliation = affiliations
                article.citations = paper.get('citationCount', 0)
                article.type = paper.get('publicationTypes', '')
                # Stored as returned by the API; the later 'PDF Link' cell
                # extracts the URL from its text form in the CSV.
                article.pdf_link = paper.get('openAccessPdf', '')
                article.publication = paper.get('venue', '')
                # Every article reaching this point passed the affiliation filter.
                article.eai_match = True
                articles.append(article)

        print(f"Total articles found: {len(articles)}")
        filename = f"{query}.csv"
        save_to_csv(articles, filename)
        print(f"Articles exported to '{filename}'")

    else:
        print("No data found.")


Total articles found: 219
Articles exported to 'Kenneth Church.csv'


In [239]:
# pandas is imported in this cell because no earlier visible cell brings `pd`
# into scope; without it a fresh kernel fails here with NameError.
import pandas as pd

# Reload the exported CSV for post-processing.
df = pd.read_csv("Kenneth Church.csv")

In [242]:
# Helper that turns the exported 'PDF Link' text into a plain URL.
import ast

import numpy as np


def process_pdf_link(row, separator=' '):
    """Extract the PDF URL from a row's 'PDF Link' cell.

    The cell is expected to hold the text form of a dict such as
    ``{'url': 'https://...', 'status': '...'}``. That text is parsed as a
    Python literal and its ``'url'`` entry is returned, so the result has no
    leftover quotes or trailing comma.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a 'PDF Link' key.
        separator: Delimiter used only by the fallback for text that is not a
            dict literal; the second token is returned in that case.

    Returns:
        The URL string, or ``np.nan`` when the cell is missing, is not a
        string, or contains no usable URL.
    """
    link = row['PDF Link']
    # Missing cells arrive as float NaN; any other non-string has no URL either.
    if not isinstance(link, str):
        return np.nan

    text = link.strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, dict):
        # An empty or absent 'url' entry counts as "no link".
        return parsed.get('url') or np.nan

    # A bare URL is passed through so that re-running the cell on an already
    # processed column does not wipe the links.
    if text.startswith(('http://', 'https://')) and separator not in text:
        return text

    # Fallback for any other text: second token after splitting.
    link_parts = link.split(separator)
    return link_parts[1] if len(link_parts) > 1 else np.nan

# Row-wise pass (axis=1): each row is handed to process_pdf_link and the value
# it returns replaces that row's 'PDF Link' cell. No lambda is involved.
df['PDF Link'] = df.apply(process_pdf_link, axis=1) #overwrites the 'PDF Link' column in place

In [243]:
# Display the frame after the 'PDF Link' column has been processed.
df

Unnamed: 0,Full Name,EAI URL,Link,PDF Link,Publication Date,Data Source,Publication,Title,EAI Match,Affiliation,Type,Citations
0,Kenneth Ward Church,,https://www.semanticscholar.org/paper/3b6d0eb3...,,2023-08-20,,Interspeech,Improved Contextualized Speech Representations...,True,['Northeastern University'],,0
1,Kenneth Ward Church,,https://www.semanticscholar.org/paper/46cc3955...,,2023-06-02,,International Conference on Web and Social Media,An Example of (Too Much) Hyper-Parameter Tunin...,True,['Northeastern University'],['JournalArticle'],2
2,Kenneth Ward Church,,https://www.semanticscholar.org/paper/9678a516...,,2023-09-01,,Natural Language Engineering,Emerging trends: Smooth-talking machines,True,['Northeastern University'],['JournalArticle'],1
3,Kenneth Ward Church,,https://www.semanticscholar.org/paper/0aa43be1...,,,,International Conference on Language Resources...,Training on Lexical Resources,True,['Northeastern University'],['JournalArticle'],1
4,Kenneth Ward Church,,https://www.semanticscholar.org/paper/3bc07732...,,2022-04-27,,Wireless and Microwave Technology Conference,Advanced Manufacturing and Characterization of...,True,['Northeastern University'],['Conference'],0
...,...,...,...,...,...,...,...,...,...,...,...,...
214,Kenneth Ward Church,,https://www.semanticscholar.org/paper/bd9629da...,,1979-08-20,,International Joint Conference on Artificial I...,Co-ordinate Square: Solution to Many Chess Paw...,True,['Northeastern University'],"['JournalArticle', 'Conference']",6
215,Kenneth Ward Church,,https://www.semanticscholar.org/paper/0528e7bb...,,,,,Feasibility of applying a computer program to ...,True,['Northeastern University'],,0
216,Kenneth Ward Church,,https://www.semanticscholar.org/paper/216525f3...,,,,,"Discrimination decisions for l O 0 , O 00-dime...",True,['Northeastern University'],,0
217,Kenneth Ward Church,,https://www.semanticscholar.org/paper/ab1dfead...,,,,,Practical Procedures for Dimension Reduction i...,True,['Northeastern University'],,1
