### Webscraping Assessment, by Pranav Vishwanath - to scrape author and article details from Semantic Scholar's website

In [51]:
import requests
import pandas as pd

class Article:
    """Plain record describing one paper attributed to an author.

    Only ``full_name`` is supplied at construction time; every other field
    starts at a neutral default and is expected to be filled in by the caller.
    Attributes are created in a fixed order, so ``vars(article)`` always lists
    them in that same order.
    """

    def __init__(self, full_name:str):
        self.full_name = full_name
        # Remaining fields with their starting values, in creation order.
        initial_values = (
            ("eai_url", None),
            ("link", ""),
            ("pdf_link", ""),
            ("publication_date", None),
            ("data_source", ""),
            ("publication", ""),
            ("title", ""),
            ("eai_match", False),
            ("affiliation", ""),
            ("type", ""),
            ("citations", 0),
        )
        for field_name, initial in initial_values:
            setattr(self, field_name, initial)

def fetch_author_data(url, params, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the cell.

    Returns:
        The decoded JSON payload when the server answers with HTTP 200 and a
        valid JSON body; otherwise ``None`` (a message is printed explaining
        why).
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, DNS failures and timeouts all end up here.
        print(f"Failed to fetch data from Semantic Scholar API. Error: {exc}")
        return None

    if response.status_code != 200:
        # Include the status so rate limiting (429) is distinguishable from
        # a bad request (400) or a server error (5xx).
        print(
            "Failed to fetch data from Semantic Scholar API. "
            f"Status code: {response.status_code}"
        )
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not valid JSON.
        print(f"Failed to fetch data from Semantic Scholar API. Invalid JSON: {exc}")
        return None

def process_pdf_link(link_dict):
    """Extract the URL from an open-access PDF entry.

    Args:
        link_dict: Value taken from a paper's ``openAccessPdf`` field.

    Returns:
        ``link_dict['url']`` when ``link_dict`` is a dict that has that key,
        otherwise ``None``.
    """
    if isinstance(link_dict, dict):
        return link_dict.get('url', None)
    return None


def process_data_source(data_source_dict):
    """Return the first key of an external-IDs dictionary.

    Args:
        data_source_dict: Value taken from a paper's ``externalIds`` field.

    Returns:
        The first key of ``data_source_dict`` when it is a non-empty dict,
        otherwise ``None``.
    """
    if isinstance(data_source_dict, dict):
        # The default keeps an empty dict from raising StopIteration.
        return next(iter(data_source_dict), None)
    return None


def build_articles(author_data, affiliation="Northeastern University"):
    """Turn an author-search payload into a list of ``Article`` objects.

    Args:
        author_data: Decoded JSON payload; authors are read from its ``data``
            entry.
        affiliation: Only authors whose ``affiliations`` contain this value
            are kept.

    Returns:
        One ``Article`` per dict-shaped paper of every matching author, in
        payload order. Empty when nothing matches.
    """
    articles = []
    # ``or []`` covers both a missing key and an explicit null value.
    for author in author_data.get("data") or []:
        affiliations = author.get("affiliations") or []
        if affiliation not in affiliations:
            continue
        for paper in author.get("papers") or []:
            if not isinstance(paper, dict):
                continue
            article = Article(author['name'])
            article.title = paper.get('title', '')
            article.link = paper.get('url')
            article.publication_date = paper.get('publicationDate', '')
            article.affiliation = affiliations
            article.citations = paper.get('citationCount', 0)
            article.type = paper.get('publicationTypes', '')
            article.pdf_link = paper.get('openAccessPdf', '')
            article.publication = paper.get('venue', '')
            article.eai_match = True
            article.data_source = paper.get('externalIds')
            articles.append(article)
    return articles


def articles_to_frame(articles):
    """Build the export DataFrame from a list of ``Article`` objects.

    The ``pdf_link`` column is reduced to its URL and the ``data_source``
    column to its first key.

    Args:
        articles: Objects whose ``vars()`` provide the row values.

    Returns:
        A DataFrame with one row per article. For an empty list the frame
        still carries the ``Article`` columns, so the column transforms and
        the CSV header keep working.
    """
    records = [vars(article) for article in articles]
    if records:
        frame = pd.DataFrame(records)
    else:
        # pd.DataFrame([]) would have no columns at all; borrow them from a
        # blank Article so the column lookups below do not raise KeyError.
        frame = pd.DataFrame(columns=list(vars(Article(""))))

    frame['pdf_link'] = frame['pdf_link'].apply(process_pdf_link)
    frame['data_source'] = frame['data_source'].apply(process_data_source)
    return frame


if __name__ == "__main__":
    url = "https://api.semanticscholar.org/graph/v1/author/search"
    query = "Kenneth Church"
    offset = 0
    limit = 490
    fields = "name,affiliations,papers.title,papers.url,papers.publicationDate,papers.citationCount,papers.publicationTypes,papers.openAccessPdf,papers.venue,papers.externalIds"
    params = {
        "query": query,
        "offset": offset,
        "limit": limit,
        "fields": fields
    }
    author_data = fetch_author_data(url, params)
    if author_data:
        articles = build_articles(author_data)
        print(f"Total articles found: {len(articles)}")

        # `df` stays a notebook-level name so later cells can display it.
        df = articles_to_frame(articles)

        # Export DataFrame to CSV
        filename = f"{query}.csv"
        df.to_csv(filename, index=False)
        print(f"Articles exported to '{filename}'")
    else:
        print("No data found.")


Total articles found: 219
Articles exported to 'Kenneth Church.csv'


In [52]:
# Display the articles table built by the previous cell. `df` is only defined
# there when the API request returned data.
df

Unnamed: 0,full_name,eai_url,link,pdf_link,publication_date,data_source,publication,title,eai_match,affiliation,type,citations
0,Kenneth Ward Church,,https://www.semanticscholar.org/paper/3b6d0eb3...,,2023-08-20,DOI,Interspeech,Improved Contextualized Speech Representations...,True,[Northeastern University],,0
1,Kenneth Ward Church,,https://www.semanticscholar.org/paper/46cc3955...,https://ojs.aaai.org/index.php/ICWSM/article/d...,2023-06-02,DBLP,International Conference on Web and Social Media,An Example of (Too Much) Hyper-Parameter Tunin...,True,[Northeastern University],[JournalArticle],2
2,Kenneth Ward Church,,https://www.semanticscholar.org/paper/9678a516...,https://www.cambridge.org/core/services/aop-ca...,2023-09-01,DBLP,Natural Language Engineering,Emerging trends: Smooth-talking machines,True,[Northeastern University],[JournalArticle],1
3,Kenneth Ward Church,,https://www.semanticscholar.org/paper/0aa43be1...,,,ACL,International Conference on Language Resources...,Training on Lexical Resources,True,[Northeastern University],[JournalArticle],1
4,Kenneth Ward Church,,https://www.semanticscholar.org/paper/3bc07732...,,2022-04-27,DOI,Wireless and Microwave Technology Conference,Advanced Manufacturing and Characterization of...,True,[Northeastern University],[Conference],0
...,...,...,...,...,...,...,...,...,...,...,...,...
214,Kenneth Ward Church,,https://www.semanticscholar.org/paper/bd9629da...,,1979-08-20,MAG,International Joint Conference on Artificial I...,Co-ordinate Square: Solution to Many Chess Paw...,True,[Northeastern University],"[JournalArticle, Conference]",6
215,Kenneth Ward Church,,https://www.semanticscholar.org/paper/0528e7bb...,,,MAG,,Feasibility of applying a computer program to ...,True,[Northeastern University],,0
216,Kenneth Ward Church,,https://www.semanticscholar.org/paper/216525f3...,,,CorpusId,,"Discrimination decisions for l O 0 , O 00-dime...",True,[Northeastern University],,0
217,Kenneth Ward Church,,https://www.semanticscholar.org/paper/ab1dfead...,,,CorpusId,,Practical Procedures for Dimension Reduction i...,True,[Northeastern University],,1
