# Crawl data from different websites
Python script for crawling publication abstracts from five different websites.

In [1]:
import json
import os
import time
import urllib.request

import httpx
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree



In [2]:
# Elsevier / ScienceDirect API key. It is read from the SCIENCEDIRECT_APIKEY
# environment variable when that is set, so the real key never has to be saved
# inside the notebook; otherwise the placeholder below is used.
sciencedirect_apikey = os.environ.get("SCIENCEDIRECT_APIKEY", 'your sciencedirect apikey')

In [3]:
# Springer Nature API key. It is read from the SPRINGER_APIKEY environment
# variable when that is set, so the real key never has to be saved inside the
# notebook; otherwise the placeholder below is used.
springer_key = os.environ.get("SPRINGER_APIKEY", 'your springer apikey')

In [4]:
#  Read file information
df = pd.read_csv("./combined_urls.csv")
urls = df["url"]
urls.head(8)

0                https://arxiv.org/abs/2108.03555
1    https://doi.org/10.1109/AIVR50618.2020.00019
2         https://doi.org/10.1145/3106426.3106490
3    https://doi.org/10.1007/978-3-030-37734-2_34
4       https://doi.org/10.48550/arXiv.2203.08878
5      https://dl.acm.org/citation.cfm?id=2799675
6    https://ieeexplore.ieee.org/document/7944787
7    https://doi.org/10.1016/j.jksuci.2019.04.004
Name: url, dtype: object

In [5]:
def ieee_abstract(url, timeout=10):
    """Fetch a paper's abstract from an IEEE Xplore landing page.

    The page is downloaded and the ``content`` attribute of the
    ``og:description`` meta tag found in ``<head>`` is used as the abstract.

    Args:
        url: Address of the paper's landing page.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the crawl indefinitely.

    Returns:
        The abstract with newline characters removed, or ``0`` when the page
        could not be fetched or does not contain the expected meta tag.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error answers as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        meta = soup.head.find(attrs={'property': 'og:description'})
        return meta.get("content").replace('\n', '')
    except Exception:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate and a long crawl can be stopped.
        print("It could be something wrong.")
        return 0

In [6]:
def arxiv_abstract(url, timeout=10):
    """Fetch a paper's abstract from an arXiv abstract page.

    The page is downloaded and the ``content`` attribute of the
    ``citation_abstract`` meta tag found in ``<head>`` is used as the abstract.

    Args:
        url: Address of the paper's abstract page.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the crawl indefinitely.

    Returns:
        The abstract with newline characters removed, or ``0`` when the page
        could not be fetched or does not contain the expected meta tag.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error answers as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        meta = soup.head.find(attrs={'name': 'citation_abstract'})
        return meta.get("content").replace('\n', '')
    except Exception:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate and a long crawl can be stopped.
        print("It could be something wrong.")
        return 0

In [7]:
def acm_abstract(url, timeout=4):
    """Fetch a paper's abstract from an ACM Digital Library page.

    The page is downloaded and the text of the first
    ``<div class="abstractSection abstractInFull">`` element is used as the
    abstract.

    Args:
        url: Address of the paper's landing page.
        timeout: Seconds handed to ``requests.get``.

    Returns:
        The abstract with newline characters removed, or ``0`` when the page
        could not be fetched or does not contain the abstract element.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error answers as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        section = soup.find("div", class_="abstractSection abstractInFull")
        return section.text.replace('\n', '')
    except Exception:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate and a long crawl can be stopped.
        print("It could be something wrong.")
        return 0

In [8]:
def elsevier_data(paper_doi,apikey):
    """Request an article from the Elsevier article-retrieval API by DOI.

    Args:
        paper_doi: Either a ``.../doi.org/<DOI>`` link or a bare DOI string.
        apikey: Value sent in the ``X-ELS-APIKey`` request header.

    Returns:
        The ``httpx`` response of the GET request (JSON is requested through
        the ``Accept`` header). The status code is not checked here.
    """
    # Everything after "//doi.org/" is the DOI; a string without that marker
    # is taken to be a bare DOI already.
    _, sep, tail = paper_doi.partition("//doi.org/")
    doi = tail if sep else paper_doi
    url = "https://api.elsevier.com/content/article/doi/" + doi
    headers = {
        "X-ELS-APIKey": apikey,
        "Accept": 'application/json',
    }
    timeout = httpx.Timeout(10.0, connect=60.0)
    # The context manager closes the client (and its connections) once the
    # request has completed, instead of leaking one client per call.
    with httpx.Client(timeout=timeout, headers=headers) as client:
        return client.get(url)

In [9]:
def elsevier_abstract(url):
    """Fetch a paper's abstract through the Elsevier article API.

    Calls ``elsevier_data`` with the module-level ``sciencedirect_apikey`` and
    reads ``full-text-retrieval-response -> coredata -> dc:description`` from
    the JSON body.

    Args:
        url: DOI link of the paper, passed on to ``elsevier_data``.

    Returns:
        The abstract with surrounding whitespace stripped, or ``0`` when the
        request fails, the body is not the expected JSON, or the description
        is empty.
    """
    try:
        response = elsevier_data(url, sciencedirect_apikey)
        payload = json.loads(response.text)
        abstract = payload['full-text-retrieval-response']['coredata']['dc:description']
    except Exception:
        # Network errors, non-JSON bodies and error payloads (which lack the
        # keys above) are reported the same way as in the other fetchers, so
        # one bad record does not abort the whole crawl.
        print("It could be something wrong.")
        return 0
    return abstract.strip() if abstract else 0

In [10]:
def springer_abstract(url, timeout=4):
    """Fetch a paper's abstract through the Springer Nature metadata API.

    The DOI is taken from the part of ``url`` after ``//doi.org/`` and looked
    up with the module-level ``springer_key``.

    Args:
        url: DOI link of the paper.
        timeout: Seconds handed to ``requests.get``.

    Returns:
        The ``abstract`` field of the last record in the API answer, or ``0``
        when the request fails or no usable record is returned.
    """
    try:
        _, _, doi = url.partition("//doi.org/")
        # HTTPS so the API key in the query string is not sent in clear text.
        api_url = "https://api.springernature.com/meta/v2/json?q=doi:" + doi + '&api_key=' + springer_key
        response = requests.get(api_url, timeout=timeout)
        # Treat HTTP error answers as failures instead of parsing their body.
        response.raise_for_status()
        records = response.json()["records"]
        # An empty record list raises IndexError here and is reported below.
        return records[-1]["abstract"]
    except Exception:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate and a long crawl can be stopped.
        print("It could be something wrong.")
        return 0

In [11]:
'''
combination
'''
# Ordered (URL markers, fetcher) pairs. The first pair with a marker contained
# in the URL decides which site-specific fetcher is used.
fetchers = (
    (("arxiv.org", "doi.org/10.48550"), arxiv_abstract),
    (("ieeexplore", "doi.org/10.1109"), ieee_abstract),
    (("dl.acm.org", "doi.org/10.1145"), acm_abstract),
    (("doi.org/10.1016",), elsevier_abstract),  # elsevier
    (("doi.org/10.1007",), springer_abstract),  # Springer
)

rows = []
for url in urls:
    fetch = next(
        (fn for markers, fn in fetchers if any(marker in url for marker in markers)),
        None,
    )
    # URLs that match no known site are recorded with the placeholder 0.
    abstract = fetch(url) if fetch is not None else 0
    rows.append({"url": url, "abstract": abstract})
    # Pause between URLs to go easy on the remote servers.
    time.sleep(3)
    

http://api.springernature.com/meta/v2/json?q=doi:10.1007/978-3-030-37734-2_34&api_key=<REDACTED>


In [12]:
# Collect the crawled url/abstract records into one table and display it.
df = pd.DataFrame(data=rows)
df

Unnamed: 0,url,abstract
0,https://arxiv.org/abs/2108.03555,Background: Accurate diagnosis of skull base ...
1,https://doi.org/10.1109/AIVR50618.2020.00019,The Virtual Reality Lifelog Explorer is a prot...
2,https://doi.org/10.1145/3106426.3106490,Emoji have grown to become one of the most imp...
3,https://doi.org/10.1007/978-3-030-37734-2_34,Continuous media capture via a wearable device...
4,https://doi.org/10.48550/arXiv.2203.08878,Uncertainty estimation in deep learning has b...
5,https://dl.acm.org/citation.cfm?id=2799675,This paper describes a casual Facebook game to...
6,https://ieeexplore.ieee.org/document/7944787,Low-cost mini-drones with advanced sensing and...
7,https://doi.org/10.1016/j.jksuci.2019.04.004,The life of a Muslim cannot be separated from ...


In [None]:
# Write the crawled url/abstract records to a new CSV file, without the index column.
pd.DataFrame(rows).to_csv(
    "paper_abstracts.csv",
    encoding="utf-8_sig",
    index=False,
)