In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import bibtexparser
import tqdm



In [2]:
def read_json(path):
    with open(path) as f:
        return json.load(f)

# IEEE Xplore

In [18]:
ieee_path = "../data/literature_search/raw/ieeeXplore-raw.json"

In [19]:
ieee_raw = read_json(ieee_path)

In [21]:
ieee_raw['records'][0].keys()

dict_keys(['authors', 'patentCitationCount', 'accessType', 'publicationYear', 'publicationNumber', 'documentLink', 'articleNumber', 'doi', 'citationCount', 'isNumber', 'publicationLink', 'ephemera', 'vj', 'pdfSize', 'startPage', 'endPage', 'publicationDate', 'rightslinkFlag', 'rightsLink', 'articleTitle', 'downloadCount', 'htmlLink', 'citationsLink', 'showHtml', 'graphicalAbstract', 'showAlgorithm', 'showDataset', 'showVideo', 'publisher', 'redline', 'showCheckbox', 'handleProduct', 'contentType', 'publicationTitle', 'displayPublicationTitle', 'abstract', 'articleContentType', 'pdfLink', 'highlightedTitle', 'isStandard', 'isConference', 'isJournalAndMagazine', 'isEarlyAccess', 'isMagazine', 'isJournal', 'isBook', 'course', 'isBookWithoutChapters', 'docIdentifier', 'displayContentType'])

In [37]:
authors, titles, abstracts, publication_years, dois, pdfs, citation_cnts, urls = [], [], [], [], [], [], [], []
for record in ieee_raw['records']:
    authors.append('; '.join(a['preferredName'] for a in record['authors']))
    titles.append(record['articleTitle'])
    abstracts.append(record['abstract'])
    publication_years.append(record['publicationYear'])
    dois.append(('https://doi.org/' + record['doi']) if 'doi' in record.keys() else None)
    pdfs.append('https://ieeexplore.ieee.org' + record['pdfLink'])
    citation_cnts.append(record['citationCount'])
    urls.append('https://ieeexplore.ieee.org' + record['htmlLink'])

In [38]:
ieee_df = pd.DataFrame.from_dict({
    'authors': authors,
    'title': titles,
    'abstract': abstracts,
    'publication_year': publication_years,
    'doi': dois,
    'pdf': pdfs,
    'citation_cnt': citation_cnts,
    'url': urls
})

In [39]:
ieee_df.head()

Unnamed: 0,authors,title,abstract,publication_year,doi,pdf,citation_cnt,url
0,Elizaveta Zimina; Jyrki Nummenmaa; Kalervo Jar...,MuG-QA: Multilingual Grammatical Question Answ...,We introduce [::Multilingual::] Grammatical [:...,2018,https://doi.org/10.1109/PIC.2018.8706310,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,1,https://ieeexplore.ieee.org/document/8706310/
1,Xiangzhou Huang; Baogang Wei; Yin Zhang,Automatic Question-Answering Based on Wikipedi...,The [::question-answering::] ([::QA::]) system...,2015,https://doi.org/10.1109/ISKE.2015.78,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,3,https://ieeexplore.ieee.org/document/7383065/
2,Aleksandr Perevalov; Dennis Diefenbach; Ricard...,QALD-9-plus: A Multilingual Dataset for Questi...,The ability to have the same experience for di...,2022,https://doi.org/10.1109/ICSC52841.2022.00045,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,0,https://ieeexplore.ieee.org/document/9736271/
3,Bo Liu; Li-Ming Zhan; Li Xu; Lin Ma; Yan Yang;...,Slake: A Semantically-Labeled Knowledge-Enhanc...,Medical visual [::question answering::] (Med-V...,2021,https://doi.org/10.1109/ISBI48211.2021.9434010,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,1,https://ieeexplore.ieee.org/document/9434010/
4,Santosh K. Ray; Khaled Shaalan,A Review and Future Perspectives of Arabic Que...,[::Question Answering::] Systems (QASs) have e...,2016,https://doi.org/10.1109/TKDE.2016.2607201,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,25,https://ieeexplore.ieee.org/document/7563293/


In [40]:
ieee_df.to_csv('../data/literature_search/ieee.csv', index=False)

# ACM DL

In [48]:
with open('../data/literature_search/raw/acm-dl.bib') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

In [50]:
bib_database.entries[0].keys()

dict_keys(['series', 'location', 'keywords', 'numpages', 'pages', 'booktitle', 'abstract', 'doi', 'url', 'address', 'publisher', 'isbn', 'year', 'title', 'author', 'ENTRYTYPE', 'ID'])

In [52]:
bib_database.entries[0]['doi']

'10.1145/3106426.3106514'

In [58]:
authors, titles, abstracts, publication_years, dois, citation_cnts, urls = [], [], [], [], [], [], []
for record in bib_database.entries:
    authors.append(record['author'] if 'author' in record.keys() else None)
    titles.append(record['title'] if 'title' in record.keys() else None)
    abstracts.append(record['abstract'] if 'abstract' in record.keys() else None)
    publication_years.append(record['year'])
    dois.append(('https://doi.org/' + record['doi']) if 'doi' in record.keys() else None)
    urls.append(record['url'] if 'url' in record.keys() else None)

In [59]:
acm_df = pd.DataFrame.from_dict({
    'authors': authors,
    'title': titles,
    'abstract': abstracts,
    'publication_year': publication_years,
    'doi': dois,
    'url': urls
})

In [60]:
acm_df.to_csv('../data/literature_search/acm.csv', index=False)

# Springer

In [61]:
springer_chapter = pd.read_csv('../data/literature_search/raw/springer-chapter.csv')
springer_article = pd.read_csv('../data/literature_search/raw/springer-article.csv')

In [72]:
springer_chapter.shape, springer_article.shape

((1000, 10), (356, 10))

In [67]:
authors = springer_chapter['Authors'].to_list() + springer_article['Authors'].to_list()
titles = springer_chapter['Item Title'].to_list() + springer_article['Item Title'].to_list()
publication_years = springer_chapter['Publication Year'].to_list() + springer_article['Publication Year'].to_list()
dois = ['https://doi.org/' + d for d in springer_chapter['Item DOI'].to_list() + springer_article['Item DOI'].to_list()]
urls = springer_chapter['URL'].to_list() + springer_article['URL'].to_list()

In [68]:
springer_df = pd.DataFrame.from_dict({
    'authors': authors,
    'title': titles,
    'publication_year': publication_years,
    'doi': dois,
    'url': urls
})

In [69]:
springer_df.to_csv('../data/literature_search/springer.csv', index=False)

## German Springer

In [4]:
springer_chapter = pd.read_csv('../data/literature_search/raw/springer-chapter-de.csv')
springer_article = pd.read_csv('../data/literature_search/raw/springer-article-de.csv')

In [5]:
authors = springer_chapter['Authors'].to_list() + springer_article['Authors'].to_list()
titles = springer_chapter['Item Title'].to_list() + springer_article['Item Title'].to_list()
publication_years = springer_chapter['Publication Year'].to_list() + springer_article['Publication Year'].to_list()
dois = ['https://doi.org/' + d for d in springer_chapter['Item DOI'].to_list() + springer_article['Item DOI'].to_list()]
urls = springer_chapter['URL'].to_list() + springer_article['URL'].to_list()

In [6]:
springer_df = pd.DataFrame.from_dict({
    'authors': authors,
    'title': titles,
    'publication_year': publication_years,
    'doi': dois,
    'url': urls
})

In [7]:
springer_df.to_csv('../data/literature_search/springer-de.csv', index=False)

# DBLP

In [73]:
with open('../data/literature_search/raw/dblp-raw.bib') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

In [74]:
authors, titles, abstracts, publication_years, dois, citation_cnts, urls = [], [], [], [], [], [], []
for record in bib_database.entries:
    authors.append(record['author'] if 'author' in record.keys() else None)
    titles.append(record['title'] if 'title' in record.keys() else None)
    abstracts.append(record['abstract'] if 'abstract' in record.keys() else None)
    publication_years.append(record['year'])
    dois.append(('https://doi.org/' + record['doi']) if 'doi' in record.keys() else None)
    urls.append(record['url'] if 'url' in record.keys() else None)

In [75]:
dblp_df = pd.DataFrame.from_dict({
    'authors': authors,
    'title': titles,
    'abstract': abstracts,
    'publication_year': publication_years,
    'doi': dois,
    'url': urls
})

In [76]:
dblp_df.to_csv('../data/literature_search/dblp.csv', index=False)

# Cyberleninka

In [3]:
cyberleninka_raw = read_json("../data/literature_search/raw/cyberleninka-raw-2.json")

In [4]:
cyberleninka_raw['articles'][0].keys()

dict_keys(['name', 'annotation', 'link', 'authors', 'year', 'journal', 'journal_link', 'ocr', 'catalogs'])

In [5]:
cyberleninka_raw['articles'][0]['link']

'/article/n/tehnologiya-primeneniya-patternov-ontologicheskogo-proektirovaniya-dlya-optimizatsii-vypolneniya-zaprosov-v-sistemah-obespecheniya'

In [6]:
authors, titles, abstracts, publication_years, urls = [], [], [], [], []
for article in cyberleninka_raw['articles']:
    authors.append('; '.join(article['authors']))
    titles.append(article['name'])
    abstracts.append(article['annotation'])
    publication_years.append(article['year'])
    urls.append('https://cyberleninka.ru' + article['link'])

In [7]:
cyberleninka_df = pd.DataFrame.from_dict({
    'authors': authors,
    'title': titles,
    'abstract': abstracts,
    'publication_year': publication_years,
    'url': urls
})

In [8]:
cyberleninka_df.to_csv('../data/literature_search/cyberleninka-2.csv', index=False)

# ACL Anthology

In [29]:
# replace this regex: month = .*,$ with this: month = "",
with open('../data/literature_search/raw/anthology-abstracts.bib') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

In [30]:
len(bib_database.entries)

74539

In [31]:
bib_database.entries[0].keys()

dict_keys(['url', 'publisher', 'address', 'year', 'month', 'editor', 'title', 'ENTRYTYPE', 'ID'])

In [40]:
def filter_paper(title, abstract):
    data_terms = ["Knowledge Base", "Knowledge Graph", "DBpedia", "Wikidata", "YAGO", "Semantic Web", "Linked Data", "RDF", "data web", "SPARQL", "Query Graph", "Web data", "WWW", "web of data", "QALD", "SimpleQuestions", "WebQuestions", "WebQSP", "LC-QuAD", "RuBQ", "SimpleDBpediaQA", "ComplexWebQuestions", "CWQ"]
    system_terms = ["Semantic search", "Question Answer", "Question-Answer", "KBQA", "KGQA", "KB QA", "KB-QA", "KG-QA", "KG QA", "NLI", "NLIDB", "QA", "Natural Language Interface"]
    feature_terms = ["multilingual", "multi-lingual", "crosslingual", "cross-lingual", "cross linguistic", "internationalized", "multilingualism", "multilinguistic", "multilanguage", "bilingual", "many languages", "multiple languages", "several languages", "more than one language"]

    if (any(term.lower() in title.lower() for term in data_terms) and any(term.lower() in title.lower() for term in system_terms) and any(term.lower() in title.lower() for term in feature_terms)) or (any(term.lower() in abstract.lower() for term in data_terms) and any(term.lower() in abstract.lower() for term in system_terms) and any(term.lower() in abstract.lower() for term in feature_terms)):
        return True
    return False

In [41]:
authors, titles, abstracts, publication_years, dois, urls = [], [], [], [], [], []
for record in tqdm.tqdm(bib_database.entries):
    if filter_paper(title=record['title'] if 'title' in record.keys() else "", abstract=record['abstract'] if 'abstract' in record.keys() else ""):
        authors.append(record['author'] if 'author' in record.keys() else None)
        titles.append(record['title'] if 'title' in record.keys() else None)
        abstracts.append(record['abstract'] if 'abstract' in record.keys() else None)
        publication_years.append(record['year'])
        dois.append(('https://doi.org/' + record['doi']) if 'doi' in record.keys() else None)
        urls.append(record['url'] if 'url' in record.keys() else None)

100%|██████████| 74539/74539 [00:01<00:00, 46906.70it/s]


In [43]:
acl_df = pd.DataFrame.from_dict({
    'authors': authors,
    'title': titles,
    'abstract': abstracts,
    'publication_year': publication_years,
    'doi': dois,
    'url': urls
})

In [44]:
acl_df.to_csv('../data/literature_search/acl-anthology.csv', index=False)