In [3]:
import pandas as pd 
import numpy as np
from semanticscholar import SemanticScholar
from tqdm import tqdm
import json

In [4]:
# Shared Semantic Scholar API client used by every request in this notebook.
# The timeout is raised to 200 (seconds, per the library documentation)
# because the batch requests further down take tens of seconds each.
REQUEST_TIMEOUT_SECONDS = 200
sc = SemanticScholar(timeout=REQUEST_TIMEOUT_SECONDS)

In [5]:
## Choose Keywords
# Each key is used as the `fields_of_study` filter of a search; its list of
# terms is joined with "| " into the query string for that search.
domain = {
    "Computer Science": [
        "data management",
        "indexing",
        "data modeling",
        "big data",
        "data processing",
        "data storage",
        "data querying",
    ]
    # Further domains, currently disabled:
    # "Medicine":["diagnosis","health","disease","Genomic"],
    # "Biology": ["genome","Epigenomic","phylogenetics","receptor"],
    # "Chemistry":["synthesis","Bioinorganic","Molecular","Chromatographic"],
    # "Mathematics":["curves","probability","Harmonic","hyperbolic"]
}

# Paper attributes requested from the API for every search and batch lookup.
fields = [
    "paperId", "corpusId", "externalIds", "url",
    "title", "abstract", "venue", "publicationVenue",
    "year", "referenceCount", "citationCount", "influentialCitationCount",
    "isOpenAccess", "openAccessPdf", "fieldsOfStudy", "s2FieldsOfStudy",
    "publicationTypes", "publicationDate", "journal", "citationStyles",
    "authors", "citations", "references", "tldr",
]

In [11]:
# Preview the query string that is sent to the paper search.
# The domain key is spelled out here so the cell works on a fresh kernel:
# the previous version read a `keyword` variable that is only bound once the
# search loop further down has run.
"| ".join(domain["Computer Science"])

'data management| indexing| data modeling| big data| data processing| data storage| data querying'

In [14]:
journal_papers = []
conference_papers = []

# For every configured field of study, run the same keyword query twice --
# once restricted to journal articles and once to conference papers -- and
# collect the items of each returned result object.
# NOTE(review): `.items` appears to hold only the first page of results
# (100 per search in the recorded run) -- confirm this cap is intended.
for keyword in tqdm(domain.keys()):
    # Arguments shared by both searches; only the publication type differs.
    common_search_args = dict(
        query="| ".join(domain[keyword]),
        fields=fields,
        fields_of_study=[f"{keyword}"],
        min_citation_count=5,
    )
    response_journal = sc.search_paper(publication_types=["JournalArticle"], **common_search_args)
    response_conference = sc.search_paper(publication_types=["Conference"], **common_search_args)
    journal_papers.extend(response_journal.items)
    conference_papers.extend(response_conference.items)
  

100%|██████████| 1/1 [00:13<00:00, 13.95s/it]


In [15]:
# Number of seed papers collected: conference count first, journal count second,
# one value per line.
print(len(conference_papers), len(journal_papers), sep="\n")

100
100


In [16]:
# Turn every paper record into a plain dict so the collections can be passed
# to json.dump when they are written to disk.
new_conference_papers = list(map(dict, conference_papers))
new_journal_papers = list(map(dict, journal_papers))

In [17]:
# Write both seed-paper collections to disk as JSON, conference file first.
# NOTE(review): the targets are absolute paths on one machine; consider a
# configurable data directory so the notebook runs elsewhere.
for target_path, records in (
    ("/Users/elnararb/Documents/UPC/Semantic Data Management/data/paper_details_new/conference_papers.json", new_conference_papers),
    ("/Users/elnararb/Documents/UPC/Semantic Data Management/data/paper_details_new/journal_papers.json", new_journal_papers),
):
    with open(target_path, "w") as f:
        json.dump(records, f)

In [18]:
# Collect the IDs listed under the 'citations' field of every conference seed
# paper. Entries whose 'paperId' is None are skipped, since they cannot be
# looked up by ID afterwards.
# The set comprehension removes duplicates while collecting, and sorting gives
# a stable order: a plain list(set(...)) of strings can come out in a different
# order in every kernel session, which would reshuffle the request batches and
# the saved output between runs.
lst_of_paperids_conference = sorted({
    cite['paperId']
    for paper in conference_papers
    for cite in paper['citations']
    if cite['paperId'] is not None
})

In [19]:
# Collect the IDs listed under the 'citations' field of every journal seed
# paper. Entries whose 'paperId' is None are skipped, since they cannot be
# looked up by ID afterwards.
# The set comprehension removes duplicates while collecting, and sorting gives
# a stable order: a plain list(set(...)) of strings can come out in a different
# order in every kernel session, which would reshuffle the request batches and
# the saved output between runs.
lst_of_paperids_journal = sorted({
    cite['paperId']
    for paper in journal_papers
    for cite in paper['citations']
    if cite['paperId'] is not None
})

In [20]:
# Number of distinct paper IDs gathered from the journal papers' citation lists.
len(lst_of_paperids_journal)

6394

In [21]:
# Number of distinct paper IDs gathered from the conference papers' citation lists.
len(lst_of_paperids_conference)

1932

In [22]:
chunk_size = 500
cited_conference_papers = []

# Fetch full details for the collected conference citation IDs, one batch of
# `chunk_size` IDs per request. Slicing already stops at the end of the list,
# so the last (shorter) batch needs no explicit bound.
for start in tqdm(range(0, len(lst_of_paperids_conference), chunk_size)):
    chunk = lst_of_paperids_conference[start:start + chunk_size]
    cited_conference_papers.extend(sc.get_papers(paper_ids=chunk, fields=fields))


100%|██████████| 4/4 [01:25<00:00, 21.29s/it]


In [24]:
# Number of paper records returned by the conference batch lookups; compare
# with the number of IDs requested to spot lookups that returned nothing.
len(cited_conference_papers)

1932

In [25]:
chunk_size = 500
cited_journal_papers = []

# Fetch full details for the collected journal citation IDs, one batch of
# `chunk_size` IDs per request. Slicing already stops at the end of the list,
# so the last (shorter) batch needs no explicit bound.
for start in tqdm(range(0, len(lst_of_paperids_journal), chunk_size)):
    chunk = lst_of_paperids_journal[start:start + chunk_size]
    cited_journal_papers.extend(sc.get_papers(paper_ids=chunk, fields=fields))

100%|██████████| 13/13 [07:24<00:00, 34.18s/it]


In [26]:
# Number of paper records returned by the journal batch lookups; compare
# with the number of IDs requested to spot lookups that returned nothing.
len(cited_journal_papers)

6394

In [27]:
# Plain-dict copies of the fetched conference citation records, ready for json.dump.
new_cited_conference_papers = list(map(dict, cited_conference_papers))

In [28]:
# Plain-dict copies of the fetched journal citation records, ready for json.dump.
new_cited_journal_papers = list(map(dict, cited_journal_papers))

In [31]:
# Write both citation-paper collections to disk as JSON, conference file first.
# NOTE(review): the targets are absolute paths on one machine; consider a
# configurable data directory so the notebook runs elsewhere.
for target_path, records in (
    ("/Users/elnararb/Documents/UPC/Semantic Data Management/data/paper_details_new/conference_papers_citations.json", new_cited_conference_papers),
    ("/Users/elnararb/Documents/UPC/Semantic Data Management/data/paper_details_new/journal_papers_citations.json", new_cited_journal_papers),
):
    with open(target_path, "w") as f:
        json.dump(records, f)

In [29]:
# Gather the author IDs of every paper seen so far: the seed papers first,
# then the papers fetched from their citation lists. One comprehension over
# the four collections replaces four copy-pasted loops; the resulting order
# (conference, journal, cited conference, cited journal) is unchanged.
# Authors whose 'authorId' is None are skipped, mirroring the None filter
# already applied to citation paper IDs -- a missing ID cannot be looked up
# in the author batch request.
# Duplicates are deliberately kept here; de-duplication happens in the next step.
lst_of_author_ids = [
    author['authorId']
    for papers in (
        conference_papers,
        journal_papers,
        cited_conference_papers,
        cited_journal_papers,
    )
    for paper in papers
    for author in paper['authors']
    if author['authorId'] is not None
]

In [30]:
# Report how many author IDs were collected, drop the duplicates, then report
# how many distinct IDs remain.
# NOTE(review): this cell rebinds `lst_of_author_ids`, so running it a second
# time prints the de-duplicated count twice.
unique_author_ids = set(lst_of_author_ids)
print(len(lst_of_author_ids))
lst_of_author_ids = list(unique_author_ids)
print(len(lst_of_author_ids))

35396
25553


In [32]:
# Author attributes requested from the API for every author batch lookup.
author_fields = [
    "authorId",
    "externalIds",
    "url",
    "name",
    "affiliations",
    "homepage",
    "paperCount",
    "citationCount",
    "hIndex",
]

In [33]:

chunk_size = 100
author_details = []

# Fetch the author records in batches of `chunk_size` IDs per request.
# Slicing already stops at the end of the list, so the last (shorter) batch
# needs no explicit bound.
for start in tqdm(range(0, len(lst_of_author_ids), chunk_size)):
    chunk = lst_of_author_ids[start:start + chunk_size]
    author_details.extend(sc.get_authors(chunk, fields=author_fields))
  

100%|██████████| 256/256 [22:26<00:00,  5.26s/it]


In [34]:
# Plain-dict copies of the fetched author records, ready for json.dump.
new_author_details = list(map(dict, author_details))

In [36]:
# Write the author records to disk as JSON.
# NOTE(review): the target is an absolute path on one machine; consider a
# configurable data directory so the notebook runs elsewhere.
authors_output_path = "/Users/elnararb/Documents/UPC/Semantic Data Management/data/authors_details_new.json"
with open(authors_output_path, "w") as out_file:
    json.dump(new_author_details, out_file)