In [None]:
# Last update October 6, 2024 18:27

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import pandas as pd
pd.set_option('display.max_columns', None)
import json
import requests
    
## Load configuration
# The context manager closes the file even when the JSON is malformed and
# json.load raises, which the manual open()/close() pair did not guarantee.
with open("config.json", encoding="utf-8") as con_file:
    config = json.load(con_file)

## Initialize client
# `config` must provide the keys 'apikey' and 'insttoken'.
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']


<h4>Helper functions</h4>

In [None]:
# Empty containers that the cells below fill in.

# One row of metadata per retrieved paper.
scopus_stats_raw = pd.DataFrame(
    columns=[
        'EID',
        'Scopus ID',
        'Cited by',
        'Authors',
        'Number of authors',
        'Title',
        'Year',
        'Source title',
        'Number of author keywords',
        'Abstract',
        'Number of references',
    ]
)

# Receives one [scopus_id, references_dataframe] pair per retrieved paper.
references = list()

# Citation edge list, one row per (referencing paper, referenced paper):
#   'Cited by' - Scopus ID of the referencing paper
#   'Paper'    - Scopus ID of the referenced paper
#   'Title'    - title of the referenced paper
#   'Year'     - publication year of the referenced paper
citation_df = pd.DataFrame(columns=['Cited by', 'Paper', 'Title', 'Year'])

# Scopus IDs of frequently referenced papers that are missing from the stats table.
not_in_db = list()

In [None]:
# Helper functions to retrieve metadata for papers on the eid_list

def scopus_search(search_query):
    """Run ``search_query`` against the 'scopus' index and return the hits.

    The search is executed with the notebook-level ``client`` and
    ``get_all=True``. The number of returned results is printed.

    Returns:
        pandas.DataFrame built from the search's ``results`` list.
    """
    search = ElsSearch(search_query, 'scopus')
    search.execute(client, get_all=True)
    hits = search.results
    hits_df = pd.DataFrame(hits)
    print ("Search returned ", len(hits), "results.")
    return hits_df

def fetch_abstract(paper_id, api_key=None, inst_token=None, timeout=30):
    """Download the Abstract Retrieval record of one paper as parsed JSON.

    Args:
        paper_id: Scopus ID, with or without the "2-s2.0-" EID prefix.
        api_key: Scopus API key. Defaults to ``config['apikey']`` from the
            notebook's configuration cell.
        inst_token: Institutional token. Defaults to ``config['insttoken']``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body (a dict) when the server answers with status
        200, otherwise ``None`` (the status code is printed).
    """
    paper_id = str(paper_id)
    eid_prefix = "2-s2.0-"
    # The original left `eid` unassigned for IDs that already carry the prefix.
    eid = paper_id if paper_id.startswith(eid_prefix) else eid_prefix + paper_id

    url = f"https://api.elsevier.com/content/abstract/eid/{eid}"

    # Use the credentials already loaded for the elsapy client instead of
    # placeholder literals that had to be edited by hand.
    params = {
        'apiKey': config['apikey'] if api_key is None else api_key,
        'insttoken': config['insttoken'] if inst_token is None else inst_token,
    }

    headers = {
        'Accept': 'application/json'
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    if response.status_code == 200:
        return response.json()

    print(f"Failed to fetch data. Status code: {response.status_code}")
    return None

def extract_references(json_data):
    """Flatten the bibliography of an Abstract Retrieval response.

    Walks down to
    'abstracts-retrieval-response' -> 'item' -> 'bibrecord' -> 'tail' ->
    'bibliography' -> 'reference' and normalises whatever is stored there
    into a DataFrame with dotted column names.

    Returns:
        pandas.DataFrame of references, or ``None`` (after printing the
        missing key) when one of the keys on the path does not exist.
    """
    reference_path = (
        'abstracts-retrieval-response',
        'item',
        'bibrecord',
        'tail',
        'bibliography',
        'reference',
    )
    try:
        node = json_data
        for key in reference_path:
            node = node[key]
        return pd.json_normalize(node)
    except KeyError as e:
        print(f"Key not found: {e}")
        return None

def extract_metadata(data):
    """Pull the bibliographic fields of one paper out of an Abstract Retrieval response.

    Args:
        data: Parsed JSON as returned by ``fetch_abstract``.

    Returns:
        A list ``[authors, num_authors, title, year, source,
        num_auth_keywords, abstract, num_ref, references_df]`` where
        ``authors`` is a "; "-separated string of indexed names and
        ``references_df`` is the result of ``extract_references(data)``.

    Raises:
        KeyError, TypeError: when a required field (author group, title,
            abstract, source, publication year or reference count) is missing.
            Callers treat that as "could not fetch" and skip the paper.
    """
    bibrecord = data['abstracts-retrieval-response']['item']['bibrecord']
    head = bibrecord['head']

    # 'author-group' is handled both as a list of groups and as one bare group
    # (the original code had a fallback branch for the bare-dict form).
    author_groups = head['author-group']
    if isinstance(author_groups, dict):
        author_groups = [author_groups]

    # Collect every author of every group. The original counted groups rather
    # than authors and kept only the first author of each group.
    author_names = []
    seen_authors = set()
    for group in author_groups:
        members = group.get('author') or []
        if isinstance(members, dict):
            members = [members]
        for member in members:
            indexed_name = member['ce:indexed-name']
            # NOTE(review): '@auid' is assumed to be the per-author Scopus ID;
            # when absent the indexed name is the de-duplication key. An author
            # may be listed in several groups - TODO confirm against the API.
            author_key = member.get('@auid') or indexed_name
            if author_key in seen_authors:
                continue
            seen_authors.add(author_key)
            author_names.append(indexed_name)
    authors = "; ".join(author_names)
    num_authors = len(author_names)

    # Title of the paper
    title = head['citation-title']

    # Author keywords: optional, so a missing branch means zero keywords
    try:
        keywords = head['citation-info']['author-keywords']['author-keyword']
    except (KeyError, TypeError):
        keywords = None
    if not keywords:
        num_auth_keywords = 0
    elif isinstance(keywords, dict):
        # A bare dict is one keyword; len() of it would count its keys.
        num_auth_keywords = 1
    else:
        num_auth_keywords = len(keywords)

    # Abstract
    abstract = head['abstracts']

    # Source title - name of the journal
    source = head['source']['sourcetitle']

    # Year of publication
    year = int(head['source']['publicationdate']['year'])

    # Number of references used in the paper
    num_ref = int(bibrecord['tail']['bibliography']['@refcount'])

    # Flattened reference list (None when the response carries no references)
    references_df = extract_references(data)

    return [authors, num_authors, title, year, source, num_auth_keywords, abstract, num_ref , references_df]


def add_papers(scopus_ids):
    """Fetch metadata for additional papers identified by their Scopus IDs.

    The IDs are combined into one ``EID(...) OR EID(...)`` search; every hit
    is then retrieved with ``fetch_abstract`` and parsed with
    ``extract_metadata``. Each successfully parsed paper also appends a
    ``[scopus_id, references_df]`` pair to the notebook-level ``references``
    list.

    Args:
        scopus_ids: Iterable of Scopus IDs without the "2-s2.0-" prefix.

    Returns:
        pandas.DataFrame with the same column order as ``scopus_stats``.
        It is empty when no IDs are given or the search finds nothing.
    """
    raw_columns = ['EID', 'Scopus ID', 'Cited by', 'Authors', 'Number of authors',
                   'Title', 'Year', 'Source title', 'Number of author keywords',
                   'Abstract', 'Number of references']
    output_columns = ['Authors', 'Title', 'EID', 'Scopus ID', 'Year',
                      'Source title', 'Abstract', 'Cited by', 'Number of author keywords',
                      'Number of authors', 'Number of references']

    scopus_ids = list(scopus_ids)
    print(r'Received {} IDs to search'.format(len(scopus_ids)))

    # An empty ID list would otherwise send an empty query to the API.
    if not scopus_ids:
        return pd.DataFrame(columns=output_columns)

    # Turn scopus ids into a search query
    search_query = ' OR '.join('EID("2-s2.0-{}")'.format(paper_id) for paper_id in scopus_ids)

    search_df = scopus_search(search_query)

    # Without an 'eid' column there are no usable hits to iterate over.
    if 'eid' not in search_df.columns:
        print('Search returned no usable results.')
        return pd.DataFrame(columns=output_columns)

    records = []
    for _, hit in search_df.iterrows():
        eid = hit['eid']
        scopus_id = hit['dc:identifier'].replace("SCOPUS_ID:","")
        cited_count = hit['citedby-count']
        try:
            data = fetch_abstract(scopus_id)
            metadata = extract_metadata(data)
        except Exception as err:
            # Best effort: skip papers whose record is missing or incomplete,
            # but let KeyboardInterrupt through so the run can be stopped.
            print('Could not fetch data for Scopus ID: ', scopus_id, '-', repr(err))
            continue
        records.append([eid, scopus_id, cited_count] + metadata[:-1])
        references.append([scopus_id, metadata[-1]])

    # Build the frame once instead of growing it row by row.
    scopus_stats_raw = pd.DataFrame(records, columns=raw_columns)

    return scopus_stats_raw[output_columns]

<h4>Start here</h4>

In [None]:
# Conduct search of the Scopus database
# The query keeps English-language articles whose title/abstract/keywords mention
# retail investors together with at least one social-media or meme-stock term,
# restricted to the listed subject areas.
# NOTE(review): LIMIT-TO(...) looks like refinement syntax from the Scopus web
# interface - confirm that the Search API applies these filters as intended.

search_query = '( TITLE-ABS-KEY ( retail AND investor* ) AND TITLE-ABS-KEY ( "meme stock*" OR "social media" OR "gamestop" OR "robinhood" OR "roaring kitty" OR "reddit" OR "4chan" OR "wallstreetbets" OR "disinformation" OR "viral" OR "influencer*" OR "manipulation" OR "sentiment" OR "activism" OR "facebook" OR "twitter" OR "stocktwits" OR "pump and dump" ) ) AND ( LIMIT-TO ( DOCTYPE , "ar" ) ) AND ( LIMIT-TO ( SUBJAREA , "ECON" ) OR LIMIT-TO ( SUBJAREA , "BUSI" ) OR LIMIT-TO ( SUBJAREA , "SOCI" ) OR LIMIT-TO ( SUBJAREA , "PSYC" ) OR LIMIT-TO ( SUBJAREA , "MULT" ) OR LIMIT-TO ( SUBJAREA , "COMP" ) OR LIMIT-TO ( SUBJAREA , "ARTS" ) ) AND ( LIMIT-TO ( LANGUAGE , "English" ) )'

# Results are kept in `search_df`; the next cell iterates over its rows.
search_df = scopus_search(search_query)


In [None]:
# Go from paper to paper and pull relevant metadata

# Start from empty containers so that re-running this cell does not append
# the same papers and reference lists a second time.
stats_columns = ['EID', 'Scopus ID', 'Cited by', 'Authors', 'Number of authors',
                 'Title', 'Year', 'Source title', 'Number of author keywords',
                 'Abstract', 'Number of references']
stats_records = []
references = []

# Without an 'eid' column there are no usable hits to iterate over.
num_papers = len(search_df) if 'eid' in search_df.columns else 0
# num_papers = 4 # Limiting the number of papers for testing purposes

for paper in range(num_papers):
    eid = search_df.loc[paper, 'eid']
    scopus_id = search_df.loc[paper, 'dc:identifier'].replace("SCOPUS_ID:","")
    cited_count = search_df.loc[paper,'citedby-count']
    try:
        data = fetch_abstract(scopus_id)
        metadata = extract_metadata(data)
    except Exception as err:
        # Best effort: skip papers whose record is missing or incomplete,
        # but let KeyboardInterrupt through so the run can be stopped.
        print('Could not fetch data for Scopus ID: ', scopus_id, '-', repr(err))
        continue
    stats_records.append([eid, scopus_id, cited_count] + metadata[:-1])
    references.append([scopus_id, metadata[-1]])

# Build the frame once instead of growing it row by row.
scopus_stats_raw = pd.DataFrame(stats_records, columns=stats_columns)

scopus_stats = scopus_stats_raw[['Authors', 'Title', 'EID', 'Scopus ID','Year',
       'Source title', 'Abstract','Cited by', 'Number of author keywords',
       'Number of authors', 'Number of references']]

In [None]:
# Flatten every paper's reference list into rows of
# (citing paper, cited paper, title of cited paper, year of cited paper).
ITEMID_COL = 'ref-info.refd-itemidlist.itemid'
TITLE_COL = 'ref-info.ref-title.ref-titletext'
SOURCE_TITLE_COL = 'ref-info.ref-sourcetitle'
YEAR_COL = 'ref-info.ref-publicationyear.@first'

ref_error = []       # reference rows whose id came out as 'CR1', kept for inspection
citation_rows = []

for cited_by, ref_df in references:
    # extract_references returns None when a paper has no reference list
    if ref_df is None or len(ref_df) == 0:
        continue

    # Fall back to the source title only when the frame has no title column at all
    title_col = TITLE_COL if TITLE_COL in ref_df.columns else SOURCE_TITLE_COL

    for _, ref in ref_df.iterrows():
        item_ids = ref.get(ITEMID_COL)
        if isinstance(item_ids, list):
            # Several ids for one reference: keep the one typed 'SGR'. Starting
            # from None avoids reusing the previous reference's id when no
            # 'SGR' entry exists.
            paper = None
            for item in item_ids:
                if isinstance(item, dict) and item.get('@idtype') == 'SGR':
                    paper = item.get('$')
        else:
            # A single id was flattened by json_normalize into its own '.$' column
            paper = ref.get(ITEMID_COL + '.$')

        title = ref.get(title_col)
        year = ref.get(YEAR_COL)
        citation_rows.append([cited_by, paper, title, year])

        if str(paper) == 'CR1':
            ref_error.append(ref)

if ref_error:
    print('{} references carry the id CR1; see ref_error'.format(len(ref_error)))

# Build the frame once (re-running the cell gives the same result) and drop
# repeated rows.
citation_df = pd.DataFrame(
    citation_rows, columns=['Cited by', 'Paper', 'Title', 'Year']
).drop_duplicates()

In [None]:
# For every referenced paper, count the rows of citation_df that cite it,
# most cited first.
local_citations_df = (
    citation_df
    .groupby('Paper')['Cited by']
    .count()
    .to_frame()
    .sort_values(by='Cited by', ascending=False)
)

# Criteria for inclusion in local citations:
# the 99.5th percentile of the citation counts, but never fewer than 3.
min_num_citations = max(3, local_citations_df['Cited by'].quantile(0.995))

print(r'Using {} as minimum number of citations'.format(min_num_citations))
LCT = local_citations_df.loc[
    local_citations_df['Cited by'].ge(min_num_citations)
].reset_index()
print(r'Number of papers: {}'.format(len(LCT)))
LCT.head()

In [None]:
# Identify papers that are currently not in the database
# Build the lookup once; the original rebuilt a list of all known ids for
# every candidate paper.
known_scopus_ids = set(scopus_stats['Scopus ID'])

not_in_db = [
    str(paper_id)
    for paper_id in LCT['Paper']
    if str(paper_id) not in known_scopus_ids
]
print(r'There is a total of {} papers which were previously not included in the database'.format(len(not_in_db)))

In [None]:
# Retrieve metadata for the papers listed in not_in_db; add_papers also appends
# their reference lists to `references`.
df = add_papers(not_in_db)

In [None]:
# Append the newly retrieved papers. De-duplicating on 'Scopus ID' keeps the
# cell idempotent: re-running it no longer stacks the same rows again.
scopus_stats = (
    pd.concat([scopus_stats, df], ignore_index=True)
    .drop_duplicates(subset='Scopus ID', ignore_index=True)
)
scopus_stats.shape

In [None]:
# Save the paper-level table to the working directory, without the index column
scopus_stats.to_csv('scopus_stats.csv', index=False)

In [None]:
# Save the citation edge list to the working directory, without the index column
citation_df.to_csv('citation_df.csv', index=False)