In [1]:
import pandas as pd

In [2]:
# Read the preprocessed dataset into a DataFrame.
# NOTE(review): list-like columns (e.g. 'Addresses new') appear to come back
# as the string repr of a Python list, not as real lists - confirm before
# iterating over their elements.
df = pd.read_csv('preprocessed_data.csv')

In [3]:
# total number of rows in the dataset
print('Total number of rows:', df.shape[0])

Total number of rows: 19631


In [4]:
# list the columns available in the preprocessed dataset
df.columns

Index(['Article Title', 'Source Title', 'Language',
       'Times Cited, All Databases', 'Highly Cited Status', 'Hot Paper Status',
       'Publication Year', 'Decade', 'Group', 'WoS Categories new',
       'Research Areas new', 'Keywords Plus lemmatized',
       'Author Keywords lemmatized', 'All Keywords', 'Addresses new',
       'Affiliations new', 'Abstract lemmatized'],
      dtype='object')

In [5]:
# preview the first few rows to sanity-check the loaded data
df.head()

Unnamed: 0,Article Title,Source Title,Language,"Times Cited, All Databases",Highly Cited Status,Hot Paper Status,Publication Year,Decade,Group,WoS Categories new,Research Areas new,Keywords Plus lemmatized,Author Keywords lemmatized,All Keywords,Addresses new,Affiliations new,Abstract lemmatized
0,Online payment fraud: from anomaly detection t...,FINANCIAL INNOVATION,English,7,,,2023,2020,7,"['Business, Finance', 'Social Sciences, Mathem...","['Business & Economics', 'Mathematical Methods...","['banking fraud', 'system']","['payment fraud risk management', 'anomaly det...",['integration of machine learning and statisti...,['Switzerland'],"['University of Basel', 'Novartis']","['online', 'banking', 'fraud', 'occurs', 'when..."
1,The effect of plough agriculture on gender rol...,JOURNAL OF APPLIED ECONOMETRICS,English,0,,,2024,2020,8,"['Economics', 'Social Sciences, Mathematical M...","['Business & Economics', 'Mathematical Methods...","['woman', 'work']","['average treatment effect', 'causal inference...","['causal inference', 'machine learning', 'woma...","['United Kingdom', 'Netherlands']","['Erasmus University Rotterdam', 'Erasmus Univ...","['paper', 'undertakes', 'replication', 'wide',..."
2,Sector-level equity returns predictability wit...,EMPIRICAL ECONOMICS,English,0,,,2023,2020,7,"['Economics', 'Social Sciences, Mathematical M...","['Business & Economics', 'Mathematical Methods...","['volatility', 'sample', 'jump', 'premium', 'm...","['equity return predictability', 'machine lear...","['equity return predictability', 'sample', 'hy...",['United States'],['Sacred Heart University'],"['paper', 'develop', 'new', 'latent', 'risk', ..."
3,Addressing sample selection bias for machine l...,JOURNAL OF APPLIED ECONOMETRICS,English,0,,,2024,2020,8,"['Economics', 'Social Sciences, Mathematical M...","['Business & Economics', 'Mathematical Methods...","['semiparametric regression model', 'incumbenc...","['control function', 'inverse probability weig...","['machine learning', 'inference', 'big data', ...",['United States'],"['University System of Georgia', 'Georgia Inst...","['study', 'approach', 'adjusting', 'machine', ..."
4,Does model complexity add value to asset alloc...,JOURNAL OF APPLIED ECONOMETRICS,English,1,,,2022,2020,6,"['Economics', 'Social Sciences, Mathematical M...","['Business & Economics', 'Mathematical Methods...","['independent component analysis', 'variable s...","['forecast combination', 'machine learning', '...","['risk', 'combination forecast', 'regression',...","['United Kingdom', 'Ireland']","['University College Dublin', 'University of E...","['study', 'evaluates', 'benefit', 'integrating..."


In [6]:
# check value distribution of Highly Cited Status
# (value_counts() leaves out missing values by default, so rows without a
# status are not part of these counts)
df['Highly Cited Status'].value_counts()

Y    264
Name: Highly Cited Status, dtype: int64

In [7]:
# check value distribution of Hot Paper Status
# (value_counts() leaves out missing values by default, so rows without a
# status are not part of these counts)
df['Hot Paper Status'].value_counts()

N    255
Y      9
Name: Hot Paper Status, dtype: int64

In [8]:
# show title, journal, year and citation count for every Hot Paper
df.loc[
    df['Hot Paper Status'].eq('Y'),
    ['Article Title', 'Source Title', 'Publication Year', 'Times Cited, All Databases'],
]

Unnamed: 0,Article Title,Source Title,Publication Year,"Times Cited, All Databases"
1816,Tourism development and U.S energy security ri...,CURRENT ISSUES IN TOURISM,2024,17
3435,Industry 4.0 enables supply chain resilience a...,TECHNOLOGICAL FORECASTING AND SOCIAL CHANGE,2022,66
4843,Extracting spatial effects from machine learni...,COMPUTERS ENVIRONMENT AND URBAN SYSTEMS,2022,127
7294,"Future smart cities requirements, emerging tec...",CITIES,2022,165
8034,Firm-Level Climate Change Exposure,JOURNAL OF FINANCE,2023,104
10512,"Out of One, Many: Using Language Models to Sim...",POLITICAL ANALYSIS,2023,45
11473,Study of Urban Heat Island Effect in Hangzhou ...,SAGE OPEN,2023,45
14920,Demystifying AI: Current State and Future Role...,ACADEMIC MEDICINE,2024,4
19319,Perception and sensing for autonomous vehicles...,ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SEN...,2023,85


In [9]:
# keep only the rows flagged as Highly Cited
df_highly_cited = df.loc[df['Highly Cited Status'].eq('Y')]

In [10]:
# how many papers are flagged as Highly Cited
print('Total number of Highly Cited papers:', df_highly_cited.shape[0])

Total number of Highly Cited papers: 264


In [11]:
# inspect the country lists attached to the Highly Cited papers
df_highly_cited['Addresses new']

11                  ['United States']
37                  ['United States']
95                    ['Netherlands']
117                 ['United States']
234                       ['Germany']
                     ...             
19573                     ['Germany']
19602                       ['China']
19603    ['Philippines', 'Australia']
19621               ['United States']
19629      ['India', 'United States']
Name: Addresses new, Length: 264, dtype: object

In [12]:
# Count how often each pair of countries appears together on the same
# Highly Cited paper, then keep only the frequently co-occurring pairs.
import ast
from collections import Counter
from itertools import combinations

co_occurrence = Counter()

for row in df_highly_cited['Addresses new']:
    # Each cell is expected to be the string repr of a Python list, e.g.
    # "['India', 'United States']". Anything else (such as NaN for a paper
    # with no address) cannot contribute a pair, so skip it instead of crashing.
    if not isinstance(row, str):
        continue
    # ast.literal_eval only parses Python literals; unlike eval() it cannot
    # execute arbitrary code embedded in the CSV.
    countries = ast.literal_eval(row)
    # Sort each pair so (A, B) and (B, A) are counted under the same key.
    co_occurrence.update(
        tuple(sorted(pair)) for pair in combinations(countries, 2)
    )

# Keep only pairs that co-occur strictly more than `threshold` times
# (i.e. at least threshold + 1 shared papers).
threshold = 2
co_occurrence = {k: v for k, v in co_occurrence.items() if v > threshold}
len(co_occurrence)

25

In [13]:
# Chord diagram of the country pairs counted from 'Addresses new'
# https://holoviews.org/reference/elements/bokeh/Chord.html

import holoviews as hv
from holoviews import opts, dim

hv.extension('bokeh')
hv.output(size=200)

# One row per country pair: (first country, second country, co-occurrence count)
edges = pd.DataFrame(
    [(first, second, n_papers) for (first, second), n_papers in co_occurrence.items()],
    columns=['source', 'target', 'value'],
)

# Every distinct country appearing in an edge becomes a node; reset_index()
# turns the default 0..n-1 row labels into the integer 'index' id column.
nodes = pd.DataFrame(
    {'name': edges[['source', 'target']].stack().unique()}
).reset_index()

# Swap the country names in source/target for the matching node ids.
# After the second merge the two id columns are suffixed: index_x is the
# source id, index_y the target id.
edges = (
    edges
    .merge(nodes, left_on='source', right_on='name')
    .merge(nodes, left_on='target', right_on='name')
    .loc[:, ['index_x', 'index_y', 'value']]
    .rename(columns={'index_x': 'source', 'index_y': 'target'})
)

nodes = hv.Dataset(nodes, 'index')

# Build the holoviews chord element from the edge table and node dataset
chord = hv.Chord((edges, nodes))

# Label nodes with the country name; colour nodes by their id and edges by
# the id of their source node
chord.opts(
    opts.Chord(
        cmap='Category20',
        edge_cmap='Category20',
        edge_color=dim('source').str(),
        labels='name',
        node_color=dim('index').str(),
    )
)

# Display the chord diagram
hv.output(chord)