In [1]:
import pandas as pd

In [2]:
# Read the preprocessed records from a CSV file; the path is relative to the
# current working directory.
df = pd.read_csv('preprocessed_data.csv')

In [3]:
# Total number of rows in the loaded table
row_count = df.shape[0]
print('Total number of rows:', row_count)

Total number of rows: 19631


In [4]:
# Display the column labels of df.
df.columns

Index(['Article Title', 'Source Title', 'Language',
       'Times Cited, All Databases', 'Highly Cited Status', 'Hot Paper Status',
       'Publication Year', 'Decade', 'Group', 'WoS Categories new',
       'Research Areas new', 'Keywords Plus lemmatized',
       'Author Keywords lemmatized', 'All Keywords', 'Addresses new',
       'Affiliations new', 'Abstract lemmatized'],
      dtype='object')

In [5]:
# df.head()

In [6]:
# At this point the list columns are still text, so an empty list is the
# literal string '[]'. Report how many rows hold it in each column.
for column, label in (('WoS Categories new', 'WoS Categories'),
                      ('Research Areas new', 'Research Areas')):
    is_empty = df[column] == '[]'
    print(f'Number of empty list {label}:', int(is_empty.sum()))

Number of empty list WoS Categories: 0
Number of empty list Research Areas: 0


In [7]:
import ast

# Turn the stringified lists read from the CSV back into Python lists.
# ast.literal_eval only accepts Python literals, so text in the data file
# cannot execute code the way eval() would. Values that are no longer strings
# are passed through unchanged, which keeps the cell safe to re-run.
df['WoS Categories new'] = df['WoS Categories new'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [8]:
# build a co-occurrence network of WoS Categories for each group
import networkx as nx
import nx2vos

import os
from itertools import combinations

# open() raises FileNotFoundError when the target folder is missing, so make
# sure it exists before anything is written.
os.makedirs('output/WC', exist_ok=True)

for group in range(1, 9):
    G = nx.Graph()
    for categories in df[df['Group'] == group]['WoS Categories new']:
        # Count every label once per record (first-seen order is kept) so that
        # a repeated label cannot create a self-loop or inflate a pair weight.
        unique_categories = list(dict.fromkeys(categories))
        # Each unordered pair of labels in the same record is one co-occurrence.
        for first, second in combinations(unique_categories, 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Save the network as JSON for VOSviewer
    nx2vos.write_vos_json(G, f'output/WC/WoS_Categories_G{group}.json')

    # Save the links as tab-separated text, heaviest link first
    sorted_edges = sorted(G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)
    # Explicit encoding keeps the file identical across platforms.
    with open(f'output/WC/WoS_Categories_G{group}_links.txt', 'w', encoding='utf-8') as f:
        for source, target, attrs in sorted_edges:
            f.write(f"{source}\t{target}\t{attrs['weight']}\n")

In [9]:
import ast

# Turn the stringified lists read from the CSV back into Python lists.
# ast.literal_eval only accepts Python literals, so text in the data file
# cannot execute code the way eval() would. Values that are no longer strings
# are passed through unchanged, which keeps the cell safe to re-run.
df['Research Areas new'] = df['Research Areas new'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [10]:
# Build a co-occurrence network of Research Areas for each group
import os
from itertools import combinations

# open() raises FileNotFoundError when the target folder is missing, so make
# sure it exists before anything is written.
os.makedirs('output/SU', exist_ok=True)

for group in range(1, 9):
    G = nx.Graph()
    for areas in df[df['Group'] == group]['Research Areas new']:
        # Count every label once per record (first-seen order is kept) so that
        # a repeated label cannot create a self-loop or inflate a pair weight.
        unique_areas = list(dict.fromkeys(areas))
        # Each unordered pair of labels in the same record is one co-occurrence.
        for first, second in combinations(unique_areas, 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Save the network as JSON for VOSviewer
    nx2vos.write_vos_json(G, f'output/SU/Research_Areas_G{group}.json')

    # Save the links as tab-separated text, heaviest link first
    sorted_edges = sorted(G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)
    # Explicit encoding keeps the file identical across platforms.
    with open(f'output/SU/Research_Areas_G{group}_links.txt', 'w', encoding='utf-8') as f:
        for source, target, attrs in sorted_edges:
            f.write(f"{source}\t{target}\t{attrs['weight']}\n")

In [11]:
# Mapping from each broad research field to the research areas grouped under it.
# Source: https://webofscience.help.clarivate.com/en-us/Content/research-areas.html?Highlight=research%20areas
research_fields = {
    "Arts & Humanities": [
        "Architecture",
        "Art",
        "Arts & Humanities - Other Topics",
        "Asian Studies",
        "Classics",
        "Dance",
        "Film, Radio & Television",
        "History",
        "History & Philosophy of Science",
        "Literature",
        "Music",
        "Philosophy",
        "Religion",
        "Theater",
    ],
    "Life Sciences & Biomedicine": [
        "Agriculture",
        "Allergy",
        "Anatomy & Morphology",
        "Anesthesiology",
        "Anthropology",
        "Audiology & Speech-Language Pathology",
        "Behavioral Sciences",
        "Biochemistry & Molecular Biology",
        "Biodiversity & Conservation",
        "Biophysics",
        "Biotechnology & Applied Microbiology",
        "Cardiovascular System & Cardiology",
        "Cell Biology",
        "Critical Care Medicine",
        "Dentistry, Oral Surgery & Medicine",
        "Dermatology",
        "Developmental Biology",
        "Emergency Medicine",
        "Endocrinology & Metabolism",
        "Entomology",
        "Environmental Sciences & Ecology",
        "Evolutionary Biology",
        "Fisheries",
        "Food Science & Technology",
        "Forestry",
        "Gastroenterology & Hepatology",
        "General & Internal Medicine",
        "Genetics & Heredity",
        "Geriatrics & Gerontology",
        "Health Care Sciences & Services",
        "Hematology",
        "Immunology",
        "Infectious Diseases",
        "Integrative & Complementary Medicine",
        "Legal Medicine",
        "Life Sciences Biomedicine Other Topics",
        "Marine & Freshwater Biology",
        "Mathematical & Computational Biology",
        "Medical Ethics",
        "Medical Informatics",
        "Medical Laboratory Technology",
        "Microbiology",
        "Mycology",
        "Neurosciences & Neurology",
        "Nursing",
        "Nutrition & Dietetics",
        "Obstetrics & Gynecology",
        "Oncology",
        "Ophthalmology",
        "Orthopedics",
        "Otorhinolaryngology",
        "Paleontology",
        "Parasitology",
        "Pathology",
        "Pediatrics",
        "Pharmacology & Pharmacy",
        "Physiology",
        "Plant Sciences",
        "Psychiatry",
        "Public, Environmental & Occupational Health",
        "Radiology, Nuclear Medicine & Medical Imaging",
        "Rehabilitation",
        "Reproductive Biology",
        "Research & Experimental Medicine",
        "Respiratory System",
        "Rheumatology",
        "Sport Sciences",
        "Substance Abuse",
        "Surgery",
        "Toxicology",
        "Transplantation",
        "Tropical Medicine",
        "Urology & Nephrology",
        "Veterinary Sciences",
        "Virology",
        "Zoology",
    ],
    "Physical Sciences": [
        "Astronomy & Astrophysics",
        "Chemistry",
        "Crystallography",
        "Electrochemistry",
        "Geochemistry & Geophysics",
        "Geology",
        "Mathematics",
        "Meteorology & Atmospheric Sciences",
        "Mineralogy",
        "Mining & Mineral Processing",
        "Oceanography",
        "Optics",
        "Physical Geography",
        "Physics",
        "Polymer Science",
        "Thermodynamics",
        "Water Resources",
    ],
    "Social Sciences": [
        "Archaeology",
        "Area Studies",
        "Biomedical Social Sciences",
        "Business & Economics",
        "Communication",
        "Criminology & Penology",
        "Cultural Studies",
        "Demography",
        "Development Studies",
        "Education & Educational Research",
        "Ethnic Studies",
        "Family Studies",
        "Geography",
        "Government & Law",
        "International Relations",
        "Linguistics",
        "Mathematical Methods In Social Sciences",
        "Psychology",
        "Public Administration",
        "Social Issues",
        "Social Sciences - Other Topics",
        "Social Work",
        "Sociology",
        "Urban Studies",
        "Women's Studies",
    ],
    "Technology": [
        "Acoustics",
        "Automation & Control Systems",
        "Computer Science",
        "Construction & Building Technology",
        "Energy & Fuels",
        "Engineering",
        "Imaging Science & Photographic Technology",
        "Information Science & Library Science",
        "Instruments & Instrumentation",
        "Materials Science",
        "Mechanics",
        "Metallurgy & Metallurgical Engineering",
        "Microscopy",
        "Nuclear Science & Technology",
        "Operations Research & Management Science",
        "Remote Sensing",
        "Robotics",
        "Science & Technology - Other Topics",
        "Spectroscopy",
        "Telecommunications",
        "Transportation",
    ],
}

# Print how many research areas each field contains
for field, members in research_fields.items():
    print(field, len(members))

Arts & Humanities 14
Life Sciences & Biomedicine 76
Physical Sciences 17
Social Sciences 25
Technology 21


In [12]:
# draw a sunburst plot of the distribution of research fields
# df['Research Areas new'] is a list of research subfields
import pyecharts.options as opts
from pyecharts.charts import Sunburst

import os
from collections import Counter

# Flatten the field -> research areas mapping into one ordered list of areas.
all_subfields = [subfield for subfields in research_fields.values() for subfield in subfields]

# Count how many records list each research area.
observed_counts = Counter(area for areas in df['Research Areas new'] for area in areas)

# An area that is not listed in research_fields cannot be placed under a field;
# report it instead of failing with a KeyError.
unlisted_areas = sorted(set(observed_counts) - set(all_subfields))
if unlisted_areas:
    print('Research areas not listed in research_fields (left out of the plot):', unlisted_areas)

# Every listed area gets an entry; areas that never occur count as 0.
research_subfields_count = {subfield: observed_counts[subfield] for subfield in all_subfields}

# One node per field; its children are the areas that occur at least once.
sunburst_data = []
for field in research_fields:
    sunburst_data.append({
        'name': field,
        'children': [
            {'name': subfield, 'value': research_subfields_count[subfield]}
            for subfield in research_fields[field]
            if research_subfields_count[subfield] > 0
        ]
    })

sunburst = Sunburst(init_opts=opts.InitOpts(width='1500px', height='1500px')) \
    .add(series_name='', data_pair=sunburst_data, radius=[0, '90%'], sort_='null',
         levels=[
            {},
            {"r0": "15%", "r": "35%"},
            {"r0": "35%", "r": "70%"},
            {"r0": "70%", "r": "72%"},
        ]) \
    .set_global_opts(title_opts=opts.TitleOpts(title="Research Areas Sunburst")) \
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}"))

# Make sure the output folder exists before the HTML file is written.
os.makedirs('visualize', exist_ok=True)
sunburst.render('visualize/Research_Areas_Distribution_All.html')

'/Users/ZOU/Desktop/code/visualize/Research_Areas_Distribution_All.html'

In [13]:
# Blank the label (empty string) of every research area with fewer than 300
# records. Note that this edits sunburst_data in place.
for field_node in sunburst_data:
    for child in field_node['children']:
        child['name'] = '' if child['value'] < 300 else child['name']

# Same chart configuration as the full version, built step by step.
sunburst = Sunburst(init_opts=opts.InitOpts(width='1500px', height='1500px'))
sunburst.add(
    series_name='',
    data_pair=sunburst_data,
    radius=[0, '90%'],
    sort_='null',
    levels=[
        {},
        {"r0": "15%", "r": "35%"},
        {"r0": "35%", "r": "70%"},
        {"r0": "70%", "r": "72%"},
    ],
)
sunburst.set_global_opts(title_opts=opts.TitleOpts(title="Research Areas Sunburst"))
sunburst.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}"))

sunburst.render('visualize/Research_Areas_Distribution.html')

'/Users/ZOU/Desktop/code/visualize/Research_Areas_Distribution.html'

In [14]:
# Publications per Social Sciences research area and group.
# Each area maps to a list of eight counters; index g-1 holds the count for group g.
social_science_subfields = research_fields['Social Sciences']
social_science_subfields_count = {}
for name in social_science_subfields:
    social_science_subfields_count[name] = [0] * 8

for areas, group in zip(df['Research Areas new'], df['Group']):
    slot = int(group) - 1
    for area in areas:
        # Only areas that belong to the Social Sciences field are tallied.
        if area not in social_science_subfields:
            continue
        social_science_subfields_count[area][slot] += 1

In [15]:
from pyecharts.charts import Pie, Timeline

# Label shown on the timeline for each group number
group_dic = {1: '1970-2009', 2: '2010-2014', 3: '2015-2019', 4: '2020', 5: '2021', 6: '2022', 7: '2023', 8: '2024'}

attr = social_science_subfields
freq = list(social_science_subfields_count.values())

tl = Timeline(init_opts=opts.InitOpts(width='1500px', height='900px'))
tl.add_schema(is_auto_play=True, play_interval=1000)

# One pie per group, added to the timeline in group order
for group_id in range(1, 9):
    group_values = [per_group[group_id - 1] for per_group in freq]
    pie = Pie()
    pie.add(
        "Social Science Subfields Distribution",
        [[name, value] for name, value in zip(attr, group_values)],
        radius=["30%", "55%"],
    )
    pie.set_global_opts(
        title_opts=opts.TitleOpts(is_show=False),
        legend_opts=opts.LegendOpts(is_show=False),
    )
    # The timeline entry is labelled with the period the group covers
    tl.add(pie, group_dic[group_id])

tl.render("visualize/Social_Science_Subfields_Distribution_Timeline.html")

'/Users/ZOU/Desktop/code/visualize/Social_Science_Subfields_Distribution_Timeline.html'