In [27]:
import json
import pandas as pd
import os

In [44]:
def load_project_frames(root="Project", years=range(2018, 2024)):
    """Load every ``*.json`` file found under ``{root}/{year}/`` into one DataFrame.

    Each file is parsed with ``json.load`` and flattened with
    ``pd.json_normalize``; the per-file frames are concatenated once at the end.

    Parameters
    ----------
    root : str
        Folder that contains one sub-folder per year.
    years : iterable of int
        Years whose sub-folders are scanned, in the given order.

    Returns
    -------
    pandas.DataFrame
        All flattened records with a fresh ``RangeIndex``; an empty frame when
        nothing could be loaded.

    Notes
    -----
    Missing year folders, files that disappear before they are opened, and
    files that fail UTF-8 or JSON decoding are reported with ``print`` and
    skipped; they never abort the load.
    """
    frames = []

    for year in years:
        directory = f"{root}/{year}/"

        # Skip years whose folder is missing instead of aborting the whole load.
        if not os.path.isdir(directory):
            print(f"Directory {directory} does not exist.")
            continue

        # Sorted so the files are always processed in the same order.
        json_files = sorted(f for f in os.listdir(directory) if f.endswith(".json"))

        for file_name in json_files:
            file_path = os.path.join(directory, file_name)
            print("Reading", file_path)

            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    data = json.load(file)
            except FileNotFoundError:
                # The file was listed but vanished before it could be opened.
                print(f"File {file_path} does not exist.")
                continue
            except UnicodeDecodeError as e:
                print(f"UnicodeDecodeError while processing {file_path}: {e}")
                continue
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError in file {file_path}: {e}")
                continue

            frames.append(pd.json_normalize(data))

    # Concatenate once: growing a DataFrame with concat inside the loop copies
    # all previously loaded rows again for every file.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


df = load_project_frames()

# Keep an independent snapshot of the raw data. A plain `original = df` would
# only alias the same object, so in-place edits of `df` would change it too.
original = df.copy()

Reading Project/2018/201800000.json
Reading Project/2018/201800001.json
Reading Project/2018/201800002.json
Reading Project/2018/201800003.json
Reading Project/2018/201800004.json
Reading Project/2018/201800005.json
Reading Project/2018/201800006.json
Reading Project/2018/201800007.json
Reading Project/2018/201800008.json
Reading Project/2018/201800009.json
Reading Project/2018/201800010.json
Reading Project/2018/201800011.json
Reading Project/2018/201800012.json
Reading Project/2018/201800013.json
Reading Project/2018/201800014.json
Reading Project/2018/201800015.json
Reading Project/2018/201800016.json
Reading Project/2018/201800017.json
Reading Project/2018/201800018.json
Reading Project/2018/201800019.json
Reading Project/2018/201800020.json
Reading Project/2018/201800021.json
Reading Project/2018/201800022.json
Reading Project/2018/201800023.json
Reading Project/2018/201800024.json
Reading Project/2018/201800025.json
Reading Project/2018/201800026.json
Reading Project/2018/2018000

In [45]:
# Show the combined frame built by the loading cell (the notebook renders a truncated preview).
df

Unnamed: 0,abstracts-retrieval-response.item.ait:process-info.ait:status.@state,abstracts-retrieval-response.item.ait:process-info.ait:status.@type,abstracts-retrieval-response.item.ait:process-info.ait:status.@stage,abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@day,abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@timestamp,abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@year,abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@month,abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@day,abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@year,abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@month,...,abstracts-retrieval-response.item.bibrecord.head.source.contributor-group.affiliation.@country,abstracts-retrieval-response.item.bibrecord.head.source.contributor-group.affiliation.city,abstracts-retrieval-response.item.bibrecord.head.source.contributor-group.affiliation.organization.$,abstracts-retrieval-response.item.bibrecord.head.related-item.source.part,abstracts-retrieval-response.item.bibrecord.head.source.volisspag.pages,abstracts-retrieval-response.item.bibrecord.head.related-item.citation-info.author-keywords.author-keyword.$,abstracts-retrieval-response.item.bibrecord.head.related-item.citation-info.author-keywords.author-keyword.@xml:lang,abstracts-retrieval-response.item.bibrecord.head.correspondence.person.ce:alias,abstracts-retrieval-response.item.bibrecord.head.related-item.source.bib-text,abstracts-retrieval-response.item.bibrecord.tail.bibliography.reference.@reference-instance-id
0,update,core,S300,19,2020-05-19T23:06:15.000015-04:00,2020,05,31,2018,12,...,,,,,,,,,,
1,update,core,S300,10,2020-02-10T15:56:06.000006-05:00,2020,02,31,2018,12,...,,,,,,,,,,
2,update,core,S300,23,2021-02-23T17:24:20.000020-05:00,2021,02,31,2018,12,...,,,,,,,,,,
3,update,core,S300,31,2021-07-31T11:32:34.000034-04:00,2021,07,31,2018,12,...,,,,,,,,,,
4,update,core,S300,13,2020-02-13T09:36:10.000010-05:00,2020,02,31,2018,12,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211,update,core,S300,04,2023-07-04T20:47:40.000040-04:00,2023,07,01,2023,01,...,,,,,,,,,,
20212,update,core,S300,21,2023-02-21T10:36:34.000034-05:00,2023,02,01,2023,01,...,,,,,,,,,,
20213,update,core,S300,07,2023-06-07T01:07:03.000003-04:00,2023,06,01,2023,01,...,,,,,,,,,,
20214,update,core,S300,04,2023-03-04T04:52:04.000004-05:00,2023,03,01,2023,01,...,,,,,,,,,,


In [46]:
# List the column labels produced by flattening the nested JSON records.
df.columns

Index(['abstracts-retrieval-response.item.ait:process-info.ait:status.@state',
       'abstracts-retrieval-response.item.ait:process-info.ait:status.@type',
       'abstracts-retrieval-response.item.ait:process-info.ait:status.@stage',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@day',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@timestamp',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@year',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@month',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@day',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@year',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@month',
       ...
       'abstracts-retrieval-response.item.bibrecord.head.source.contributor-group.affiliation.@country',
       'abstracts-retrieval-response.item.bibrecord.head

In [47]:
# Remove columns that are mostly empty.
# `threshold` is the largest fraction of null values a column may have and still be kept.
threshold = 0.5

# Fraction of null values per column, computed once for the whole frame.
null_fraction = df.isnull().mean()

# Highest null fraction across all columns (0 when no column has any nulls); printed below.
max_null = null_fraction.max() if null_fraction.gt(0).any() else 0

# Drop every sparse column in one call. Rebinding `df` (rather than dropping
# in place, column by column) leaves other references to the loaded frame,
# such as `original`, untouched and avoids rebuilding the frame per column.
sparse_cols = null_fraction.index[null_fraction > threshold]
df = df.drop(columns=sparse_cols)

print(max_null)

1.0


In [48]:
# Re-check the column labels after the sparse-column filter above.
df.columns

Index(['abstracts-retrieval-response.item.ait:process-info.ait:status.@state',
       'abstracts-retrieval-response.item.ait:process-info.ait:status.@type',
       'abstracts-retrieval-response.item.ait:process-info.ait:status.@stage',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@day',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@timestamp',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@year',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@month',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@day',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@year',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@month',
       ...
       'abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.xocs:funding',
       'abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.xocs:f

In [49]:
# Remove columns judged not useful for the analysis (identifiers, links,
# redundant date parts, abbreviated duplicates of other columns, free text).

useless_cols = [
    "abstracts-retrieval-response.item.ait:process-info.ait:status.@type",
    "abstracts-retrieval-response.item.ait:process-info.ait:status.@state",
    "abstracts-retrieval-response.item.ait:process-info.ait:status.@stage",  # just status
    "abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@day",
    "abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@timestamp",
    "abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@month",  # just year is enough
    "abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@day",
    "abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@month",
    "abstracts-retrieval-response.item.bibrecord.head.abstracts",
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.affiliation.@country",  # short form of country
    "abstracts-retrieval-response.item.bibrecord.head.citation-info.citation-type.@code",  # just type
    "abstracts-retrieval-response.item.bibrecord.head.citation-info.citation-language.@xml:lang",  # short form of language
    "abstracts-retrieval-response.item.bibrecord.head.source.sourcetitle-abbrev",  # title not needed
    "abstracts-retrieval-response.item.bibrecord.head.source.website.ce:e-address.$",  # website
    "abstracts-retrieval-response.item.bibrecord.head.source.website.ce:e-address.@type",  # type of above
    "abstracts-retrieval-response.item.bibrecord.head.source.volisspag.pagerange.@first",
    "abstracts-retrieval-response.item.bibrecord.head.source.volisspag.pagerange.@last",  # could be useful for a page-count based prediction
    "abstracts-retrieval-response.item.bibrecord.head.source.@type",  # just type
    "abstracts-retrieval-response.item.bibrecord.head.source.sourcetitle",  # just title
    "abstracts-retrieval-response.item.bibrecord.head.source.@srcid",  # id
    "abstracts-retrieval-response.item.bibrecord.head.source.publicationdate.month",
    "abstracts-retrieval-response.item.bibrecord.head.source.publicationdate.year",  # redundant publish year
    "abstracts-retrieval-response.item.bibrecord.head.source.publicationdate.date-text.@xfab-added",
    "abstracts-retrieval-response.item.bibrecord.head.source.publicationdate.date-text.$",
    "abstracts-retrieval-response.item.bibrecord.head.source.publicationdate.day",  # just year is enough
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.person.ce:initials",  # only the initials of the name
    "abstracts-retrieval-response.item.bibrecord.item-info.copyright.$",
    "abstracts-retrieval-response.item.bibrecord.item-info.copyright.@type",
    "abstracts-retrieval-response.item.bibrecord.item-info.history.date-created.@day",
    "abstracts-retrieval-response.item.bibrecord.item-info.history.date-created.@timestamp",
    "abstracts-retrieval-response.item.bibrecord.item-info.history.date-created.@year",
    "abstracts-retrieval-response.item.bibrecord.item-info.history.date-created.@month",  # author's note: created in 2020 although the file is from 2018
    "abstracts-retrieval-response.item.bibrecord.item-info.itemidlist.itemid",
    "abstracts-retrieval-response.item.bibrecord.item-info.itemidlist.ce:doi",  # ids
    "abstracts-retrieval-response.coredata.srctype",  # just type
    "abstracts-retrieval-response.coredata.eid",  # ids
    "abstracts-retrieval-response.coredata.prism:url",
    "abstracts-retrieval-response.coredata.subtypeDescription",
    "abstracts-retrieval-response.coredata.link",
    "abstracts-retrieval-response.coredata.source-id",
    "abstracts-retrieval-response.coredata.prism:endingPage",
    "abstracts-retrieval-response.coredata.openaccess",
    "abstracts-retrieval-response.coredata.openaccessFlag",  # not necessary
    "abstracts-retrieval-response.coredata.prism:doi",
    "abstracts-retrieval-response.coredata.prism:startingPage",
    "abstracts-retrieval-response.coredata.subtype",
    "abstracts-retrieval-response.coredata.dc:identifier",  # also id
    "abstracts-retrieval-response.coredata.publishercopyright",
    # "abstracts-retrieval-response.coredata.dc:publisher" is deliberately kept.
    "abstracts-retrieval-response.language.@xml:lang",
    "abstracts-retrieval-response.authors.author",  # redundant author
    "abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.@pui-match",  # meaning unclear
    "abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.@has-funding-info",
    "abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.xocs:funding-addon-generated-timestamp",
    "abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.xocs:funding-addon-type",  # link
    "abstracts-retrieval-response.item.bibrecord.head.citation-info.abstract-language.@xml:lang",  # redundant with language
    "abstracts-retrieval-response.item.bibrecord.head.source.translated-sourcetitle.$",  # title
    "abstracts-retrieval-response.item.bibrecord.head.source.volisspag.voliss.@volume",
    "abstracts-retrieval-response.item.bibrecord.head.source.issn",  # id?
    "abstracts-retrieval-response.coredata.dc:description",
    "abstracts-retrieval-response.coredata.prism:volume",
    "abstracts-retrieval-response.coredata.prism:issn",
    "abstracts-retrieval-response.item.xocs:meta.xocs:funding-list.xocs:funding-text",  # basically a description
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.affiliation.postal-code",
    "abstracts-retrieval-response.item.bibrecord.head.citation-info.author-keywords.author-keyword",  # probably too many unique values
    "abstracts-retrieval-response.authkeywords.author-keyword",  # probably too many unique values
    "abstracts-retrieval-response.item.bibrecord.head.source.volisspag.voliss.@issue",
    "abstracts-retrieval-response.coredata.prism:issueIdentifier",
]

# Report listed names that are not in the frame (already dropped on a previous
# run of this cell, or misspelled) so they stay visible instead of raising.
missing_cols = [col for col in useless_cols if col not in df.columns]
if missing_cols:
    print(f"Skipping {len(missing_cols)} columns that are not in df: {missing_cols}")

# Rebind instead of dropping in place: the cell can be re-run safely and other
# references to the frame (such as `original`) are not modified.
df = df.drop(columns=useless_cols, errors="ignore")

In [50]:
# Column labels remaining after dropping the columns listed in `useless_cols`.
df.columns

Index(['abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@year',
       'abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@year',
       'abstracts-retrieval-response.item.bibrecord.head.author-group',
       'abstracts-retrieval-response.item.bibrecord.head.citation-title',
       'abstracts-retrieval-response.item.bibrecord.head.correspondence.affiliation.country',
       'abstracts-retrieval-response.item.bibrecord.head.correspondence.affiliation.city',
       'abstracts-retrieval-response.item.bibrecord.head.correspondence.affiliation.organization',
       'abstracts-retrieval-response.item.bibrecord.head.correspondence.person.ce:given-name',
       'abstracts-retrieval-response.item.bibrecord.head.correspondence.person.ce:surname',
       'abstracts-retrieval-response.item.bibrecord.head.correspondence.person.ce:indexed-name',
       'abstracts-retrieval-response.item.bibrecord.head.citation-info.citation-language.@language',
       'abstracts-

In [51]:
# Map the long dotted labels produced by flattening the JSON to short
# snake_case-style names. Labels that are not keys of this mapping keep their
# current name; keys that are not present in the frame are ignored by `rename`.
column_renaming = {
    "abstracts-retrieval-response.item.ait:process-info.ait:date-delivered.@year": "date_delivered_year",
    "abstracts-retrieval-response.item.ait:process-info.ait:date-sort.@year": "date_sort_year",
    "abstracts-retrieval-response.item.bibrecord.head.author-group": "author_group",
    "abstracts-retrieval-response.item.bibrecord.head.citation-title": "citation_title",
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.affiliation.country": "affiliation_country",
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.affiliation.city": "affiliation_city",
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.affiliation.organization": "affiliation_organization",
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.person.ce:given-name": "corresponding_author_given_name",
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.person.ce:surname": "corresponding_author_surname",
    "abstracts-retrieval-response.item.bibrecord.head.correspondence.person.ce:indexed-name": "corresponding_author_indexed_name",
    "abstracts-retrieval-response.item.bibrecord.head.citation-info.citation-language.@language": "citation_language",
    "abstracts-retrieval-response.item.bibrecord.head.citation-info.abstract-language.@language": "abstract_language",
    "abstracts-retrieval-response.item.bibrecord.head.source.@country": "source_country",
    "abstracts-retrieval-response.item.bibrecord.head.source.translated-sourcetitle.@xml:lang": "source_translated_title_lang",
    "abstracts-retrieval-response.item.bibrecord.head.source.publicationyear.@first": "source_publication_year",
    "abstracts-retrieval-response.item.bibrecord.head.source.publisher.publishername": "source_publisher_name",
    "abstracts-retrieval-response.item.bibrecord.head.enhancement.classificationgroup.classifications": "classificationgroup",
    "abstracts-retrieval-response.item.bibrecord.item-info.dbcollection": "dbcollection",
    "abstracts-retrieval-response.item.bibrecord.tail.bibliography.@refcount": "ref_count",
    "abstracts-retrieval-response.item.bibrecord.tail.bibliography.reference": "reference",
    "abstracts-retrieval-response.affiliation": "affiliation",
    "abstracts-retrieval-response.coredata.prism:coverDate": "coverDate",
    "abstracts-retrieval-response.coredata.prism:aggregationType": "aggregationType",
    "abstracts-retrieval-response.coredata.dc:creator.author": "author",
    "abstracts-retrieval-response.coredata.prism:publicationName": "publicationName",
    "abstracts-retrieval-response.coredata.citedby-count": "citedby_count",
    "abstracts-retrieval-response.coredata.prism:pageRange": "pageRange",
    "abstracts-retrieval-response.coredata.dc:title": "title",
    "abstracts-retrieval-response.subject-areas.subject-area": "subject_area",
    "abstracts-retrieval-response.coredata.dc:publisher": "publisher",
    "abstracts-retrieval-response.idxterms.mainterm": "mainterm",
}

# Rebind instead of renaming in place so that other references to the frame
# (such as `original`) keep their current column labels.
df = df.rename(columns=column_renaming)

In [52]:
# Print the fraction of missing values for every column, one "name fraction" line each.
null_share = df.isnull().mean()

for column_name, share in null_share.items():
    print(column_name, share)

date_delivered_year 0.0
date_sort_year 0.0
author_group 0.1371191135734072
citation_title 4.9465769687376336e-05
affiliation_country 0.19316383062920459
affiliation_city 0.2485160269093787
affiliation_organization 0.24045310645033638
corresponding_author_given_name 0.22897704788286505
corresponding_author_surname 0.19158092599920853
corresponding_author_indexed_name 0.19158092599920853
citation_language 0.00034626038781163435
source_country 9.893153937475267e-05
source_translated_title_lang 0.0018302334784329245
source_publication_year 0.0
source_publisher_name 0.0005441234665611396
classificationgroup 0.0
dbcollection 0.0
ref_count 0.020330431341511673
reference 0.021369212504946576
affiliation 0.22877918480411555
coverDate 0.0
aggregationType 0.0
author 0.0
publicationName 0.0
citedby_count 0.00019786307874950534
pageRange 0.4024535021764939
title 4.9465769687376336e-05
publisher 0.0005441234665611396
subject_area 0.0
abstract_language 0.04437079540957657
mainterm 0.4279778393351801


In [53]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216 entries, 0 to 20215
Data columns (total 35 columns):
 #   Column                                                                           Non-Null Count  Dtype 
---  ------                                                                           --------------  ----- 
 0   date_delivered_year                                                              20216 non-null  object
 1   date_sort_year                                                                   20216 non-null  object
 2   author_group                                                                     17444 non-null  object
 3   citation_title                                                                   20215 non-null  object
 4   affiliation_country                                                              16311 non-null  object
 5   affiliation_city                                                                 15192 non-null  object
 6   affiliation_or

In [42]:
# Preview the frame after the column clean-up and renaming.
# NOTE(review): this cell's execution count (42) is lower than the cells before it and
# its saved output ends at row 2790, while df.info() reports 20216 rows, so the output
# shown here appears to come from an earlier run; re-run the notebook top to bottom.
df

Unnamed: 0,date_delivered_year,date_sort_year,author_group,citation_title,affiliation_country,affiliation_city,affiliation_organization,corresponding_author_given_name,corresponding_author_surname,corresponding_author_indexed_name,...,aggregationType,author,publicationName,citedby_count,pageRange,title,publisher,subject_area,abstract_language,mainterm
0,2020,2018,"[{'affiliation': {'country': 'Thailand', '@afi...",Public health and international epidemiology f...,Thailand,Bangkok,[{'$': 'Department of Preventive and Social Me...,Krit,Pongpirul,Pongpirul K.,...,Book,"[{'ce:given-name': 'Krit', 'preferred-name': {...","Radiology in Global Health: Strategies, Implem...",1,175-183,Public health and international epidemiology f...,Springer International Publishing,"[{'@_fa': 'true', '$': 'Medicine (all)', '@cod...",,
1,2020,2018,,Flexible Printed Active Antenna for Digital Te...,,,,,,,...,Conference Proceeding,"[{'ce:given-name': 'Teerapong', 'preferred-nam...",Progress in Electromagnetics Research Symposium,1,1538-1541,Flexible Printed Active Antenna for Digital Te...,Institute of Electrical and Electronics Engine...,"[{'@_fa': 'true', '$': 'Electrical and Electro...",English,"[{'$': 'Antenna dimensions', '@weight': 'b', '..."
2,2021,2018,"[{'affiliation': {'country': 'Thailand', 'post...",Parametric study of hydrogen production via so...,Thailand,Bangkok,"[{'$': 'Fuels Research Center'}, {'$': 'Depart...",Benjapon,Chalermsinsuwan,Chalermsinsuwan B.,...,Journal,"[{'ce:given-name': 'Kiattikhoon', 'preferred-n...",Chemical Engineering Science,21,1041-1057,Parametric study of hydrogen production via so...,Elsevier Ltd,"[{'@_fa': 'true', '$': 'Chemistry (all)', '@co...",English,"[{'$': 'Circulating fluidized bed', '@weight':..."
3,2021,2018,"[{'affiliation': {'country': 'Thailand', 'post...",Superhydrophobic coating from fluoroalkylsilan...,Thailand,Pathumthani,"[{'$': 'Faculty of Science and Technology'}, {...",Suwadee,Kongparakul,Kongparakul S.,...,Journal,"[{'ce:given-name': 'Jittraporn', 'preferred-na...",Applied Surface Science,37,164-174,Superhydrophobic coating from fluoroalkylsilan...,Elsevier B.V.,"[{'@_fa': 'true', '$': 'Chemistry (all)', '@co...",English,"[{'$': 'Core-shell morphologies', '@weight': '..."
4,2020,2018,"[{'affiliation': {'country': 'Thailand', 'addr...",Electrochemical impedance-based DNA sensor usi...,United States,Fort Collins,"[{'$': 'Department of Chemistry'}, {'$': 'Colo...",Charles S.,Henry,Henry C.S.,...,Journal,"[{'ce:given-name': 'Prinjaporn', 'preferred-na...",Analytica Chimica Acta,68,102-109,Electrochemical impedance-based DNA sensor usi...,Elsevier B.V.,"[{'@_fa': 'true', '$': 'Analytical Chemistry',...",English,"[{'$': 'acpcPNA', '@weight': 'b', '@candidate'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2787,2020,2018,"[{'affiliation': {'country': 'Thailand', '@afi...",Association between leukocyte telomere length ...,Thailand,Bangkok,"[{'$': 'Department of Biochemistry'}, {'$': 'F...",Sittisak,Honsawek,Honsawek S.,...,Journal,"[{'ce:given-name': 'Thitiya', 'preferred-name'...",International Journal of Rheumatic Diseases,21,118-125,Association between leukocyte telomere length ...,Blackwell Publishing,"[{'@_fa': 'true', '$': 'Rheumatology', '@code'...",English,"[{'$': 'Aged', '@weight': 'b', '@candidate': '..."
2788,2020,2018,"[{'affiliation': {'country': 'Thailand', 'addr...",Anaerobic co-digestion of hydrolysate from alk...,Thailand,Bangkok,[{'$': 'Department of Environmental Engineerin...,Orathai,Chavalparit,Chavalparit O.,...,Journal,"[{'ce:given-name': 'Orathai', 'preferred-name'...",Journal of Material Cycles and Waste Management,5,336-344,Anaerobic co-digestion of hydrolysate from alk...,Springer Tokyoorders@springer.jp,"[{'@_fa': 'true', '$': 'Waste Management and D...",English,
2789,2021,2018,"[{'affiliation': {'country': 'United Kingdom',...",Bullying at work: Cognitive appraisal of negat...,United Kingdom,London,"[{'$': 'Business School'}, {'$': 'University o...",Rebecca,Hewett,Hewett R.,...,Journal,"[{'ce:given-name': 'Rebecca', 'preferred-name'...",Journal of Occupational Health Psychology,49,71-84,Bullying at work: Cognitive appraisal of negat...,American Psychological Association Inc.journal...,"[{'@_fa': 'true', '$': 'Applied Psychology', '...",English,"[{'$': 'Adaptation, Psychological', '@weight':..."
2790,2020,2018,,Three-dimensional interaction diagram for the ...,Thailand,Bangkok,"[{'$': 'Geotechnical Research Unit'}, {'$': 'F...",Boonchai,Ukritchon,Ukritchon B.,...,Journal,"[{'ce:given-name': 'Suraparb', 'preferred-name...",International Journal of Geotechnical Engineering,7,133-146,Three-dimensional interaction diagram for the ...,Taylor and Francis Ltd.michael.wagreich@univie...,"[{'@_fa': 'true', '$': 'Environmental Engineer...",English,


In [43]:
# Count records per country of the corresponding author's affiliation
# (value_counts skips missing values by default and sorts by frequency).
df["affiliation_country"].value_counts()

affiliation_country
Thailand                1777
United States            136
Japan                     75
Australia                 45
United Kingdom            28
China                     27
Malaysia                  19
Canada                    16
Singapore                 11
Brazil                    11
South Korea               11
Germany                   10
France                    10
India                      8
Pakistan                   8
Austria                    7
Armenia                    7
Taiwan                     6
Poland                     6
Belgium                    4
Netherlands                4
Italy                      4
Sweden                     4
Iceland                    4
Hong Kong                  4
Denmark                    4
Switzerland                4
Viet Nam                   3
Spain                      3
Indonesia                  2
New Zealand                2
Saudi Arabia               2
Norway                     2
Romania                