In [1]:
import pandas as pd
import json
import os

# Class for Restructuring

#### Note for Restructuring

There are 7 columns.

| Column         | Description                         |
|----------------|-------------------------------------|
| id             | Unique identifier                   |
| title          | Title of research                   |
| category       | tuple (code, subject name, abbrev)  |
| authors        | List of authors                     |
| year           | Year of publication                 |
| abstracts      | Abstract text                       |
| references     | List of references (id of research) |


In [2]:
class utils:
    def id_apply(x):
        return x["prism:url"].split("/")[-1]
    
    def title_apply(x):
        if "dc:title" not in x or x["dc:title"] == None:
            return None
        return x["dc:title"]
    
    def author_apply(x):
        # name_list = []
        combine_name = ""
        for author in x["author"]:
            name_to_append = ""
            if "ce:given-name" in author:
                name_to_append = f"{author['ce:given-name']} {author['ce:surname']}"
            else: 
                name_to_append = author["ce:indexed-name"]
            # name_list.append(name_to_append)
            combine_name += name_to_append + ", "
        return combine_name
    
    def abstracts_apply(x):
        if "dc:description" not in x or x["dc:description"] == None:
            return None
        return x["dc:description"]
    
    def reference_apply(x):
        if x["bibrecord"] == None or x["bibrecord"]["tail"] == None or x["bibrecord"]["tail"]["bibliography"] == None or x["bibrecord"]["tail"]["bibliography"]["reference"] == None:
            return None
        reference_list = []
        temp = x["bibrecord"]["tail"]["bibliography"]["reference"]
        if type(temp) != type([]):
            temp = [temp]
        for ref in temp:
            if "ref-info" not in ref or "refd-itemidlist" not in ref["ref-info"]:
                continue
            if type(ref["ref-info"]["refd-itemidlist"]["itemid"]) == type([]):
                for j in ref["ref-info"]["refd-itemidlist"]["itemid"]:
                    if "@idtype" in j and j["@idtype"] == "SGR":
                        reference_list.append(j["$"])
            else:
                if "@idtype" in ref["ref-info"]["refd-itemidlist"]["itemid"] and ref["ref-info"]["refd-itemidlist"]["itemid"]["@idtype"] == "SGR":
                    reference_list.append(ref["ref-info"]["refd-itemidlist"]["itemid"]["$"])
        if len(reference_list) == 0:
            return None
        return reference_list
    
    def subject_apply(x):
        subject_list = []
        for subject in x["subject-area"]:
            name = subject["$"]
            code = subject["@code"]
            abbrev = subject["@abbrev"]
            subject_list.append((name, code, abbrev))
        return subject_list

In [3]:
class PrepareData:
    def __init__(self) -> None:
        self.data = None
        self.data_dir = "./data"
    
    def load_data(self):
        dfs = []
        dir_list = os.listdir("./data")
        cnt = 0
        for i in dir_list:
            if i == ".DS_Store":
                continue

            # Handle too many data
            # if not (i == "2018" or i == "2019"):
            #     continue
            
            sub_dir = os.path.join("./data", i)
            files = os.listdir(sub_dir)
            for f in files:
                if f == ".DS_Store":
                    continue

                file_path = os.path.join(sub_dir, f)
                with open(file_path, "r") as file:
                    json_data = json.load(file)

                row = {key: value for key, value in json_data["abstracts-retrieval-response"].items()}
                row["year"] = int(i)
                dfs.append(pd.DataFrame([row]))
                cnt += 1

        df = pd.concat(dfs, ignore_index=True)
        print("Total data: ", cnt)
        self.data = df

    def save_data(self, file_name:str):
        if file_name.split(".")[-1] != "csv":
            file_name += ".csv"

        self.data.to_csv(file_name, index=False)

    def restructure(self):
        df = self.data.copy()
        
        df["id"] = df["coredata"].apply(utils.id_apply)
        df["title"] = df["coredata"].apply(utils.title_apply)
        df["authors"] = df["authors"].apply(utils.author_apply)
        df["abstract"] = df["coredata"].apply(utils.abstracts_apply)
        df["references"] = df["item"].apply(utils.reference_apply)
        df["category"] = df["subject-areas"].apply(utils.subject_apply)
        df["source"] = "scopus"
        
        # Drop unnecessary columns
        df.drop(columns=["item", "affiliation", "coredata", "idxterms", "language", "authkeywords", "subject-areas"], inplace=True)
        
        # Explode subject-areas
        # df = df.explode("category", ignore_index=True)
        
        self.data = df

    def null_count(self):
        for c in self.data.columns:
            print(f"Column {c} have {self.data[c].isnull().sum()} null values")

In [4]:
data = PrepareData()

In [5]:
data.load_data()

Total data:  20216


In [6]:
print(f"Before restructure: {data.data.shape}")

Before restructure: (20216, 9)


In [7]:
data.data

Unnamed: 0,item,affiliation,coredata,idxterms,language,authkeywords,subject-areas,authors,year
0,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'j', 'eid': '2-s2.0-85131139456', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Ant...","{'subject-area': [{'@_fa': 'true', '$': 'Food ...","{'author': [{'ce:given-name': 'Sureerat', 'pre...",2022
1,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Abu Dhabi', '@id': '601...","{'srctype': 'j', 'eid': '2-s2.0-85121351780', ...","{'mainterm': [{'$': 'COVID-19', '@weight': 'a'...",{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'car...","{'subject-area': [{'@_fa': 'true', '$': 'Surge...","{'author': [{'ce:given-name': 'Abhijit V.', 'p...",2022
2,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60002...","{'srctype': 'j', 'eid': '2-s2.0-85131660961', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Alz...","{'subject-area': [{'@_fa': 'true', '$': 'Neuro...","{'author': [{'ce:given-name': 'Solaphat', 'pre...",2022
3,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Linkoping', '@id': '600...","{'srctype': 'j', 'prism:issueIdentifier': '6',...","{'mainterm': [{'$': 'Anisotropic strain', '@we...",{'@xml:lang': 'eng'},,"{'subject-area': [{'@_fa': 'true', '$': 'Physi...","{'author': [{'ce:given-name': 'Erik', 'preferr...",2022
4,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60028...","{'srctype': 'j', 'eid': '2-s2.0-85143878806', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Bra...","{'subject-area': [{'@_fa': 'true', '$': 'Physi...","{'author': [{'ce:given-name': 'Kunanya', 'pref...",2022
...,...,...,...,...,...,...,...,...,...
20211,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Khon Kaen', '@id': '600...","{'srctype': 'k', 'eid': '2-s2.0-85063382127', ...",{'mainterm': [{'$': 'Actual evapotranspiration...,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Act...","{'subject-area': [{'@_fa': 'true', '$': 'Artif...","{'author': [{'ce:given-name': 'Panath', 'prefe...",2018
20212,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Tokyo', '@id': '6017860...","{'srctype': 'j', 'eid': '2-s2.0-85047370082', ...","{'mainterm': [{'$': 'Carcinoma, Hepatocellular...",{'@xml:lang': 'eng'},,"{'subject-area': [{'@_fa': 'true', '$': 'Multi...","{'author': [{'ce:given-name': 'Hiromi', 'prefe...",2018
20213,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60028...","{'srctype': 'j', 'eid': '2-s2.0-85042389729', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Mot...","{'subject-area': [{'@_fa': 'true', '$': 'Medic...","{'author': [{'ce:given-name': 'Sirinuch', 'pre...",2018
20214,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'j', 'eid': '2-s2.0-85055194236', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Can...","{'subject-area': [{'@_fa': 'true', '$': 'Veter...","{'author': [{'ce:given-name': 'Chutamas', 'pre...",2018


In [8]:
data.restructure()

In [9]:
print(f"After restructure: {data.data.shape}")

After restructure: (20216, 8)


In [10]:
data.data

Unnamed: 0,authors,year,id,title,abstract,references,category,source
0,"Sureerat Thuekeaw, Kris Angkanaporn, Chackrit ...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664...","[(Food Science, 1106, AGRI), (Physiology, 1314...",scopus
1,"Abhijit V. Lele, Sarah Wahlster, Bhunyawee Alu...",2022,85121351780,Perceptions Regarding the SARS-CoV-2 Pandemic'...,Background: The SARS-CoV-2 (COVID-19) pandemic...,"[85104589379, 85083241171, 85078262578, 850841...","[(Surgery, 2746, MEDI), (Neurology (clinical),...",scopus
2,"Solaphat Hemrungrojn, Arisara Amrapala, Michae...",2022,85131660961,Construction of a short version of the Montrea...,Background: The Montreal Cognitive Assessment ...,"[84982975791, 84871671961, 85097597113, 751491...","[(Neuroscience (all), 2800, NEUR)]",scopus
3,"Erik Johansson, Ferenc Tasnádi, Annop Ektarawo...",2022,85124670542,The effect of strain and pressure on the elect...,Different theoretical methodologies are employ...,"[0035282206, 0035508561, 0035858409, 183444013...","[(Physics and Astronomy (all), 3100, PHYS)]",scopus
4,"Kunanya Masodsai, Rungchai Chaunchaiyakul,",2022,85143878806,Dynamic Cardiopulmonary and Metabolic Function...,The purpose of this study was to investigate a...,"[25444452457, 49949090130, 84860884417, 848934...","[(Physiology (medical), 2737, MEDI)]",scopus
...,...,...,...,...,...,...,...,...
20211,"Panath Jermthaisong, Sununtha Kingpaiboon, Pet...",2018,85063382127,Estimating actual evapotranspiration from NDVI...,Evapotranspiration (ET) is the sum of evaporat...,"[0003491818, 79952741157, 0028978678, 00247710...","[(Artificial Intelligence, 1702, COMP)]",scopus
20212,"Hiromi Sawai, Nao Nishida, Seik-Soon Khor, Mas...",2018,85047370082,Genome-wide association study identified new s...,We have performed a genome-wide association st...,"[84947346338, 0033935745, 14944385553, 0035467...","[(Multidisciplinary, 1000, MULT)]",scopus
20213,"Sirinuch Utarapichat, Wasuwat Kitisomprayoonkul,",2018,85042389729,Effects of transcranial direct current stimula...,Background: Anodal transcranial direct current...,"[33947494560, 33745243839, 36849022625, 751491...","[(Medicine (all), 2700, MEDI)]",scopus
20214,"Chutamas Benjanirut, Chanakarn Wongsangchan, P...",2018,85055194236,Prevalence and risk factors for canine cogniti...,Canine cognitive dysfunction syndrome (CDS) is...,"[59649128451, 0035379688, 84860848937, 8505515...","[(Veterinary (all), 3400, VETE)]",scopus


In [11]:
data.data.sample(5)

Unnamed: 0,authors,year,id,title,abstract,references,category,source
16844,"Suwanna Satha-Anand,",2020,85121090745,Caring detachment in buddhism and implications...,,"[33749608511, 84944723134, 84884054857, 000881...","[(Arts and Humanities (all), 1200, ARTS), (Soc...",scopus
13562,"Atchara Phumee, Supaporn Wacharapluesadee, Sin...",2021,85116357288,Detection of Changuinola virus (Reoviridae: Or...,Background: Phlebotomine sand flies are vector...,"[84962467987, 85051020221, 85074108369, 850669...","[(Parasitology, 2405, IMMU), (Public Health, E...",scopus
11709,"Chayapol Tungphatthong, Santhosh Kumar J. Urum...",2021,85103347890,"Differentiation of Mitragyna speciosa, a narco...","Mitragyna speciosa (Korth.) Havil. [MS], or “k...","[85016137630, 34447134127, 85103357518, 002420...","[(Multidisciplinary, 1000, MULT)]",scopus
6646,"Onsurang Wattanathamsan, Varisa Pongrakhananon,",2023,85117372713,Post-translational modifications of tubulin: t...,Microtubules play an important role in regulat...,"[0034743675, 85065562990, 0035667263, 85070377...","[(Molecular Medicine, 1313, BIOC), (Molecular ...",scopus
13881,"Siraphop Thapmongkol, Jarun Sayasathid, Jessad...",2021,85112770433,Growth of the pulmonary valve annulus after th...,Background: The surgical outcomes of tetralogy...,"[80855144819, 33645332975, 77956844737, 770492...","[(Pediatrics, Perinatology and Child Health, 2...",scopus


In [12]:
data.null_count()

Column authors have 0 null values
Column year have 0 null values
Column id have 0 null values
Column title have 1 null values
Column abstract have 665 null values
Column references have 411 null values
Column category have 0 null values
Column source have 0 null values


In [13]:
data.data.dropna(subset=["abstract"], inplace=True)
data.data.dropna(subset=["references"], inplace=True)
data.data.dropna(subset=["title"], inplace=True)

data.data.shape

(19275, 8)

In [14]:
data.data

Unnamed: 0,authors,year,id,title,abstract,references,category,source
0,"Sureerat Thuekeaw, Kris Angkanaporn, Chackrit ...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664...","[(Food Science, 1106, AGRI), (Physiology, 1314...",scopus
1,"Abhijit V. Lele, Sarah Wahlster, Bhunyawee Alu...",2022,85121351780,Perceptions Regarding the SARS-CoV-2 Pandemic'...,Background: The SARS-CoV-2 (COVID-19) pandemic...,"[85104589379, 85083241171, 85078262578, 850841...","[(Surgery, 2746, MEDI), (Neurology (clinical),...",scopus
2,"Solaphat Hemrungrojn, Arisara Amrapala, Michae...",2022,85131660961,Construction of a short version of the Montrea...,Background: The Montreal Cognitive Assessment ...,"[84982975791, 84871671961, 85097597113, 751491...","[(Neuroscience (all), 2800, NEUR)]",scopus
3,"Erik Johansson, Ferenc Tasnádi, Annop Ektarawo...",2022,85124670542,The effect of strain and pressure on the elect...,Different theoretical methodologies are employ...,"[0035282206, 0035508561, 0035858409, 183444013...","[(Physics and Astronomy (all), 3100, PHYS)]",scopus
4,"Kunanya Masodsai, Rungchai Chaunchaiyakul,",2022,85143878806,Dynamic Cardiopulmonary and Metabolic Function...,The purpose of this study was to investigate a...,"[25444452457, 49949090130, 84860884417, 848934...","[(Physiology (medical), 2737, MEDI)]",scopus
...,...,...,...,...,...,...,...,...
20211,"Panath Jermthaisong, Sununtha Kingpaiboon, Pet...",2018,85063382127,Estimating actual evapotranspiration from NDVI...,Evapotranspiration (ET) is the sum of evaporat...,"[0003491818, 79952741157, 0028978678, 00247710...","[(Artificial Intelligence, 1702, COMP)]",scopus
20212,"Hiromi Sawai, Nao Nishida, Seik-Soon Khor, Mas...",2018,85047370082,Genome-wide association study identified new s...,We have performed a genome-wide association st...,"[84947346338, 0033935745, 14944385553, 0035467...","[(Multidisciplinary, 1000, MULT)]",scopus
20213,"Sirinuch Utarapichat, Wasuwat Kitisomprayoonkul,",2018,85042389729,Effects of transcranial direct current stimula...,Background: Anodal transcranial direct current...,"[33947494560, 33745243839, 36849022625, 751491...","[(Medicine (all), 2700, MEDI)]",scopus
20214,"Chutamas Benjanirut, Chanakarn Wongsangchan, P...",2018,85055194236,Prevalence and risk factors for canine cogniti...,Canine cognitive dysfunction syndrome (CDS) is...,"[59649128451, 0035379688, 84860848937, 8505515...","[(Veterinary (all), 3400, VETE)]",scopus


In [15]:
data.save_data(file_name="scopus_unexplode_data.csv")