In [1]:
import pandas as pd
import json
import os

# Class for Restructuring

#### Note for Destructuring

- dc:description can be unexist
- abstract can be null
- authkeywords can be null
- reference can be null (tail can be null)

In [2]:
class utils:
    def author_apply(x):
        name_list = []
        for author in x["author"]:
            name_to_append = ""
            if "ce:given-name" in author:
                name_to_append = f"{author['ce:given-name']} {author['ce:surname']}"
            else: 
                name_to_append = author["ce:indexed-name"]
            name_list.append(name_to_append)
        return name_list
    
    def citation_apply(x):
        citation_list = []
        if x == None:
            return [None]
        if type(x["author-keyword"]) == type(dict()):
            return [x["author-keyword"]["$"]]
        for citation in x["author-keyword"]:
            citation_list.append(citation["$"])
        return citation_list
    
    def reference_apply(x):
        reference_list = []
        if x["bibrecord"]["tail"] == None:
            return []
        if x["bibrecord"]["tail"]["bibliography"] == None:
            print("a")
            return []
        if x["bibrecord"]["tail"]["bibliography"]["reference"] == None:
            print("b")
            return []
        if type(x["bibrecord"]["tail"]["bibliography"]["reference"]) == type(dict()):
            reference_list.append(x["bibrecord"]["tail"]["bibliography"]["reference"]["ref-fulltext"] if "ref-fulltext" in x["bibrecord"]["tail"]["bibliography"]["reference"] else "")
            return reference_list
        for ref in x["bibrecord"]["tail"]["bibliography"]["reference"]:
            reference_list.append(ref["ref-fulltext"] if "ref-fulltext" in ref else "")
        return reference_list
    
    def subject_apply(x):
        subject_list = []
        for subject in x["subject-area"]:
            subject_list.append(subject["$"])
        return subject_list

In [3]:
class PrepareData:
    def __init__(self) -> None:
        self.data = None
        self.data_dir = "./data"
    
    def load_data(self):
        dfs = []
        dir_list = os.listdir("./data")
        cnt = 0
        for i in dir_list:
            if i == ".DS_Store":
                continue

            # Handle too many data
            # if not (i == "2018" or i == "2019"):
            #     continue
            
            sub_dir = os.path.join("./data", i)
            files = os.listdir(sub_dir)
            for f in files:
                if f == ".DS_Store":
                    continue

                file_path = os.path.join(sub_dir, f)
                with open(file_path, "r") as file:
                    json_data = json.load(file)

                row = {key: value for key, value in json_data["abstracts-retrieval-response"].items()}
                row["year"] = int(i)
                dfs.append(pd.DataFrame([row]))
                cnt += 1

        df = pd.concat(dfs, ignore_index=True)
        print("Total data: ", cnt)
        self.data = df

    def save_data(self):
        self.data.to_csv("data.csv", index=False)

    def restructure(self):
        df = self.data.copy()
        
        df["id"] = df["coredata"].apply(lambda x: x["prism:url"].split("/")[-1])
        df["authors"] = df["authors"].apply(utils.author_apply)
        df["description"] = df["coredata"].apply(lambda x: x["dc:description"] if "dc:description" in x else "")
        df["abstracts"] = df["item"].apply(lambda x: x["bibrecord"]["head"]["abstracts"])
        df["citation"] = df["authkeywords"].apply(utils.citation_apply)
        df["references"] = df["item"].apply(utils.reference_apply)
        df["subject-areas"] = df["subject-areas"].apply(utils.subject_apply)
        df = df.explode("citation", ignore_index=True)
        
        df.drop(columns=["item", "affiliation", "coredata", "idxterms", "language", "authkeywords"], inplace=True)
        self.data = df
    
    

In [4]:
data = PrepareData()

In [5]:
data.load_data()

Total data:  20216


In [6]:
print(f"Before restructure: {data.data.shape}")

Before restructure: (20216, 9)


In [7]:
data.data

Unnamed: 0,item,affiliation,coredata,idxterms,language,authkeywords,subject-areas,authors,year
0,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'j', 'eid': '2-s2.0-85131139456', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Ant...","{'subject-area': [{'@_fa': 'true', '$': 'Food ...","{'author': [{'ce:given-name': 'Sureerat', 'pre...",2022
1,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Abu Dhabi', '@id': '601...","{'srctype': 'j', 'eid': '2-s2.0-85121351780', ...","{'mainterm': [{'$': 'COVID-19', '@weight': 'a'...",{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'car...","{'subject-area': [{'@_fa': 'true', '$': 'Surge...","{'author': [{'ce:given-name': 'Abhijit V.', 'p...",2022
2,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60002...","{'srctype': 'j', 'eid': '2-s2.0-85131660961', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Alz...","{'subject-area': [{'@_fa': 'true', '$': 'Neuro...","{'author': [{'ce:given-name': 'Solaphat', 'pre...",2022
3,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Linkoping', '@id': '600...","{'srctype': 'j', 'prism:issueIdentifier': '6',...","{'mainterm': [{'$': 'Anisotropic strain', '@we...",{'@xml:lang': 'eng'},,"{'subject-area': [{'@_fa': 'true', '$': 'Physi...","{'author': [{'ce:given-name': 'Erik', 'preferr...",2022
4,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60028...","{'srctype': 'j', 'eid': '2-s2.0-85143878806', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Bra...","{'subject-area': [{'@_fa': 'true', '$': 'Physi...","{'author': [{'ce:given-name': 'Kunanya', 'pref...",2022
...,...,...,...,...,...,...,...,...,...
20211,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Khon Kaen', '@id': '600...","{'srctype': 'k', 'eid': '2-s2.0-85063382127', ...",{'mainterm': [{'$': 'Actual evapotranspiration...,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Act...","{'subject-area': [{'@_fa': 'true', '$': 'Artif...","{'author': [{'ce:given-name': 'Panath', 'prefe...",2018
20212,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Tokyo', '@id': '6017860...","{'srctype': 'j', 'eid': '2-s2.0-85047370082', ...","{'mainterm': [{'$': 'Carcinoma, Hepatocellular...",{'@xml:lang': 'eng'},,"{'subject-area': [{'@_fa': 'true', '$': 'Multi...","{'author': [{'ce:given-name': 'Hiromi', 'prefe...",2018
20213,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60028...","{'srctype': 'j', 'eid': '2-s2.0-85042389729', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Mot...","{'subject-area': [{'@_fa': 'true', '$': 'Medic...","{'author': [{'ce:given-name': 'Sirinuch', 'pre...",2018
20214,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'j', 'eid': '2-s2.0-85055194236', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Can...","{'subject-area': [{'@_fa': 'true', '$': 'Veter...","{'author': [{'ce:given-name': 'Chutamas', 'pre...",2018


In [8]:
data.restructure()

In [9]:
print(f"After restructure: {data.data.shape}")

After restructure: (86836, 8)


In [10]:
data.data

Unnamed: 0,subject-areas,authors,year,id,description,abstracts,citation,references
0,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Antioxidant Capacity,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
1,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Basil Oil,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
2,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Broiler Chicken,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
3,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Gut Morphology,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
4,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Microencapsulation,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
...,...,...,...,...,...,...,...,...
86831,[Veterinary (all)],"[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Canine cognitive dysfunction syndrome (CDS) is...,© 2018 Chulalongkorn University Printing House...,Canine cognitive dysfunction syndrome,"[Azkona G, Garciá-Belenguer S, Chacón G, Rosad..."
86832,[Veterinary (all)],"[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Canine cognitive dysfunction syndrome (CDS) is...,© 2018 Chulalongkorn University Printing House...,Prevalence,"[Azkona G, Garciá-Belenguer S, Chacón G, Rosad..."
86833,[Veterinary (all)],"[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Canine cognitive dysfunction syndrome (CDS) is...,© 2018 Chulalongkorn University Printing House...,Risk factors,"[Azkona G, Garciá-Belenguer S, Chacón G, Rosad..."
86834,[Veterinary (all)],"[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Canine cognitive dysfunction syndrome (CDS) is...,© 2018 Chulalongkorn University Printing House...,Thailand,"[Azkona G, Garciá-Belenguer S, Chacón G, Rosad..."


In [11]:
data.data

Unnamed: 0,subject-areas,authors,year,id,description,abstracts,citation,references
0,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Antioxidant Capacity,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
1,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Basil Oil,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
2,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Broiler Chicken,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
3,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Gut Morphology,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
4,"[Food Science, Physiology, Animal Science and ...","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Objective: Microencapsulation is a technique t...,Copyright © 2022 by Animal BioscienceObjective...,Microencapsulation,"[Attia YA, Al-Harthi MA, Sh. Elnaggar A. Produ..."
...,...,...,...,...,...,...,...,...
86831,[Veterinary (all)],"[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Canine cognitive dysfunction syndrome (CDS) is...,© 2018 Chulalongkorn University Printing House...,Canine cognitive dysfunction syndrome,"[Azkona G, Garciá-Belenguer S, Chacón G, Rosad..."
86832,[Veterinary (all)],"[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Canine cognitive dysfunction syndrome (CDS) is...,© 2018 Chulalongkorn University Printing House...,Prevalence,"[Azkona G, Garciá-Belenguer S, Chacón G, Rosad..."
86833,[Veterinary (all)],"[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Canine cognitive dysfunction syndrome (CDS) is...,© 2018 Chulalongkorn University Printing House...,Risk factors,"[Azkona G, Garciá-Belenguer S, Chacón G, Rosad..."
86834,[Veterinary (all)],"[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Canine cognitive dysfunction syndrome (CDS) is...,© 2018 Chulalongkorn University Printing House...,Thailand,"[Azkona G, Garciá-Belenguer S, Chacón G, Rosad..."


In [12]:
data.save_data()