In [10]:
import pandas as pd
import json
import os

# Class for Restructuring

#### Note for Restructuring

There are 7 columns.

| Column         | Description                         |
|----------------|-------------------------------------|
| id             | Unique identifier                   |
| title          | Title of research                   |
| subject-areas  | tuple (code, subject name, abbrev)  |
| authors        | List of authors                     |
| year           | Year of publication                 |
| abstracts      | Abstract text                       |
| references     | List of references (id of research) |


In [11]:
class utils:
    def id_apply(x):
        return x["prism:url"].split("/")[-1]
    
    def title_apply(x):
        if "dc:title" not in x or x["dc:title"] == None:
            return None
        return x["dc:title"]
    
    def author_apply(x):
        name_list = []
        for author in x["author"]:
            name_to_append = ""
            if "ce:given-name" in author:
                name_to_append = f"{author['ce:given-name']} {author['ce:surname']}"
            else: 
                name_to_append = author["ce:indexed-name"]
            name_list.append(name_to_append)
        return name_list
    
    def abstracts_apply(x):
        if "dc:description" not in x or x["dc:description"] == None:
            return None
        return x["dc:description"]
    
    def reference_apply(x):
        if x["bibrecord"] == None or x["bibrecord"]["tail"] == None or x["bibrecord"]["tail"]["bibliography"] == None or x["bibrecord"]["tail"]["bibliography"]["reference"] == None:
            return None
        reference_list = []
        temp = x["bibrecord"]["tail"]["bibliography"]["reference"]
        if type(temp) != type([]):
            temp = [temp]
        for ref in temp:
            if "ref-info" not in ref or "refd-itemidlist" not in ref["ref-info"]:
                continue
            if type(ref["ref-info"]["refd-itemidlist"]["itemid"]) == type([]):
                for j in ref["ref-info"]["refd-itemidlist"]["itemid"]:
                    if "@idtype" in j and j["@idtype"] == "SGR":
                        reference_list.append(j["$"])
            else:
                if "@idtype" in ref["ref-info"]["refd-itemidlist"]["itemid"] and ref["ref-info"]["refd-itemidlist"]["itemid"]["@idtype"] == "SGR":
                    reference_list.append(ref["ref-info"]["refd-itemidlist"]["itemid"]["$"])
        if len(reference_list) == 0:
            return None
        return reference_list
    
    def subject_apply(x):
        subject_list = []
        for subject in x["subject-area"]:
            name = subject["$"]
            code = subject["@code"]
            abbrev = subject["@abbrev"]
            subject_list.append((name, code, abbrev))
        return subject_list

In [None]:
class PrepareData:
    def __init__(self) -> None:
        self.data = None
        self.data_dir = "./data"
    
    def load_data(self):
        dfs = []
        dir_list = os.listdir("./data")
        cnt = 0
        for i in dir_list:
            if i == ".DS_Store":
                continue

            # Handle too many data
            # if not (i == "2018" or i == "2019"):
            #     continue
            
            sub_dir = os.path.join("./data", i)
            files = os.listdir(sub_dir)
            for f in files:
                if f == ".DS_Store":
                    continue

                file_path = os.path.join(sub_dir, f)
                with open(file_path, "r") as file:
                    json_data = json.load(file)

                row = {key: value for key, value in json_data["abstracts-retrieval-response"].items()}
                row["year"] = int(i)
                dfs.append(pd.DataFrame([row]))
                cnt += 1

        df = pd.concat(dfs, ignore_index=True)
        print("Total data: ", cnt)
        self.data = df

    def save_data(self):
        self.data.to_csv("data.csv", index=False)

    def restructure(self):
        df = self.data.copy()
        
        df["id"] = df["coredata"].apply(utils.id_apply)
        df["title"] = df["coredata"].apply(utils.title_apply)
        df["authors"] = df["authors"].apply(utils.author_apply)
        df["abstracts"] = df["coredata"].apply(utils.abstracts_apply)
        df["references"] = df["item"].apply(utils.reference_apply)
        df["category"] = df["subject-areas"].apply(utils.subject_apply)
        
        # Drop unnecessary columns
        df.drop(columns=["item", "affiliation", "coredata", "idxterms", "language", "authkeywords"], inplace=True)
        
        # Explode subject-areas
        df = df.explode("subject-areas", ignore_index=True)
        
        self.data = df

    def null_count(self):
        for c in self.data.columns:
            print(f"Column {c} have {self.data[c].isnull().sum()} null values")

In [13]:
data = PrepareData()

In [14]:
data.load_data()

Total data:  20216


In [15]:
print(f"Before restructure: {data.data.shape}")

Before restructure: (20216, 9)


In [16]:
data.data

Unnamed: 0,item,affiliation,coredata,idxterms,language,authkeywords,subject-areas,authors,year
0,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'j', 'eid': '2-s2.0-85131139456', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Ant...","{'subject-area': [{'@_fa': 'true', '$': 'Food ...","{'author': [{'ce:given-name': 'Sureerat', 'pre...",2022
1,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Abu Dhabi', '@id': '601...","{'srctype': 'j', 'eid': '2-s2.0-85121351780', ...","{'mainterm': [{'$': 'COVID-19', '@weight': 'a'...",{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'car...","{'subject-area': [{'@_fa': 'true', '$': 'Surge...","{'author': [{'ce:given-name': 'Abhijit V.', 'p...",2022
2,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60002...","{'srctype': 'j', 'eid': '2-s2.0-85131660961', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Alz...","{'subject-area': [{'@_fa': 'true', '$': 'Neuro...","{'author': [{'ce:given-name': 'Solaphat', 'pre...",2022
3,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Linkoping', '@id': '600...","{'srctype': 'j', 'prism:issueIdentifier': '6',...","{'mainterm': [{'$': 'Anisotropic strain', '@we...",{'@xml:lang': 'eng'},,"{'subject-area': [{'@_fa': 'true', '$': 'Physi...","{'author': [{'ce:given-name': 'Erik', 'preferr...",2022
4,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60028...","{'srctype': 'j', 'eid': '2-s2.0-85143878806', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Bra...","{'subject-area': [{'@_fa': 'true', '$': 'Physi...","{'author': [{'ce:given-name': 'Kunanya', 'pref...",2022
...,...,...,...,...,...,...,...,...,...
20211,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Khon Kaen', '@id': '600...","{'srctype': 'k', 'eid': '2-s2.0-85063382127', ...",{'mainterm': [{'$': 'Actual evapotranspiration...,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Act...","{'subject-area': [{'@_fa': 'true', '$': 'Artif...","{'author': [{'ce:given-name': 'Panath', 'prefe...",2018
20212,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Tokyo', '@id': '6017860...","{'srctype': 'j', 'eid': '2-s2.0-85047370082', ...","{'mainterm': [{'$': 'Carcinoma, Hepatocellular...",{'@xml:lang': 'eng'},,"{'subject-area': [{'@_fa': 'true', '$': 'Multi...","{'author': [{'ce:given-name': 'Hiromi', 'prefe...",2018
20213,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60028...","{'srctype': 'j', 'eid': '2-s2.0-85042389729', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Mot...","{'subject-area': [{'@_fa': 'true', '$': 'Medic...","{'author': [{'ce:given-name': 'Sirinuch', 'pre...",2018
20214,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'j', 'eid': '2-s2.0-85055194236', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Can...","{'subject-area': [{'@_fa': 'true', '$': 'Veter...","{'author': [{'ce:given-name': 'Chutamas', 'pre...",2018


In [17]:
data.restructure()

In [18]:
print(f"After restructure: {data.data.shape}")

After restructure: (50064, 7)


In [19]:
data.data

Unnamed: 0,subject-areas,authors,year,id,title,abstracts,references
0,"(Food Science, 1106, AGRI)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
1,"(Physiology, 1314, BIOC)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
2,"(Animal Science and Zoology, 1103, AGRI)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
3,"(Genetics, 1311, BIOC)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
4,"(Veterinary (all), 3400, VETE)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
...,...,...,...,...,...,...,...
50059,"(Artificial Intelligence, 1702, COMP)","[Panath Jermthaisong, Sununtha Kingpaiboon, Pe...",2018,85063382127,Estimating actual evapotranspiration from NDVI...,Evapotranspiration (ET) is the sum of evaporat...,"[0003491818, 79952741157, 0028978678, 00247710..."
50060,"(Multidisciplinary, 1000, MULT)","[Hiromi Sawai, Nao Nishida, Seik-Soon Khor, Ma...",2018,85047370082,Genome-wide association study identified new s...,We have performed a genome-wide association st...,"[84947346338, 0033935745, 14944385553, 0035467..."
50061,"(Medicine (all), 2700, MEDI)","[Sirinuch Utarapichat, Wasuwat Kitisomprayoonkul]",2018,85042389729,Effects of transcranial direct current stimula...,Background: Anodal transcranial direct current...,"[33947494560, 33745243839, 36849022625, 751491..."
50062,"(Veterinary (all), 3400, VETE)","[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Prevalence and risk factors for canine cogniti...,Canine cognitive dysfunction syndrome (CDS) is...,"[59649128451, 0035379688, 84860848937, 8505515..."


In [20]:
data.data.sample(5)

Unnamed: 0,subject-areas,authors,year,id,title,abstracts,references
14506,"(Anthropology, 3314, SOCI)","[Sureenate Jaratjarungkiat, Kyung-Eun Park, Ni...",2023,85170245185,The Grammaticalization of the Copulas /pēn/ an...,The present study has two main aims: 1) to con...,"[85170222908, 85170267186, 85170267186, 851702..."
41115,"(Mechanical Engineering, 2210, ENGI)","[Narunchara Phurahong, Nantana Jiratumnukul]",2020,85087031013,Preparation and characterization of surface-mo...,Nanocellulose fibers (NCF) is a renewable biod...,"[85042631554, 85101425564, 85034047986, 849227..."
5345,"(Mechanics of Materials, 2211, ENGI)","[Jadetapong Klahan, Gamolwan Tumcharern, Mongk...",2022,85142168042,Extraction-free techniques for sensitive detec...,Fuel adulteration and cross-contamination lead...,"[85142146418, 85136680439, 85142176141, 338473..."
25818,"(Environmental Engineering, 2305, ENVI)","[Soklida Hong, Thunyalux Ratpukdi, Bunleu Sung...",2019,85060065951,A sustainable solution for removal of glutaral...,Glutaraldehyde (GA) is the most common biocide...,"[0003425384, 85041708127, 40549094633, 8504781..."
12753,"(Health (social science), 3306, SOCI)","[Khursheed Ahmad Shiekh, Thitirat Luanglaor, N...",2023,85146791299,Antioxidants and Quality Changes of Thermally ...,Purple corn kernels were subjected to boiling ...,"[85118807411, 84946491882, 84907814100, 850473..."


In [23]:
data.null_count()

Column subject-areas have 0 null values
Column authors have 0 null values
Column year have 0 null values
Column id have 0 null values
Column title have 2 null values
Column abstracts have 1251 null values
Column references have 899 null values


In [25]:
data.data.dropna(subset=["abstracts"], inplace=True)
data.data.dropna(subset=["references"], inplace=True)
data.data.dropna(subset=["title"], inplace=True)

data.data.shape

(48227, 7)

In [26]:
data.data

Unnamed: 0,subject-areas,authors,year,id,title,abstracts,references
0,"(Food Science, 1106, AGRI)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
1,"(Physiology, 1314, BIOC)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
2,"(Animal Science and Zoology, 1103, AGRI)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
3,"(Genetics, 1311, BIOC)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
4,"(Veterinary (all), 3400, VETE)","[Sureerat Thuekeaw, Kris Angkanaporn, Chackrit...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"[85039040394, 85050697915, 84920164411, 849664..."
...,...,...,...,...,...,...,...
50059,"(Artificial Intelligence, 1702, COMP)","[Panath Jermthaisong, Sununtha Kingpaiboon, Pe...",2018,85063382127,Estimating actual evapotranspiration from NDVI...,Evapotranspiration (ET) is the sum of evaporat...,"[0003491818, 79952741157, 0028978678, 00247710..."
50060,"(Multidisciplinary, 1000, MULT)","[Hiromi Sawai, Nao Nishida, Seik-Soon Khor, Ma...",2018,85047370082,Genome-wide association study identified new s...,We have performed a genome-wide association st...,"[84947346338, 0033935745, 14944385553, 0035467..."
50061,"(Medicine (all), 2700, MEDI)","[Sirinuch Utarapichat, Wasuwat Kitisomprayoonkul]",2018,85042389729,Effects of transcranial direct current stimula...,Background: Anodal transcranial direct current...,"[33947494560, 33745243839, 36849022625, 751491..."
50062,"(Veterinary (all), 3400, VETE)","[Chutamas Benjanirut, Chanakarn Wongsangchan, ...",2018,85055194236,Prevalence and risk factors for canine cogniti...,Canine cognitive dysfunction syndrome (CDS) is...,"[59649128451, 0035379688, 84860848937, 8505515..."


In [27]:
data.save_data()