In [6]:
import pandas as pd
import json
import os

# Class for Restructuring

#### Note for Restructuring

There are 7 columns.

| Column         | Description                         |
|----------------|-------------------------------------|
| id             | Unique identifier                   |
| title          | Title of research                   |
| category       | tuple (code, subject name, abbrev)  |
| authors        | List of authors                     |
| year           | Year of publication                 |
| abstracts      | Abstract text                       |
| references     | List of references (id of research) |


In [7]:
class utils:
    def id_apply(x):
        return x["prism:url"].split("/")[-1]
    
    def title_apply(x):
        if "dc:title" not in x or x["dc:title"] == None:
            return None
        return x["dc:title"]
    
    def author_apply(x):
        # name_list = []
        combine_name = ""
        for author in x["author"]:
            name_to_append = ""
            if "ce:given-name" in author:
                name_to_append = f"{author['ce:given-name']} {author['ce:surname']}"
            else: 
                name_to_append = author["ce:indexed-name"]
            # name_list.append(name_to_append)
            combine_name += name_to_append + ", "
        return combine_name
    
    def abstracts_apply(x):
        if "dc:description" not in x or x["dc:description"] == None:
            return None
        return x["dc:description"]
    
    def reference_apply(x):
        if x["bibrecord"] == None or x["bibrecord"]["tail"] == None or x["bibrecord"]["tail"]["bibliography"] == None or x["bibrecord"]["tail"]["bibliography"]["reference"] == None:
            return None
        reference_list = []
        temp = x["bibrecord"]["tail"]["bibliography"]["reference"]
        if type(temp) != type([]):
            temp = [temp]
        for ref in temp:
            if "ref-info" not in ref or "refd-itemidlist" not in ref["ref-info"]:
                continue
            if type(ref["ref-info"]["refd-itemidlist"]["itemid"]) == type([]):
                for j in ref["ref-info"]["refd-itemidlist"]["itemid"]:
                    if "@idtype" in j and j["@idtype"] == "SGR":
                        reference_list.append(j["$"])
            else:
                if "@idtype" in ref["ref-info"]["refd-itemidlist"]["itemid"] and ref["ref-info"]["refd-itemidlist"]["itemid"]["@idtype"] == "SGR":
                    reference_list.append(ref["ref-info"]["refd-itemidlist"]["itemid"]["$"])
        if len(reference_list) == 0:
            return None
        return reference_list
    
    def subject_apply(x):
        subject_list = []
        for subject in x["subject-area"]:
            name = subject["$"]
            code = subject["@code"]
            abbrev = subject["@abbrev"]
            subject_list.append((name, code, abbrev))
        return subject_list

In [8]:
class PrepareData:
    def __init__(self) -> None:
        self.data = None
        self.data_dir = "./data"
    
    def load_data(self):
        dfs = []
        dir_list = os.listdir("./data")
        cnt = 0
        for i in dir_list:
            if i == ".DS_Store":
                continue

            # Handle too many data
            # if not (i == "2018" or i == "2019"):
            #     continue
            
            sub_dir = os.path.join("./data", i)
            files = os.listdir(sub_dir)
            for f in files:
                if f == ".DS_Store":
                    continue

                file_path = os.path.join(sub_dir, f)
                with open(file_path, "r") as file:
                    json_data = json.load(file)

                row = {key: value for key, value in json_data["abstracts-retrieval-response"].items()}
                row["year"] = int(i)
                dfs.append(pd.DataFrame([row]))
                cnt += 1

        df = pd.concat(dfs, ignore_index=True)
        print("Total data: ", cnt)
        self.data = df

    def save_data(self, file_name:str):
        if file_name.split(".")[-1] != "csv":
            file_name += ".csv"

        self.data.to_csv(file_name, index=False)

    def restructure(self):
        df = self.data.copy()
        
        df["id"] = df["coredata"].apply(utils.id_apply)
        df["title"] = df["coredata"].apply(utils.title_apply)
        df["authors"] = df["authors"].apply(utils.author_apply)
        df["abstract"] = df["coredata"].apply(utils.abstracts_apply)
        df["references"] = df["item"].apply(utils.reference_apply)
        df["category"] = df["subject-areas"].apply(utils.subject_apply)
        df["source"] = "scopus"
        
        # Drop unnecessary columns
        df.drop(columns=["item", "affiliation", "coredata", "idxterms", "language", "authkeywords", "subject-areas"], inplace=True)
        
        # Explode subject-areas
        # df = df.explode("category", ignore_index=True)
        
        self.data = df

    def null_count(self):
        for c in self.data.columns:
            print(f"Column {c} have {self.data[c].isnull().sum()} null values")

In [9]:
data = PrepareData()

In [10]:
data.load_data()

Total data:  20216


In [11]:
print(f"Before restructure: {data.data.shape}")

Before restructure: (20216, 9)


In [12]:
data.data

Unnamed: 0,item,affiliation,coredata,idxterms,language,authkeywords,subject-areas,authors,year
0,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60199...","{'srctype': 'j', 'eid': '2-s2.0-85114881247', ...","{'mainterm': [{'$': 'Adult', '@weight': 'b', '...",{'@xml:lang': 'eng'},,"{'subject-area': [{'@_fa': 'true', '$': 'Derma...",{'author': [{'preferred-name': {'ce:given-name...,2022
1,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'j', 'eid': '2-s2.0-85130612598', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Bus...","{'subject-area': [{'@_fa': 'true', '$': 'Compu...","{'author': [{'ce:given-name': 'Pakkaporn', 'pr...",2022
2,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60091...","{'srctype': 'j', 'prism:issueIdentifier': '3',...","{'mainterm': [{'$': 'Animals', '@weight': 'b',...",{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'aqu...","{'subject-area': [{'@_fa': 'true', '$': 'Physi...","{'author': [{'ce:given-name': 'Chonchanok', 'p...",2022
3,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Riverside', '@id': '600...","{'srctype': 'j', 'eid': '2-s2.0-85138732480', ...","{'mainterm': [{'$': 'Atrial Fibrillation', '@w...",{'@xml:lang': 'eng'},,"{'subject-area': [{'@_fa': 'true', '$': 'Advan...","{'author': [{'ce:given-name': 'Narut', 'prefer...",2022
4,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600221...","{'srctype': 'j', 'eid': '2-s2.0-85123852067', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Cad...","{'subject-area': [{'@_fa': 'true', '$': 'Anato...","{'author': [{'ce:given-name': 'Danaithep', 'pr...",2022
...,...,...,...,...,...,...,...,...,...
20211,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60022...","{'srctype': 'j', 'eid': '2-s2.0-85049127661', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'End...","{'subject-area': [{'@_fa': 'true', '$': 'Medic...","{'author': [{'ce:given-name': 'Pradermchai', '...",2018
20212,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'k', 'eid': '2-s2.0-85026629813', ...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'And...","{'subject-area': [{'@_fa': 'true', '$': 'Artif...","{'author': [{'ce:given-name': 'Kwandee', 'pref...",2018
20213,{'ait:process-info': {'ait:status': {'@state':...,"{'affiliation-city': 'Bangkok', '@id': '600281...","{'srctype': 'j', 'eid': '2-s2.0-85051822557', ...","{'mainterm': [{'$': 'Antifungal Agents', '@wei...",{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Cor...","{'subject-area': [{'@_fa': 'true', '$': 'Medic...","{'author': [{'ce:given-name': 'Marisa', 'prefe...",2018
20214,{'ait:process-info': {'ait:status': {'@state':...,"[{'affiliation-city': 'Bangkok', '@id': '60028...","{'srctype': 'j', 'prism:issueIdentifier': '2',...",,{'@xml:lang': 'eng'},"{'author-keyword': [{'@_fa': 'true', '$': 'Bio...","{'subject-area': [{'@_fa': 'true', '$': 'Chemi...","{'author': [{'ce:given-name': 'Pimsuda', 'pref...",2018


In [13]:
data.restructure()

In [14]:
print(f"After restructure: {data.data.shape}")

After restructure: (20216, 8)


In [15]:
data.data

Unnamed: 0,authors,year,id,title,abstract,references,category,source
0,"Rujitharanawong C., Tuchinda P., Chularojanamo...",2022,85114881247,Direct immunofluorescence staining patterns co...,This retrospective study performed to investig...,"[37549006776, 85063567734, 84899635287, 850251...","[(Dermatology, 2708, MEDI)]",scopus
1,"Pakkaporn Rungruengkultorn, Somjai Boonsiri,",2022,85130612598,Warehouse Processes Improvement Using Lean Six...,Warehouse processes are the primary activities...,"[3843106887, 35948936348, 0003858901, 84923090...","[(Computer Science (miscellaneous), 1701, COMP...",scopus
2,"Chonchanok Muangnapoh, Eakapong Tamboon, Neung...",2022,85133214239,Multilocus Sequence Typing and Virulence Poten...,"Vibrio parahaemolyticus is a Gram-negative, fo...","[85074709058, 0024526678, 85042353919, 8490683...","[(Physiology, 1314, BIOC), (Ecology, 2303, ENV...",scopus
3,"Narut Prasitlumkum, Ronpichai Chokesuwattanask...",2022,85138732480,Atrial fibrillation ablation in hemodialysis p...,,"[84899623290, 76749163232, 85041417343, 850630...","[(Advanced and Specialized Nursing, 2902, NURS...",scopus
4,"Danaithep Limskul, Jirun Apinun, Thanasil Huan...",2022,85123852067,Anatomy of the coracoid process in Thais: Cada...,Introduction: The aim of this study was to inv...,"[0023103425, 0000627515, 0346996881, 848635193...","[(Anatomy, 2702, MEDI)]",scopus
...,...,...,...,...,...,...,...,...
20211,"Pradermchai Kongkam, Veeravich Jaruvongvanich,...",2018,85049127661,Multi-detector computed tomography versus endo...,Objective: Endoscopic ultrasonography [EUS] ha...,"[78649352788, 77954344963, 78349285165, 849587...","[(Medicine (all), 2700, MEDI)]",scopus
20212,"Kwandee Phetcharakarn, Twittie Senivongse,",2018,85026629813,Heuristic-based usability evaluation tool for ...,Heuristic evaluation is a popular method for e...,"[0004257599, 85022187784, 33744832544, 8502666...","[(Artificial Intelligence, 1702, COMP)]",scopus
20213,"Marisa Taechajongjintana, Ngamjit Kasetsuwan, ...",2018,85051822557,Effectiveness of voriconazole and corneal cros...,Background: We report a rare case of Phialopho...,"[84908551974, 0014002794, 0017140495, 58149174...","[(Medicine (all), 2700, MEDI)]",scopus
20214,"Pimsuda Pansa-Ngat, Trin Jedsukontorn, Mali Hu...",2018,85041960467,Optimal hydrogen production coupled with pollu...,This work aimed to produce hydrogen (H2) simul...,"[0022873628, 84940061463, 29144508454, 8497090...","[(Chemical Engineering (all), 1500, CENG), (Ma...",scopus


In [16]:
data.data.sample(5)

Unnamed: 0,authors,year,id,title,abstract,references,category,source
1775,"Claire M. Lawley, Matthew Tester, Shubhayan Sa...",2022,85140055861,Life-threatening cardiac arrhythmia and sudden...,Background: Electronic gaming has recently bee...,"[85072387788, 85084791889, 84889856570, 850633...","[(Cardiology and Cardiovascular Medicine, 2705...",scopus
6727,"Chetan Chauhan, Rajesh Kumar, Anju Saini, Raj ...",2023,85169978316,Rare coordination behavior of triethanolamine ...,"Herein, unusual and rare coordination behavior...","[85040186048, 84855577188, 85092175918, 337481...","[(Analytical Chemistry, 1602, CHEM), (Spectros...",scopus
17908,"Rodtassana C., Tanner E.V.J.,",2018,85042637153,Litter removal in a tropical rain forest reduc...,Many old-growth lowland tropical rain forests ...,"[72449172894, 77958608000, 0003461899, 0022889...","[(Ecology, Evolution, Behavior and Systematics...",scopus
5622,"Asanee Somdee, Surangkana Wannapop, Nuchanapor...",2023,85143724257,Enhanced charge carrier density of a p-n BiOCl...,Pure and doped p-n BiOCl/BiVO4 heterostructure...,"[0035891138, 85064490044, 85120574487, 8507388...","[(Mechanics of Materials, 2211, ENGI), (Mechan...",scopus
16220,"Anca Chelariu-Raicu, Charles F. Levenback, Bri...",2020,85095457074,Phase Ib/II study of weekly topotecan and dail...,Introduction 50-70% of epithelial ovarian canc...,"[85034257569, 4143087249, 0036499282, 79961139...","[(Oncology, 2730, MEDI), (Obstetrics and Gynec...",scopus


In [17]:
data.null_count()

Column authors have 0 null values
Column year have 0 null values
Column id have 0 null values
Column title have 1 null values
Column abstract have 665 null values
Column references have 411 null values
Column category have 0 null values
Column source have 0 null values


In [18]:
data.data.dropna(subset=["abstract"], inplace=True)
data.data.dropna(subset=["references"], inplace=True)
data.data.dropna(subset=["title"], inplace=True)

data.data.shape

(19275, 8)

In [19]:
data.data

Unnamed: 0,authors,year,id,title,abstract,references,category,source
0,"Rujitharanawong C., Tuchinda P., Chularojanamo...",2022,85114881247,Direct immunofluorescence staining patterns co...,This retrospective study performed to investig...,"[37549006776, 85063567734, 84899635287, 850251...","[(Dermatology, 2708, MEDI)]",scopus
1,"Pakkaporn Rungruengkultorn, Somjai Boonsiri,",2022,85130612598,Warehouse Processes Improvement Using Lean Six...,Warehouse processes are the primary activities...,"[3843106887, 35948936348, 0003858901, 84923090...","[(Computer Science (miscellaneous), 1701, COMP...",scopus
2,"Chonchanok Muangnapoh, Eakapong Tamboon, Neung...",2022,85133214239,Multilocus Sequence Typing and Virulence Poten...,"Vibrio parahaemolyticus is a Gram-negative, fo...","[85074709058, 0024526678, 85042353919, 8490683...","[(Physiology, 1314, BIOC), (Ecology, 2303, ENV...",scopus
4,"Danaithep Limskul, Jirun Apinun, Thanasil Huan...",2022,85123852067,Anatomy of the coracoid process in Thais: Cada...,Introduction: The aim of this study was to inv...,"[0023103425, 0000627515, 0346996881, 848635193...","[(Anatomy, 2702, MEDI)]",scopus
5,"Wichanan Wannasrichan, Htut Htut Htoo, Rubsade...",2022,85140221720,Phage-resistant Pseudomonas aeruginosa against...,"Pseudomonas aeruginosa, a major cause of nosoc...","[0038392706, 85017428187, 84860243334, 8510269...","[(Microbiology, 2404, IMMU), (Microbiology (me...",scopus
...,...,...,...,...,...,...,...,...
20211,"Pradermchai Kongkam, Veeravich Jaruvongvanich,...",2018,85049127661,Multi-detector computed tomography versus endo...,Objective: Endoscopic ultrasonography [EUS] ha...,"[78649352788, 77954344963, 78349285165, 849587...","[(Medicine (all), 2700, MEDI)]",scopus
20212,"Kwandee Phetcharakarn, Twittie Senivongse,",2018,85026629813,Heuristic-based usability evaluation tool for ...,Heuristic evaluation is a popular method for e...,"[0004257599, 85022187784, 33744832544, 8502666...","[(Artificial Intelligence, 1702, COMP)]",scopus
20213,"Marisa Taechajongjintana, Ngamjit Kasetsuwan, ...",2018,85051822557,Effectiveness of voriconazole and corneal cros...,Background: We report a rare case of Phialopho...,"[84908551974, 0014002794, 0017140495, 58149174...","[(Medicine (all), 2700, MEDI)]",scopus
20214,"Pimsuda Pansa-Ngat, Trin Jedsukontorn, Mali Hu...",2018,85041960467,Optimal hydrogen production coupled with pollu...,This work aimed to produce hydrogen (H2) simul...,"[0022873628, 84940061463, 29144508454, 8497090...","[(Chemical Engineering (all), 1500, CENG), (Ma...",scopus


In [20]:
data.save_data(file_name="scopus_unexplode_data.csv")