In [164]:
import os
import json
import pandas as pd
import os
import numpy as np

from dotenv import load_dotenv

In [165]:
load_dotenv()

path_to_project_folder = os.getenv('project_folder')
current_folder = os.getenv('current_folder')
literatures_list_path = os.path.join(current_folder, 'data/literatures_list.txt')

In [None]:
subject_areas_df = pd.read_csv(r'./data/subject_areas.csv')

subject_areas_dict = dict(zip(subject_areas_df['Scopus Code'].astype(str), subject_areas_df['Subject Areas']))

In [167]:
# Create all path in the folder
# 
# with open(literatures_list_path, 'w') as file:
#     for path in os.listdir(path_to_project_folder):
#         folder_path = os.path.join(path_to_project_folder, path)
#         if os.path.isdir(folder_path):
#             for lit in os.listdir(folder_path):
#                 file_path = os.path.join(folder_path, lit)
#                 if os.path.isfile(file_path):
#                     file.write(file_path + '\n')

In [168]:
class Scopus:
    def __init__(self, article, code):
        self.article = article
        self.code = code

        self.title = None
        self.abstract = None
        self.publication_date = None
        self.prism_type = None
        self.keywords = None
        self.prism_type = None
        self.ref_count = None
        self.subject_areas = None
        self.publisher = None
        self.authors = None
        self.affiliation = None

    def title_convert(self):
        self.title = self.article['abstracts-retrieval-response']['coredata'].get("dc:title", None)

    def abstract_convert(self):
        self.abstract = self.article['abstracts-retrieval-response']['coredata'].get("dc:description", None)

    def publication_date_convert(self):
        publication = self.article['abstracts-retrieval-response']['item']['bibrecord']['head']['source']['publicationdate']
        
        year = publication.get("year", "Unknown")
        month = publication.get("month", "01")
        day = publication.get("day", "01")
        
        if year == "Unknown":
            self.publication_date = "Unknown"
        else:
            month = f"{int(month):02}" if month.isdigit() else "01"
            day = f"{int(day):02}" if day.isdigit() else "01"
            
            # ISO 8601 format
            self.publication_date = f"{year}-{month}-{day}"

    def prism_type_convert(self):
        self.prism_type = self.article['abstracts-retrieval-response']['coredata'].get("prism:aggregationType", None)
                                                                       
    def keywords_convert(self):
        if self.article['abstracts-retrieval-response'].get('authkeywords') is not None:
            keywords = self.article['abstracts-retrieval-response']['authkeywords'].get("author-keyword", None)
            if isinstance(keywords, list):
                k = ';'.join(keyword['$'].replace(' ', '') for keyword in keywords)
                self.keywords = k
            else:
                self.keywords = None
        else:
            self.keywords = None
    
    def subject_areas_convert(self):
        areas = self.article['abstracts-retrieval-response']['subject-areas'].get('subject-area', None) #array

        # for area in areas:
        #     code = round(int(area['@code'])/100)
        #     code = str(code*100)    
        #     name = subject_areas_dict[code]
        if areas is not None:
            a = ';'.join(subject_areas_dict[str(round(int(area['@code'])/100)*100)] for area in areas)
            self.subject_areas = a

    def ref_count_convert(self):
        tail = self.article['abstracts-retrieval-response']['item']['bibrecord']['tail']

        if tail is not None:
            count = tail.get("bibliography").get("@refcount", None)
            if count is not None:
                self.ref_count = int(count)
    
    def publisher_convert(self):
        self.publisher = self.article['abstracts-retrieval-response']['coredata'].get("dc:publisher", None)
                                                            
    def affiliation_convert(self):
        affiliations = self.article['abstracts-retrieval-response']['affiliation']
        if isinstance(affiliations, list):
            af = ';'.join(
                f"{aff['affilname']}, {aff['affiliation-country']}"
                for aff in affiliations
            )

            self.affiliation = af
            
        else:
            self.affiliation = f"{affiliations['affilname']}, {affiliations['affiliation-country']}"
        
    def authors_convert(self):
        authors = self.article['abstracts-retrieval-response']['authors']['author']
        
        if isinstance(authors, list):
            # for author in authors:
            #     f_name = author['preferred-name']['ce:given-name']
            #     s_name = author['preferred-name']['ce:surname']

            #     s += "{} {}, ".format(s_name, f_name)

            s = ';'.join(
                f"{author['preferred-name']['ce:surname']} {author['preferred-name']['ce:given-name']}"
                for author in authors
                )

            self.authors = s
        else:
            self.authors = None

    def extract(self):
        self.title_convert()
        self.abstract_convert()
        self.publication_date_convert()
        self.prism_type_convert()
        self.keywords_convert()
        self.subject_areas_convert()
        self.ref_count_convert()
        self.publisher_convert()
        self.affiliation_convert()
        self.authors_convert()

        return {
            "code": self.code,
            "title": self.title,
            "abstract": self.abstract,
            "publication_date": self.publication_date,
            "prism_type": self.prism_type,
            "keywords": self.keywords,
            "subject_area": self.subject_areas,
            "ref_count": self.ref_count,
            "publisher": self.publisher,
            "affiliation": self.affiliation,
            "authors": self.authors
        }

    def append_article(self, df):
        article_data = self.extract()
        
        new_row = pd.DataFrame([article_data])
        df = pd.concat([df, new_row], ignore_index=True)
        return df



In [169]:
df = pd.DataFrame(columns=[
    "code", "title", "abstract", "publication_date", 
    "prism_type", "keywords", "subject_area", 
    "ref_count", "publisher", "affiliation", "authors"
])

In [170]:
with open(literatures_list_path, 'r') as file:
    for line in file:
        literature_path = line.strip()
        try:
            with open(literature_path, 'r', encoding='utf-8') as lit:
                code = lit.name[-9:]

                article = json.load(lit)
                scopus = Scopus(article=article, code=code)
                df = scopus.append_article(df)
                break

        except FileNotFoundError:
            print(f'File not found: {literature_path}')
        except Exception as e:
            print(f'Error processing file {literature_path}: {e}')

In [171]:
with open(literatures_list_path, 'r') as file:
    for line in file:
        literature_path = line.strip()
        try:
            with open(literature_path, 'r', encoding='utf-8') as lit:
                code = lit.name[-9:]

                article = json.load(lit)
                scopus = Scopus(article=article, code=code)
                df = scopus.append_article(df)

        except FileNotFoundError:
            print(f'File not found: {literature_path}')
        except Exception as e:
            print(f'Error processing file {literature_path}: {e}')


In [172]:
df

Unnamed: 0,code,title,abstract,publication_date,prism_type,keywords,subject_area,ref_count,publisher,affiliation,authors
0,201800000,Public health and international epidemiology f...,,2018-12-31,Book,,Medicine,76,Springer International Publishing,"Stanford University School of Medicine, United...",Pongpirul Krit;Lungren Matthew P.
1,201800000,Public health and international epidemiology f...,,2018-12-31,Book,,Medicine,76,Springer International Publishing,"Stanford University School of Medicine, United...",Pongpirul Krit;Lungren Matthew P.
2,201800001,Flexible Printed Active Antenna for Digital Te...,This paper presents the development of a flexi...,2018-12-31,Conference Proceeding,,Engineering;Materials Science,4,Institute of Electrical and Electronics Engine...,"Chulalongkorn University, Thailand",Pratumsiri Teerapong;Janpugdee Panuwat
3,201800002,Parametric study of hydrogen production via so...,Computational fluid dynamics was applied for s...,2018-12-31,Journal,Circulatingfluidizedbed;Computationalfluiddyna...,Chemistry;Chemical Engineering;Engineering,42,Elsevier Ltd,"Chulalongkorn University, Thailand",Phuakpunk Kiattikhoon;Chalermsinsuwan Benjapon...
4,201800003,Superhydrophobic coating from fluoroalkylsilan...,A superhydrophobic/superoleophilic mesh was su...,2018-12-31,Journal,Encapsulation;Fluoroalkylsilane;Naturalrubber;...,Chemistry;Physics and Astronomy;Physics and As...,45,Elsevier B.V.,"Hirosaki University, Japan;Chulalongkorn Unive...",Saengkaew Jittraporn;Le Duy;Samart Chanatip;Sa...
...,...,...,...,...,...,...,...,...,...,...,...
20212,202302885,Long-chain bio-olefins production via oxidativ...,Long-chain α-olefins (≥ C10) are normally appl...,2023-01-01,Journal,Long-chainolefins;MesoporousKIT-6;Oleicacid;Ox...,Chemical Engineering;Chemistry,63,Elsevier B.V.,"Chulalongkorn University, Thailand;Center of E...",Le Duy;Chaidherasuwet Nattaporn;Rueangthaweep ...
20213,202302886,Recent Developments and Applications of Microf...,"Nowadays, food safety has become a major conce...",2023-01-01,Journal,Biologicalhazards;chemicalhazards;foodcontamin...,Chemistry,115,Taylor and Francis Ltd.,"Chulalongkorn University, Thailand",Alahmad Waleed;Varanusupakul Puttaruksa;Varanu...
20214,202302887,"Social justice, education and peacebuilding: c...",Education is increasingly becoming central to ...,2023-01-01,Journal,conflict;Education;peacebuilding;socialjustice...,Social Sciences,76,Routledge,"Chulalongkorn University, Thailand;UCL Institu...",Pherali Tejendra
20215,202302888,Effects of black soldier fly (Hermetia illucen...,The effects of replacing fish meal protein wit...,2023-01-01,Journal,Anabastestudineus;Blacksoldierfly;fishmealrepl...,Environmental Science;Agricultural and Biologi...,44,Taylor and Francis Ltd.,"Chulalongkorn University, Thailand;Khon Kaen U...",Mapanao Ratchaneegorn;Jiwyam Wirat;Nithikulwor...


Approximately Under 10 minutes

In [173]:
path_articles = r'data/scopus_data.csv'

df.to_csv(path_articles, index=False)