In [33]:
import json
import os
import pandas as pd

# Define the folder path where your files are located
folder_path = '/Users/pookansmacbookpro/Documents/CEDT Computer Engineering/Sem1_2024/2110403_DataSci/Project/Data_2018-2023/2023/'

# Function to extract data from a single JSON file
def extract_data(file_path, filename):
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Extract relevant information from the JSON structure
    records = []

    item = data.get('abstracts-retrieval-response', {}).get('item', {})

    # Extract relevant fields with default values for missing data
    title = item.get('bibrecord', {}).get('head', {}).get('citation-title', '')
    abstract = item.get('bibrecord', {}).get('head', {}).get('abstracts', '')

    authors = []
    for author in data.get('abstracts-retrieval-response', {}).get('authors', {}).get('author', []):
        name = author.get('preferred-name', {}).get('ce:indexed-name', '')
        if name:
            authors.append(name)

    # Initialize list to hold all classification codes
    subject_areas = []
    subject_area_groups = data.get('abstracts-retrieval-response', {}).get('subject-areas', {})
    if subject_area_groups and isinstance(subject_area_groups.get('subject-area'), list):
        for subject in subject_area_groups['subject-area']:
            if isinstance(subject, dict):
                subject_text = subject.get('@code')
                if subject_text:
                    subject_areas.append(subject_text)

    # Extract date of publication
    publication_date = data.get('abstracts-retrieval-response', {}).get('coredata', {}).get('prism:coverDate', '')

    # Process author affiliations
    affiliations = set()
    author_group = item.get('bibrecord', {}).get('head', {}).get('author-group', {})

    # Check if author_group is a list; if not, wrap it in a list to iterate over it
    if isinstance(author_group, dict):
        author_group = [author_group]  # Convert single dictionary to a list

    for group in author_group:
        affiliation = group.get('affiliation', {})
        affiliation_id = affiliation.get('@afid', '')
        if affiliation_id:
            affiliations.add(affiliation_id)

    # Extract citations/references
    references = []
    tail = item.get('bibrecord', {}).get('tail', None)
    if tail and isinstance(tail, dict):
        bibliography = tail.get('bibliography', {})
        reference = bibliography.get('reference', {})
        if isinstance(reference, dict):  # Check if reference is a dictionary
            ref_title = reference.get('ref-info', {}).get('ref-title', {}).get('ref-titletext', '')
            if ref_title:
                references.append(ref_title)
        elif isinstance(reference, list):
            for ref in reference:
                ref_title = ref.get('ref-info', {}).get('ref-title', {}).get('ref-titletext', '')
                if ref_title:
                    references.append(ref_title)

    # Extract keywords
    keywords = []
    authkeywords = data.get('abstracts-retrieval-response', {}).get('authkeywords')

    if authkeywords and isinstance(authkeywords.get('author-keyword'), list):
        for keyword in authkeywords['author-keyword']:
            if isinstance(keyword, dict):
                keyword_text = keyword.get('$')
                if keyword_text:
                    keywords.append(keyword_text)

    records.append({
        'Title': str(title) if title else None,
        'Abstract': str(abstract) if abstract else None,
        'Author': "; ".join(authors) if authors else None,
        'Subject_Area': "; ".join(subject_areas) if subject_areas else None,
        'Publication_Date': str(publication_date) if publication_date else None,
        'Affiliations_ID': "; ".join(affiliations) if affiliations else None,
        'References': "; ".join(references) if references else None,
        'Keywords': "; ".join(keywords) if keywords else None,
        'Filename': str(filename)  # Add filename to each record
    })

    return records

# Loop through all JSON files in the specified folder and extract data
all_data = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    try:
        data = extract_data(file_path, filename)  # Pass filename to the function
        all_data.extend(data)
    except Exception as e:
        print(f"Error processing file: {filename}. Error: {e}")

# Create a DataFrame from the extracted data
df = pd.DataFrame(all_data)
df.shape


(2890, 9)

In [23]:
df.isnull().sum()

Title                 0
Abstract            106
Author                0
Subject_Area          0
Publication_Date      0
Affiliations_ID       0
References          227
Keywords            633
Filename              0
dtype: int64

In [15]:
df[df['References'].isnull()]

Unnamed: 0,Title,Abstract,Author,Subject_Area,Publication_Date,Affiliations_ID,References,Keywords,Filename
5,Optimization of cinnamon oil nanoemulsions usi...,"© 2017 Elsevier Inc.Essential oils, such as th...",Chuesiang P.; Siripatrawan U.; Sanguandeekul R...,2504; 2502; 2508; 1505,2018-03-15,60014313; 60028190,,Cinnamon oil; Nanoemulsions; Natural product; ...,201801734
12,WS2 and C-TiO2 Nanorods Acting as Effective Ch...,"© 2018 Wiley-VCH Verlag GmbH & Co. KGaA, Weinh...",Yang C.; Qin J.; Rajendran S.; Zhang X.; Liu R.,2304; 1500; 2500; 2100,2018-12-11,60091507; 60030782; 60018465,,charge separators; g-C3N4 composite; hydrogen;...,201800044
21,Rational design and synthesis of SiC/TiC@SiOx/...,© The Royal Society of Chemistry.Rational desi...,Li C.; Qin J.; Sawangphruk M.; Zhang X.; Liu R.,2504; 1503; 2503; 1600; 2508; 2506; 2505,2018-01-01,60091507; 60110787; 60018465,,,201802348
43,ALMA twenty-six arcmin2 survey of GOODS-S at o...,© The Author(s) 2018.We present the survey des...,Hatsukade B.; Kohno K.; Yamaguchi Y.; Umehata ...,3103; 1912,2018-12-01,60030699; 60000264; 60007171; 60103895; 600235...,,Cosmology: observations; Galaxies: evolution; ...,201800088
45,Synthesis of PET-PLA copolymer from recycle pl...,© 2017 Elsevier Ltd. All rights reserved.In th...,Buasri A.; Ongmali D.; Sriboonpeng P.; Prompan...,2500,2018-01-01,60018809; 60199582,,Copolymerization; Electrochromic devices; Grap...,201802189
...,...,...,...,...,...,...,...,...,...
2741,Syntheses and UV spectroscopic study of mono-a...,"© 2018, Chiang Mai University. All rights rese...",Cheewawisuttichai T.; Khamkaew L.; Tantayanon ...,1600; 2600; 2500; 1300; 3100,2018-09-01,60021099; 60028190,,Dialkyloxycoumarin; Dimerization; Monoalkyloxy...,201800667
2749,Green transportation system to promote sustain...,"© The Authors, published by EDP Sciences, 2018...",Pinthong J.; Limsuwan K.; Stitmannaithum B.,2300; 2100; 1900,2018-08-08,60028190,,,201800832
2753,"Correction to: Cypripedin, a phenanthrenequino...","© 2018, The Japanese Society of Pharmacognosy ...",Wattanathamsan O.; Treesuwan S.; Sritularak B....,1313,2018-06-01,60028190,,,201801313
2756,Pyrrolidinyl PNA polypyrrole/silver nanofoam e...,© 2017 Elsevier B.V.A label-free electrochemic...,Kangkamano T.; Numnuam A.; Limbut W.; Kanathar...,1305; 1304; 2204; 1603,2018-04-15,60006314; 60028190,,Label-free electrochemical sensor; miRNA-21; P...,201801545


In [34]:
df.sort_values('Filename', ascending=True, inplace=True)
df.to_csv('/Users/pookansmacbookpro/Documents/CEDT Computer Engineering/Sem1_2024/2110403_DataSci/Project/ExtractedData/2023.csv', encoding='utf-8' ,index=False)