In [7]:
import json
import os
import pandas as pd


def _dig(node, *keys):
    """Follow *keys* through nested dicts.

    Returns None as soon as a level is missing, null, or not a dict, so a
    JSON ``null`` in the middle of a path never raises.
    """
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def _as_list(value):
    """Normalize a value that may be null, a single object, or a list into a list."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _institution_label(affiliation):
    """Return ``"<organization> (<country>)"`` for one affiliation dict, or None.

    When ``organization`` is a list, only its last entry is used.
    """
    if not isinstance(affiliation, dict):
        return None
    organization = affiliation.get('organization', [])
    if isinstance(organization, list):
        organization = organization[-1] if organization else None
    if not isinstance(organization, dict):
        return None
    institution_name = organization.get('$', '')
    if not institution_name:
        return None
    return f"{institution_name} ({affiliation.get('country', '')})"


# Function to extract data from a single JSON file
def extract_data(file_path, filename):
    """Read one JSON file and flatten the fields of interest into a record.

    The fields are looked up under the top-level
    ``'abstracts-retrieval-response'`` key. Any field that is missing, null,
    or has an unexpected shape is reported as None instead of raising.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).
        filename: Value stored (via ``str``) in the record's ``'Filename'`` key.

    Returns:
        A list containing a single dict with the keys ``Title``, ``Abstract``,
        ``Author``, ``Aggregation_Type``, ``Subject_Area``, ``Publisher``,
        ``Publication_Date``, ``Institutions``, ``References``, ``Keywords``
        and ``Filename``. Multi-valued fields are joined with ``"; "``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``UnicodeDecodeError`` and ``json.JSONDecodeError`` are both
            subclasses of ``ValueError``).
    """
    # Only the read needs the file handle; release it before processing.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    response = _dig(data, 'abstracts-retrieval-response')
    head = _dig(response, 'item', 'bibrecord', 'head')
    coredata = _dig(response, 'coredata')

    title = _dig(head, 'citation-title')
    abstract = _dig(head, 'abstracts')

    # 'author' may be a list, a single object, or absent/null.
    authors = []
    for author in _as_list(_dig(response, 'authors', 'author')):
        name = _dig(author, 'preferred-name', 'ce:indexed-name')
        if name:
            authors.append(name)

    # Subject areas are collected by their '@code' attribute.
    subject_areas = []
    for subject in _as_list(_dig(response, 'subject-areas', 'subject-area')):
        code = _dig(subject, '@code')
        if code:
            subject_areas.append(code)

    aggregation_type = _dig(coredata, 'prism:aggregationType')
    publisher = _dig(coredata, 'dc:publisher')
    publication_date = _dig(coredata, 'prism:coverDate')

    # Both 'author-group' and each group's 'affiliation' may be a single
    # object or a list of objects.
    institutions = []
    for group in _as_list(_dig(head, 'author-group')):
        for affiliation in _as_list(_dig(group, 'affiliation')):
            label = _institution_label(affiliation)
            if label:
                institutions.append(label)

    # Titles of the cited references.
    references = []
    cited = _dig(response, 'item', 'bibrecord', 'tail', 'bibliography', 'reference')
    for ref in _as_list(cited):
        ref_title = _dig(ref, 'ref-info', 'ref-title', 'ref-titletext')
        if ref_title:
            references.append(ref_title)

    # Author-supplied keywords.
    keywords = []
    for keyword in _as_list(_dig(response, 'authkeywords', 'author-keyword')):
        keyword_text = _dig(keyword, '$')
        if keyword_text:
            keywords.append(keyword_text)

    record = {
        'Title': title if title else None,
        'Abstract': abstract if abstract else None,
        'Author': "; ".join(authors) if authors else None,
        'Aggregation_Type': aggregation_type if aggregation_type else None,
        'Subject_Area': "; ".join(subject_areas) if subject_areas else None,
        'Publisher': publisher if publisher else None,
        'Publication_Date': publication_date if publication_date else None,
        'Institutions': "; ".join(institutions) if institutions else None,
        'References': "; ".join(references) if references else None,
        'Keywords': "; ".join(keywords) if keywords else None,
        'Filename': str(filename),
    }
    return [record]

# Run extract_data over every entry of a folder and tabulate the results
def extract_all_files(folder_path):
    """Build a DataFrame from every entry found in *folder_path*.

    Each directory entry is passed to ``extract_data``; the records it returns
    are accumulated in listing order. An entry that raises is reported on
    stdout and skipped, so one bad file does not abort the run.

    Args:
        folder_path: Directory whose entries are processed.

    Returns:
        A pandas DataFrame with one row per extracted record.
    """
    collected = []
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        try:
            # The bare entry name is forwarded so it ends up in the record.
            collected += extract_data(full_path, entry)
        except Exception as err:
            print(f"Error processing file: {entry}. Error: {err}")

    return pd.DataFrame(collected)


In [20]:
# Year sub-folder to process; reused below to name the output CSV.
year = '2023'
# Directory holding the raw JSON files for that year.
folder_path = f'/Users/pookansmacbookpro/Documents/CEDT Computer Engineering/Sem1_2024/2110403_DataSci/Project/Data_2018-2023/{year}/'
# One row per successfully parsed file.
df = extract_all_files(folder_path)

In [21]:
# Order the rows by source filename, then write them out as UTF-8 CSV
# without the DataFrame index column.
df.sort_values(by='Filename', ascending=True, inplace=True)
output_csv = f'/Users/pookansmacbookpro/Documents/CEDT Computer Engineering/Sem1_2024/2110403_DataSci/Project/ExtractedData/{year}.csv'
df.to_csv(output_csv, encoding='utf-8', index=False)