In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import json
from flatten_json import flatten
from datetime import datetime
from collections import defaultdict

In [3]:
directory = ['2018','2019','2020','2021','2022','2023']

In [5]:

def get_affiliation_groups(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    author_groups = data['abstracts-retrieval-response']['item']['bibrecord']['head']['author-group']
    try:
        flat_data = [flatten(item) for item in author_groups]
    except AssertionError as e:
        flat_data = [flatten(item) for item in [author_groups]]


    df = pd.DataFrame(flat_data)
    combined_organization = []
    for i in(df.columns):
        if("affiliation_organization" in i):
            combined_organization.append(i)

    df['combined_organization'] = df[combined_organization].apply(lambda row: ', '.join([str(val) for val in row if pd.notna(val)]), axis=1)

    return list(df['combined_organization'])


def process_file(file_path):
    """Process a single JSON file to extract year and subject areas."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract subject areas
        subject_areas = data.get('abstracts-retrieval-response', {}).get('subject-areas', {}).get('subject-area', [])

        return [(area['@abbrev']) for area in subject_areas]
    

def process_all_files(base_dir):
    """Process all files in the directory structure."""
    interested_areas = {'AGRI':[], 'MEDI':[], 'BIOC':[]}
    
    for year_folder in os.listdir(base_dir):
        year_path = os.path.join(base_dir, year_folder)
        if os.path.isdir(year_path):
            for file_name in os.listdir(year_path):
                file_path = os.path.join(year_path, file_name)
                if file_path.endswith('.json'):
                    # Process each JSON file
                    results = process_file(file_path)
                    for area in (set(results)):
                        if area in interested_areas:
                            interested_areas[area] += get_affiliation_groups(file_path)
                            interested_areas[area] = list(set(interested_areas[area]))

    return interested_areas
                    
# Base directory where project files are stored
base_dir = "/Users/im/Documents/Data Sci/Project/Project/chulaDatabase"

# Process all files and get subject area trends
affiliation_data = process_all_files(base_dir)

df = pd.DataFrame({key: pd.Series(value) for key, value in affiliation_data.items()})

df = df.dropna()
df = df.drop_duplicates()

df

Unnamed: 0,AGRI,MEDI,BIOC
0,,,
1,"Department of Plant Production Technology, Fac...","Pulmonary Medicine and Critical Care, King Chu...","Comenius University, Department of Biochemistry"
2,Scientific and Technological Research Equipmen...,Research Center for Smart Sustainable Circular...,"Division of Anatomy, School of Medical Science..."
3,Synchrotron Light Research Institute,"Department of Surgery, University Hospitals/Cl...",UKZN
4,"Glaucoma Research Unit, Faculty of Medicine, C...",State of Sao Paulo Workers' Welfare Institute,"School of Mathematics, Computer Science and En..."
...,...,...,...
4405,Kitakyushu Museum of Natural History and Human...,US Oncology Research,UMR 8576 - UGSF - Unité de Glycobiologie Struc...
4406,Institute for Problems of Ecology and Evolutio...,Center of Excellence in Natural Products Chemi...,"Abramson Cancer Center, University of Pennsylv..."
4407,Immunomodulation of Natural Products Research ...,"Department of Clinical Sciences, Gastroenterol...","School of Science, University of Phayao"
4408,Institut des Sciences de l’Evolution de Montpe...,St. Hedwig of Silesia Hospital,"Clinic of Neonatology, Department of Women, Mo..."
