In [11]:
import pandas as pd
import os
from dataclasses import dataclass
from typing import List
import json

# Notebooks have no __file__, so every path is anchored at the directory the
# kernel was started from.
SCRIPT_DIR = os.getcwd()

# Input spreadsheets live in DATA_DIR; generated JSON goes to JSON_OUTPUT_DIR.
DATA_DIR = os.path.join(SCRIPT_DIR, "../data/ncsesTaxonomy")
JSON_OUTPUT_DIR = os.path.join(SCRIPT_DIR, "../data/taxonomyJson")

@dataclass
class DetailedCategory:
    """Leaf of the taxonomy: a single detailed category identified by name."""

    name: str

    def to_dict(self):
        """Return the serialised form of this leaf, which is its bare name."""
        serialised = self.name
        return serialised

@dataclass
class MajorCategory:
    """A major category holding the detailed categories listed under it."""

    name: str
    detailed_categories: List[DetailedCategory]

    def to_dict(self):
        """Return ``{name: [...]}`` with one serialised entry per detailed category."""
        serialised_details = []
        for entry in self.detailed_categories:
            serialised_details.append(entry.to_dict())
        return {self.name: serialised_details}

@dataclass
class BroadCategory:
    """A broad category holding the major categories listed under it."""

    name: str
    inner_categories: List[MajorCategory]

    def to_dict(self):
        """Return ``{name: {major name: major contents}}`` for this category."""
        majors_by_name = {}
        for major in self.inner_categories:
            # Each child returns a single-entry dict keyed by its own name;
            # unwrap it so the children merge into one mapping.
            majors_by_name[major.name] = major.to_dict()[major.name]
        return {self.name: majors_by_name}

@dataclass
class AreaCategory:
    """A top-level area holding the broad categories listed under it."""

    name: str
    broad_categories: List[BroadCategory]

    def to_dict(self):
        """Return ``{name: {broad name: broad contents}}`` for this area."""
        broads_by_name = {}
        for broad in self.broad_categories:
            # Unwrap each child's single-entry dict so the children merge
            # into one mapping under this area.
            broads_by_name[broad.name] = broad.to_dict()[broad.name]
        return {self.name: broads_by_name}

@dataclass
class CategoryHierarchy:
    """Root of the taxonomy: the ordered list of area categories."""

    area_categories: List[AreaCategory]

    def to_dict(self):
        """Return one mapping with an entry per area, keyed by area name."""
        areas_by_name = {}
        for area in self.area_categories:
            # Unwrap each area's single-entry dict into the shared mapping.
            areas_by_name[area.name] = area.to_dict()[area.name]
        return areas_by_name
    

# Read a single Excel file into a DataFrame
def read_excel_file(file_path):
    """Return the contents of the Excel file at *file_path* as a DataFrame.

    The file is read with ``pd.read_excel`` using its default options.
    """
    return pd.read_excel(file_path)

def get_ncses_taxonomy_data(data_dir):
    """Load every ``.xlsx`` file found directly inside *data_dir*.

    All matching spreadsheets are collected so that data files added later
    are picked up automatically.

    Args:
        data_dir: Path of the directory to scan (not searched recursively).

    Returns:
        A list with one DataFrame per spreadsheet, ordered by file name.
        The list is empty when the directory holds no matching file.
    """
    all_data = []
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the combined data is the same on every run and platform.
    for file in sorted(os.listdir(data_dir)):
        # Excel leaves "~$<name>.xlsx" lock files next to open workbooks;
        # they are not real spreadsheets and cannot be parsed.
        if file.startswith('~$') or not file.endswith('.xlsx'):
            continue
        all_data.append(read_excel_file(os.path.join(data_dir, file)))
    return all_data

def get_ncses_taxonomy_data_as_df(data_dir):
    """Load all spreadsheets in *data_dir* and stack them into one DataFrame.

    Args:
        data_dir: Directory passed on to ``get_ncses_taxonomy_data``.

    Returns:
        A single DataFrame with the rows of every loaded spreadsheet and a
        fresh 0..n-1 index.

    Raises:
        ValueError: If no spreadsheet was found in *data_dir*.
    """
    all_data = get_ncses_taxonomy_data(data_dir)
    if not all_data:
        # pd.concat would raise a bare "No objects to concatenate" ValueError
        # here; keep the exception type but say which directory was empty.
        raise ValueError(f"No .xlsx files found in {data_dir!r}")
    return pd.concat(all_data, ignore_index=True)

def _require_parent(parent, parent_level, level, name, index):
    """Return *parent*, or raise ValueError when the row has no parent yet."""
    if parent is None:
        raise ValueError(
            f"Row {index}: {level} category {name!r} has no enclosing "
            f"{parent_level} category"
        )
    return parent


def construct_category_hierarchy(df):
    """Build a CategoryHierarchy from the flattened taxonomy table.

    Rows are consumed in order. Column ``'Unnamed: 1'`` holds the level label
    (``'Area'``, ``'Broad'``, ``'Major'`` or ``'Detailed'``) and column
    ``'Table 4'`` holds the category name. Each row is attached to the most
    recent row of the next-higher level; rows with any other label are
    ignored.

    Args:
        df: DataFrame containing the two columns described above.

    Returns:
        A CategoryHierarchy holding the areas in the order they were read.

    Raises:
        ValueError: If a 'Broad', 'Major' or 'Detailed' row appears without
            an enclosing parent row of the next-higher level.
    """
    area_categories = []

    current_area = None
    current_broad = None
    current_major = None

    for index, row in df.iterrows():
        level = row['Unnamed: 1']
        name = row['Table 4']

        if level == 'Area':
            current_area = AreaCategory(name=name, broad_categories=[])
            area_categories.append(current_area)
            # A new area starts a fresh subtree: forget the previous area's
            # children so later rows cannot be attached to them by mistake.
            current_broad = None
            current_major = None

        elif level == 'Broad':
            parent = _require_parent(current_area, 'Area', level, name, index)
            current_broad = BroadCategory(name=name, inner_categories=[])
            parent.broad_categories.append(current_broad)
            # Likewise, majors of the previous broad category no longer apply.
            current_major = None

        elif level == 'Major':
            parent = _require_parent(current_broad, 'Broad', level, name, index)
            current_major = MajorCategory(name=name, detailed_categories=[])
            parent.inner_categories.append(current_major)

        elif level == 'Detailed':
            parent = _require_parent(current_major, 'Major', level, name, index)
            parent.detailed_categories.append(DetailedCategory(name=name))

    return CategoryHierarchy(area_categories=area_categories)

def category_hierarchy_to_json(category_hierarchy):
    """Serialise *category_hierarchy* to a JSON string indented by 4 spaces.

    The object's own ``to_dict`` method supplies the dictionary that is then
    encoded with ``json.dumps``.
    """
    return json.dumps(category_hierarchy.to_dict(), indent=4)

# Build the hierarchy from every spreadsheet in DATA_DIR and print it as JSON.
df = get_ncses_taxonomy_data_as_df(DATA_DIR)
category_hierarchy = construct_category_hierarchy(df)
category_hierarchy_json = category_hierarchy_to_json(category_hierarchy)
print(category_hierarchy_json)

json_file_path = os.path.join(JSON_OUTPUT_DIR, 'taxonomy_hierarchy.json')

# Writing the file is opt-in: set this to True to persist the JSON to disk.
SAVE_JSON = False

if SAVE_JSON:
    # Create the output directory on first use so the write cannot fail on a
    # missing folder.
    os.makedirs(JSON_OUTPUT_DIR, exist_ok=True)
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json_file.write(category_hierarchy_json)
    print(f"JSON data has been saved to {json_file_path}")
else:
    # Only claim the file was written when it actually was.
    print(f"JSON data was not saved; set SAVE_JSON = True to write {json_file_path}")


{
    "Science and engineering": {
        "Agricultural sciences and natural resources ": {
            "Agricultural, animal, plant, and veterinary sciences": [
                "Agronomy and crop science",
                "Animal sciences",
                "Food science and technology",
                "Plant sciences",
                "Soil sciences",
                "Veterinary biomedical and clinical sciences",
                "Agricultural, animal, plant, and veterinary sciences nec"
            ],
            "Natural resources and conservation": [
                "Environmental science",
                "Environmental/ natural resources management and policy",
                "Forestry",
                "Natural resources conservation and research",
                "Natural resources and conservation nec"
            ]
        },
        "Biological and biomedical sciences": {
            "Biochemistry, biophysics, and molecular biology": [
                "Biochemistry",
     