In [1]:
import json
import os
from pathlib import Path
import pandas as pd


In [None]:
# Data_cleaning: this is the first step, cleaning the spending data and combining it into single files. Once this notebook has been run, we can move on to the data_processing steps.

In [4]:
def combine_spending_for_year(year, project_name="WeThePeopleAudit"):
    """Merge every JSON file of one year into ``data/spending/combined_<year>.json``.

    The project root is the nearest directory named ``project_name``: the
    current working directory, one of its ancestors, or (as a last resort)
    the first match found while walking the user's home directory.

    Args:
        year: Name of the year folder under ``data/spending``. A string or
            an int is accepted (``"2022"`` or ``2022``).
        project_name: Directory name that marks the project root.

    Returns:
        Path of the combined JSON file that was written, or ``None`` when the
        year folder contains no ``*.json`` files.

    Raises:
        FileNotFoundError: If the project root or the year folder is missing.
    """
    # Path's "/" operator rejects ints, so normalise the year to text first.
    year = str(year)

    # Look for the project root in the working directory and its ancestors.
    current_path = Path.cwd()
    project_root = None
    for candidate in (current_path, *current_path.parents):
        if candidate.name == project_name:
            project_root = candidate
            break

    # Last resort: walk the home directory and take the first folder that
    # contains a sub-directory with the project name. This can be slow.
    if project_root is None:
        for root, dirs, _ in os.walk(Path.home()):
            if project_name in dirs:
                project_root = Path(root) / project_name
                break

    if project_root is None:
        raise FileNotFoundError(f"Project root '{project_name}' not found")

    year_path = project_root / "data" / "spending" / year
    if not year_path.exists():
        raise FileNotFoundError(f"{year} not found")

    # The output lives next to the year folders, so the glob below never
    # picks up a previously written combined file.
    output_path = project_root / "data" / "spending" / f"combined_{year}.json"
    json_files = sorted(year_path.glob("*.json"))
    total_files = len(json_files)

    if total_files == 0:
        print(f"No files found for {year}")
        return None

    print(f"Processing {total_files} files")
    combined_data = []

    for i, json_file in enumerate(json_files):
        try:
            print(f"File {i+1}/{total_files}: {json_file.name}")
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            # Best effort: one unreadable or malformed file must not abort
            # the whole year, so report it and carry on.
            print(f"Error: {json_file.name} - {e}")
            continue

        if isinstance(data, list):
            combined_data.extend(data)
        elif isinstance(data, dict):
            combined_data.append(data)
        else:
            # Previously dropped without a trace; make the skip visible.
            print(
                f"Skipped: {json_file.name} - "
                f"unsupported top-level JSON type {type(data).__name__}"
            )

    print(f"Saving {len(combined_data)} records")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, indent=2)

    print(f"Complete: {output_path}")
    return output_path

In [7]:
def combine_all_spending_years(project_name="WeThePeopleAudit"):
    """Run ``combine_spending_for_year`` for each year folder under ``data/spending``.

    A year folder is any sub-directory of ``data/spending`` whose name is made
    up of digits only. Years are processed in sorted (string) order. A failure
    in one year is printed and does not stop the remaining years.

    Args:
        project_name: Directory name that marks the project root. It is looked
            up in the working directory, then its ancestors, then by walking
            the user's home directory.

    Raises:
        FileNotFoundError: If no directory named ``project_name`` is found.
    """
    cwd = Path.cwd()

    # Working directory first, then each ancestor from nearest to farthest.
    located = None
    for candidate in (cwd, *cwd.parents):
        if candidate.name == project_name:
            located = candidate
            break

    # Fallback: first folder under the home directory that holds the project.
    if located is None:
        for walk_root, subdirs, _ in os.walk(Path.home()):
            if project_name in subdirs:
                located = Path(walk_root) / project_name
                break

    if located is None:
        raise FileNotFoundError(f"Project root '{project_name}' not found")

    spending_root = located / "data" / "spending"

    # Sort once and reuse the ordering for both the summary and the loop.
    years = sorted(
        entry.name
        for entry in spending_root.iterdir()
        if entry.is_dir() and entry.name.isdigit()
    )

    print(f"Found years: {', '.join(years)}")

    for year in years:
        print(f"\n{'='*50}\nProcessing year: {year}\n{'='*50}")
        try:
            combine_spending_for_year(year, project_name)
        except Exception as e:
            print(f"Error processing {year}: {e}")

    print(f"\n{'='*50}\nAll years processed\n{'='*50}")


In [8]:
# Write data/spending/combined_<year>.json for every numeric year folder under data/spending.
combine_all_spending_years()

Found years: 2022, 2023, 2024

Processing year: 2022
Found 12 JSON files to combine
[8.3%] Processing 1/12: dataset_2022_01_20250315T070723.000.json
  Added 230792 records (list)
[16.7%] Processing 2/12: dataset_2022_02_20250315T070723.000.json
  Added 222471 records (list)
[25.0%] Processing 3/12: dataset_2022_03_20250315T070723.000.json
  Added 245010 records (list)
[33.3%] Processing 4/12: dataset_2022_04_20250315T070723.000.json
  Added 219674 records (list)
[41.7%] Processing 5/12: dataset_2022_05_20250315T070723.000.json
  Added 243518 records (list)
[50.0%] Processing 6/12: dataset_2022_06_20250315T070723.000.json
  Added 243197 records (list)
[58.3%] Processing 7/12: dataset_2022_07_20250315T070723.000.json
  Added 127296 records (list)
[66.7%] Processing 8/12: dataset_2022_08_20250315T070723.000.json
  Added 54342 records (list)
[75.0%] Processing 9/12: dataset_2022_09_20250315T070723.000.json
  Added 7154 records (list)
[83.3%] Processing 10/12: dataset_2022_10_20250315T07072

In [10]:

def combine_spending_files(project_name="WeThePeopleAudit", output_base="combined_spending"):
    """Merge the JSON files in ``data/spending`` into one JSON file, one CSV and a DataFrame.

    Only files directly inside ``data/spending`` are read (no recursion), and
    any file whose name starts with ``output_base`` is left out so earlier
    outputs of this function are not merged back in. Files that cannot be
    read or parsed are reported and skipped.

    Args:
        project_name: Directory name that marks the project root. It is looked
            up in the working directory, then its ancestors, then by walking
            the user's home directory.
        output_base: File name stem used for the ``.json`` and ``.csv`` outputs.

    Returns:
        A DataFrame built from all collected records, or ``None`` when there
        are no input files or no records could be loaded.

    Raises:
        FileNotFoundError: If the project root or ``data/spending`` is missing.
    """
    cwd = Path.cwd()

    # Working directory first, then each ancestor from nearest to farthest.
    project_dir = None
    for candidate in (cwd, *cwd.parents):
        if candidate.name == project_name:
            project_dir = candidate
            break

    # Fallback: first folder under the home directory that holds the project.
    if project_dir is None:
        for walk_root, subdirs, _ in os.walk(Path.home()):
            if project_name in subdirs:
                project_dir = Path(walk_root) / project_name
                break

    if project_dir is None:
        raise FileNotFoundError(f"Can't find project folder '{project_name}'")

    spend_dir = project_dir / "data" / "spending"
    if not spend_dir.exists():
        raise FileNotFoundError(f"No spending folder at {spend_dir}")

    json_out = spend_dir / f"{output_base}.json"
    csv_out = spend_dir / f"{output_base}.csv"

    files = []
    for found in spend_dir.glob("*.json"):
        if found.is_file() and not found.name.startswith(output_base):
            files.append(found)

    if not files:
        print(f"No JSON files in {spend_dir}")
        return None

    print(f"Found {len(files)} files")

    all_data = []
    for i, f in enumerate(sorted(files)):
        try:
            print(f"File {i+1}/{len(files)}: {f.name}")
            with open(f, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            # A list contributes its items; a dict counts as a single record.
            if isinstance(payload, list):
                all_data.extend(payload)
            elif isinstance(payload, dict):
                all_data.append(payload)
        except Exception as e:
            print(f"Problem with {f.name}: {e}")

    if not all_data:
        print("Couldn't get any data from the files")
        return None

    print(f"Creating dataframe from {len(all_data)} records")
    df = pd.DataFrame(all_data)

    print(f"Saving JSON")
    with open(json_out, 'w', encoding='utf-8') as sink:
        json.dump(all_data, sink, indent=2)

    print(f"Saving CSV")
    df.to_csv(csv_out, index=False)

    print(f"Done! Combined {len(files)} files:")
    print(f"- JSON: {json_out}")
    print(f"- CSV: {csv_out}")
    print(f"- Shape: {df.shape}")

    return df

In [11]:
# Merge the JSON files in data/spending into one DataFrame; the same records are also written to disk as JSON and CSV.
df = combine_spending_files()
# combine_spending_files returns None when it finds no input files or loads no records, so guard the preview.
if df is not None:
    print(df.head())

Found 3 JSON files in spending directory
[1/3] Processing: combined_2022.json
  Added 1594613 records (list)
[2/3] Processing: combined_2023.json
  Added 1569395 records (list)
[3/3] Processing: combined_2024.json
  Added 1590018 records (list)
Converting 4754026 records to DataFrame
Saving combined JSON to /Users/phatngo/Desktop/WeThePeopleAudit/data/spending/combined_spending.json
Saving CSV to /Users/phatngo/Desktop/WeThePeopleAudit/data/spending/combined_spending.csv
✅ Complete! Combined 3 files into:
  - JSON: /Users/phatngo/Desktop/WeThePeopleAudit/data/spending/combined_spending.json
  - CSV: /Users/phatngo/Desktop/WeThePeopleAudit/data/spending/combined_spending.csv
  - DataFrame shape: (4754026, 25)
                base_id budget_fiscal_year fiscal_period  \
0  B0000000000360065574               2022             7   
1  B0000000000360065575               2022             7   
2  B0000000000360067383               2022             7   
3  B0000000000360072140               2022

In [2]:
# Needs `df` from the `df = combine_spending_files()` cell; on a fresh kernel, run that cell first.
df.columns

NameError: name 'df' is not defined

In [13]:
# Transpose the first rows so each column name becomes a row, which is easier to read for a wide frame.
df.head().T

Unnamed: 0,0,1,2,3,4
base_id,B0000000000360065574,B0000000000360065575,B0000000000360067383,B0000000000360072140,B0000000000360072141
budget_fiscal_year,2022,2022,2022,2022,2022
fiscal_period,7,7,7,7,7
date,2022-01-02T00:00:00.000,2022-01-02T00:00:00.000,2022-01-02T00:00:00.000,2022-01-02T00:00:00.000,2022-01-02T00:00:00.000
cabinet_secretariat,ENVIRONMENTAL AFFAIRS,ENVIRONMENTAL AFFAIRS,ENVIRONMENTAL AFFAIRS,ENVIRONMENTAL AFFAIRS,ENVIRONMENTAL AFFAIRS
department,DEPARTMENT OF FISH AND GAME (FWE),DEPARTMENT OF FISH AND GAME (FWE),DEPARTMENT OF FISH AND GAME (FWE),DEPARTMENT OF FISH AND GAME (FWE),DEPARTMENT OF FISH AND GAME (FWE)
appropriation_type,(3TN) TRUSTS,(1CS) DIRECT APPROPRIATIONS/SUBSIDIARIZED,(3TN) TRUSTS,(3TN) TRUSTS,(3TN) TRUSTS
appropriation_name,(23001300) DFG ECOLOGICAL MITIGATION TRUST,(23000100) OFFICE OF COMMISSIONER-ADMINISTRATION,(23001300) DFG ECOLOGICAL MITIGATION TRUST,(23001300) DFG ECOLOGICAL MITIGATION TRUST,(23001300) DFG ECOLOGICAL MITIGATION TRUST
object_class,(AA) REGULAR EMPLOYEE COMPENSATION,(AA) REGULAR EMPLOYEE COMPENSATION,(AA) REGULAR EMPLOYEE COMPENSATION,(AA) REGULAR EMPLOYEE COMPENSATION,(AA) REGULAR EMPLOYEE COMPENSATION
object_code,(A01) SALARIES: INCLUSIVE,(AA1) SALARIES: SUPPLEMENTAL,(A01) SALARIES: INCLUSIVE,(A01) SALARIES: INCLUSIVE,(AA1) SALARIES: SUPPLEMENTAL
