In [None]:
# Mount Google Drive into this runtime at /content/drive so the cells below
# can read and write files through ordinary filesystem paths.
# The google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re
import json
import os


In [None]:

# --- Directory holding the input CSV files (edit to match your own Drive layout) ---
folder_path = "/content/drive/MyDrive/Socioeconomic dataset/Education Attainment"

# --- Years to process: 2010 through 2023 inclusive ---
years = range(2010, 2023 + 1)

# --- Records collected across all years accumulate in this list ---
jsonl_data = list()


In [None]:
def _is_estimate_column(column_name):
    """Return True if "Estimate" is one of the ``!!``-separated parts of a header.

    Every header accepted by the substring test ``"!!Estimate!!" in col`` is
    still accepted; headers where "Estimate" is the first or last part are
    accepted as well. Headers without any ``!!`` separator are rejected.
    """
    parts = str(column_name).split("!!")
    return len(parts) > 1 and "Estimate" in parts


def _clean_estimate(raw_value):
    """Strip a cell down to digits, ``.`` and ``-``.

    Returns the cleaned text, or None when the cell is missing (None/NaN) or
    contains no digit at all (e.g. '-', '(X)', '' or any other text-only marker),
    so that callers never emit a fact with an empty or meaningless value.
    """
    if pd.isna(raw_value):
        return None
    cleaned = re.sub(r'[^\d.\-]', '', str(raw_value))
    if not any(ch.isdigit() for ch in cleaned):
        return None
    return cleaned


# Rebind to a fresh list so that re-running this cell does not append a second
# copy of every record to the list created in the configuration cell.
jsonl_data = []

for year in years:
    print(f"📦 Processing {year}...")

    data_file = os.path.join(folder_path, f"ACSST5Y{year}.S1501-Data.csv")
    meta_file = os.path.join(folder_path, f"ACSST5Y{year}.S1501-Column-Metadata.csv")

    # Read both files; a year whose files are missing or unreadable is skipped
    # (best effort) rather than aborting the whole run.
    try:
        # skiprows=1 drops the first line, so the second line becomes the header.
        df_data = pd.read_csv(data_file, skiprows=1)
        df_meta = pd.read_csv(meta_file)
    except Exception as e:
        print(f"❌ Skipping {year} due to error: {e}")
        continue

    # Only Estimate columns, no margin columns.
    # NOTE(review): presumably some vintages put "Estimate" first in the header
    # ("Estimate!!...") instead of in the middle ("...!!Estimate!!..."); the
    # segment-based test accepts both. Confirm against the CSV headers.
    estimate_cols = [col for col in df_data.columns if _is_estimate_column(col)]

    # Column-name -> label lookup, ignoring metadata rows where either is missing.
    labelled_meta = df_meta.dropna(subset=['Column Name', 'Label'])
    meta_dict = dict(zip(labelled_meta['Column Name'], labelled_meta['Label']))

    if not estimate_cols:
        # Make the silent "zero records for this year" case visible.
        print(f"⚠️ No estimate columns found for {year}; no entries added.")
        continue

    for _, row in df_data.iterrows():
        state = row['Geographic Area Name']
        facts = []

        for est_col in estimate_cols:
            # Fall back to the column header itself when the metadata has no entry.
            label = meta_dict.get(est_col, est_col)
            est_clean = _clean_estimate(row[est_col])

            # Skip missing / non-numeric cells instead of emitting "label: ".
            if est_clean is None:
                continue

            # Simply add the estimate value without margin of error
            facts.append(f"{str(label).strip()}: {est_clean}")

        if facts:
            full_text = f"In {year}, in {state}, the following statistics were recorded: " + "; ".join(facts) + "."
            jsonl_data.append({
                "year": year,
                "state": state,
                "text": full_text
            })

# === Save all years into one file ===
# The records are written as a single indented JSON array (not JSON Lines).
output_file = os.path.join(folder_path, "structured_to_text_all_years2.json")
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(jsonl_data, f, indent=4)

print(f"\n✅ Final JSON file created with {len(jsonl_data)} entries: structured_to_text_all_years2.json")

📦 Processing 2010...
📦 Processing 2011...
📦 Processing 2012...
📦 Processing 2013...
📦 Processing 2014...
📦 Processing 2015...
📦 Processing 2016...
📦 Processing 2017...
📦 Processing 2018...
📦 Processing 2019...
📦 Processing 2020...
📦 Processing 2021...
📦 Processing 2022...
📦 Processing 2023...

✅ Final JSON file created with 364 entries: structured_to_text_all_years2.json
