# JSON Parsing and Processing

In [1]:
import json
import os

# Create the output folder up front; exist_ok=True keeps re-runs of this cell from failing.
os.makedirs(name='data/json_files', exist_ok=True)

In [4]:
# Sample company record: a list of employees (each with skills and projects)
# plus a mapping of department name -> head and budget.
json_data = {
    "company": "TechCorp",
    "employees": [
        {
            "id": 1,
            "name": "Alice",
            "role": "Engineer",
            "skills": ["Python", "Machine Learning", "Data Analysis"],
            "projects": [
                {"name": "Project A", "duration_months": 6},
                {"name": "Project B", "duration_months": 12},
            ],
        },
        {
            "id": 2,
            "name": "Bob",
            "role": "Designer",
            "skills": ["UI/UX", "Graphic Design"],
            "projects": [
                {"name": "Project C", "duration_months": 4},
                {"name": "Project D", "duration_months": 8},
            ],
        },
        {
            "id": 3,
            "name": "Charlie",
            "role": "Product Manager",
            "skills": ["Agile", "Scrum", "Leadership"],
            "projects": [
                {"name": "Project E", "duration_months": 10},
                {"name": "Project F", "duration_months": 14},
            ],
        },
    ],
    "departments": {
        "Engineering": {"head": "Diana", "budget": 500000},
        "Design": {"head": "Ethan", "budget": 200000},
        "Product": {"head": "Fiona", "budget": 300000},
    },
}

In [5]:
# A bare expression on the last line of a cell is echoed as the cell's output.
json_data

{'company': 'TechCorp',
 'employees': [{'id': 1,
   'name': 'Alice',
   'role': 'Engineer',
   'skills': ['Python', 'Machine Learning', 'Data Analysis'],
   'projects': [{'name': 'Project A', 'duration_months': 6},
    {'name': 'Project B', 'duration_months': 12}]},
  {'id': 2,
   'name': 'Bob',
   'role': 'Designer',
   'skills': ['UI/UX', 'Graphic Design'],
   'projects': [{'name': 'Project C', 'duration_months': 4},
    {'name': 'Project D', 'duration_months': 8}]},
  {'id': 3,
   'name': 'Charlie',
   'role': 'Product Manager',
   'skills': ['Agile', 'Scrum', 'Leadership'],
   'projects': [{'name': 'Project E', 'duration_months': 10},
    {'name': 'Project F', 'duration_months': 14}]}],
 'departments': {'Engineering': {'head': 'Diana', 'budget': 500000},
  'Design': {'head': 'Ethan', 'budget': 200000},
  'Product': {'head': 'Fiona', 'budget': 300000}}}

In [6]:
# Store the company dictionary on disk as JSON indented by two spaces.
with open('data/json_files/company_data.json', 'w') as company_file:
    company_file.write(json.dumps(json_data, indent=2))

In [7]:
# Sample event log for the JSON Lines format: one dictionary per event.
jsonl_data = [
    {
        "timestamp": "2024-01-01T10:00:00Z",
        "event": "login",
        "user_id": 101,
    },
    {
        "timestamp": "2024-01-01T10:05:00Z",
        "event": "view_page",
        "user_id": 101,
        "page": "/homepage",
    },
    {
        "timestamp": "2024-01-01T10:10:00Z",
        "event": "logout",
        "user_id": 101,
    },
]

# JSON Lines layout: each event becomes one JSON object on its own newline-terminated line.
with open('data/json_files/user_events.jsonl', 'w') as events_file:
    events_file.writelines(json.dumps(event) + '\n' for event in jsonl_data)

# JSON processing strategies

In [None]:
from langchain_community.document_loaders import JSONLoader
import json

# JSONLoader driven by a jq expression: one Document per matched JSON value
print("JSONLoader - Extract specific fields")

employee_loader = JSONLoader(
    file_path='data/json_files/company_data.json',
    # '.employees[]' iterates the "employees" array, yielding each employee object.
    jq_schema='.employees[]',
    # The matched values are objects rather than plain strings, so text_content is turned off.
    text_content=False,
)
employee_docs = employee_loader.load()

# Preview only the first 200 characters of the first document's content.
first_employee_preview = employee_docs[0].page_content[:200]
print(f"Loaded {len(employee_docs)} employee documents.")
print(f"First employee: {first_employee_preview}")

JSONLoader - Extract specific fields
Loaded 3 employee documents.
First employee: {"id": 1, "name": "Alice", "role": "Engineer", "skills": ["Python", "Machine Learning", "Data Analysis"], "projects": [{"name": "Project A", "duration_months": 6}, {"name": "Project B", "duration_mont


In [14]:
# Custom JSON parsing function

from typing import List
from langchain_core.documents import Document

def process_json_intelligently(file_path: str) -> List[Document]:
    """Convert each employee record of a company JSON file into a ``Document``.

    The file must contain a JSON object. Every entry of its ``employees``
    list is rendered as a short plain-text summary (name, role, skills and
    projects) and paired with metadata describing where it came from.

    Args:
        file_path: Path of the JSON file to read. It is also stored as the
            ``source`` value in every document's metadata.

    Returns:
        One ``Document`` per employee, in file order. The list is empty when
        the file has no ``employees`` key.

    Raises:
        KeyError: If an employee lacks ``id``, ``name`` or ``role``, or a
            project lacks ``name`` or ``duration_months``.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so files with non-ASCII characters load the same everywhere.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    documents = []

    for emp in data.get('employees', []):
        # "skills" and "projects" are both treated as optional lists; a
        # missing key yields an empty section rather than a KeyError.
        skills = ', '.join(emp.get('skills', []))
        project_details = "\n".join(
            f"- {proj['name']} ({proj['duration_months']} months)"
            for proj in emp.get('projects', [])
        )
        content = (
            f"Name: {emp['name']}\n"
            f"Role: {emp['role']}\n"
            f"Skills: {skills}\n"
            f"Projects:\n{project_details}\n"
        )
        documents.append(
            Document(
                page_content=content,
                metadata={
                    "source": file_path,
                    "data_type": "employee_record",
                    "employee_id": emp['id'],
                    "employee_name": emp['name'],
                    "role": emp['role'],
                },
            )
        )

    return documents


In [15]:
# Parse the saved company file with the custom function and display the resulting Documents.
employee_record_docs = process_json_intelligently('data/json_files/company_data.json')
employee_record_docs

[Document(metadata={'source': 'data/json_files/company_data.json', 'data_type': 'employee_record', 'employee_id': 1, 'employee_name': 'Alice', 'role': 'Engineer'}, page_content='Name: Alice\nRole: Engineer\nSkills: Python, Machine Learning, Data Analysis\nProjects:\n- Project A (6 months)\n- Project B (12 months)\n'),
 Document(metadata={'source': 'data/json_files/company_data.json', 'data_type': 'employee_record', 'employee_id': 2, 'employee_name': 'Bob', 'role': 'Designer'}, page_content='Name: Bob\nRole: Designer\nSkills: UI/UX, Graphic Design\nProjects:\n- Project C (4 months)\n- Project D (8 months)\n'),
 Document(metadata={'source': 'data/json_files/company_data.json', 'data_type': 'employee_record', 'employee_id': 3, 'employee_name': 'Charlie', 'role': 'Product Manager'}, page_content='Name: Charlie\nRole: Product Manager\nSkills: Agile, Scrum, Leadership\nProjects:\n- Project E (10 months)\n- Project F (14 months)\n')]