# **Basic JSON File Handling In Pandas**

## **Flattening the nested JSON content in noob way**

In [1]:
import pandas as pd
import json

In [8]:
# In order to use 'json' package, JSON data is to be passed as string into a var

json_data = '''{
  "user_id": 101,
  "name": "Alice",
  "contact": {
    "email": "alice@example.com",
    "phone": "123-456-7890"
  },
  "address": {
    "street": "123 Maple St.",
    "city": "Springfield",
    "zip": "12345"
  },
  "orders": [
    {
      "order_id": 1001,
      "order_date": "2023-01-01",
      "items": [
        {"item_id": "A1", "item_name": "Laptop", "price": 800},
        {"item_id": "A2", "item_name": "Mouse", "price": 25}
      ]
    },
    {
      "order_id": 1002,
      "order_date": "2023-01-10",
      "items": [
        {"item_id": "B1", "item_name": "Phone", "price": 500},
        {"item_id": "B2", "item_name": "Charger", "price": 30}
      ]
    }
  ]
}'''

In [12]:
data = json.loads(json_data)  # Parse above JSON string in var 'data' into Python object

In [14]:
df_main = pd.json_normalize(data, sep = '_')  # Flatten the JSON data

In [15]:
df_main

Unnamed: 0,user_id,name,orders,contact_email,contact_phone,address_street,address_city,address_zip
0,101,Alice,"[{'order_id': 1001, 'order_date': '2023-01-01'...",alice@example.com,123-456-7890,123 Maple St.,Springfield,12345


In [18]:
df_expanded = df_main.explode('orders')  # col 'orders' in above o/p is flattened into separate rows

In [19]:
df_expanded

Unnamed: 0,user_id,name,orders,contact_email,contact_phone,address_street,address_city,address_zip
0,101,Alice,"{'order_id': 1001, 'order_date': '2023-01-01',...",alice@example.com,123-456-7890,123 Maple St.,Springfield,12345
0,101,Alice,"{'order_id': 1002, 'order_date': '2023-01-10',...",alice@example.com,123-456-7890,123 Maple St.,Springfield,12345


In [22]:
orders_df = pd.json_normalize(df_expanded['orders'])

In [23]:
orders_df

Unnamed: 0,order_id,order_date,items
0,1001,2023-01-01,"[{'item_id': 'A1', 'item_name': 'Laptop', 'pri..."
1,1002,2023-01-10,"[{'item_id': 'B1', 'item_name': 'Phone', 'pric..."


In [25]:
items_df = orders_df.explode('items')

In [26]:
items_df

Unnamed: 0,order_id,order_date,items
0,1001,2023-01-01,"{'item_id': 'A1', 'item_name': 'Laptop', 'pric..."
0,1001,2023-01-01,"{'item_id': 'A2', 'item_name': 'Mouse', 'price..."
1,1002,2023-01-10,"{'item_id': 'B1', 'item_name': 'Phone', 'price..."
1,1002,2023-01-10,"{'item_id': 'B2', 'item_name': 'Charger', 'pri..."


In [27]:
pre_final_df = pd.json_normalize(items_df['items'])

In [28]:
pre_final_df

Unnamed: 0,item_id,item_name,price
0,A1,Laptop,800
1,A2,Mouse,25
2,B1,Phone,500
3,B2,Charger,30


In [29]:
items_df = items_df.drop(columns = 'items').reset_index(drop=True)
items_df

Unnamed: 0,order_id,order_date
0,1001,2023-01-01
1,1001,2023-01-01
2,1002,2023-01-10
3,1002,2023-01-10


In [30]:
df_expanded = df_expanded.drop(columns = 'orders').reset_index(drop=True)
df_expanded

Unnamed: 0,user_id,name,contact_email,contact_phone,address_street,address_city,address_zip
0,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345
1,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345


In [35]:
final_df = pd.concat([df_expanded, items_df, pre_final_df], axis = 1)
final_df

Unnamed: 0,user_id,name,contact_email,contact_phone,address_street,address_city,address_zip,order_id,order_date,item_id,item_name,price
0,101.0,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345.0,1001,2023-01-01,A1,Laptop,800
1,101.0,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345.0,1001,2023-01-01,A2,Mouse,25
2,,,,,,,,1002,2023-01-10,B1,Phone,500
3,,,,,,,,1002,2023-01-10,B2,Charger,30


In [36]:
final_df.ffill()

Unnamed: 0,user_id,name,contact_email,contact_phone,address_street,address_city,address_zip,order_id,order_date,item_id,item_name,price
0,101.0,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1001,2023-01-01,A1,Laptop,800
1,101.0,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1001,2023-01-01,A2,Mouse,25
2,101.0,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1002,2023-01-10,B1,Phone,500
3,101.0,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1002,2023-01-10,B2,Charger,30


## **It is impractical to create sub-frames repeatedly for every un-normalized/un-exploded column. One way to tackle this is as ahead:**

In [58]:
data = json.loads(json_data)

In [59]:
df = pd.json_normalize(data, sep = '_')  # cols 'contact' and 'address' are flattened
df

Unnamed: 0,user_id,name,orders,contact_email,contact_phone,address_street,address_city,address_zip
0,101,Alice,"[{'order_id': 1001, 'order_date': '2023-01-01'...",alice@example.com,123-456-7890,123 Maple St.,Springfield,12345


In [60]:
df_orders = pd.json_normalize(data, record_path = 'orders', record_prefix = 'orders_', meta = ['user_id', 'name'], sep = '_')
df_orders

Unnamed: 0,orders_order_id,orders_order_date,orders_items,user_id,name
0,1001,2023-01-01,"[{'item_id': 'A1', 'item_name': 'Laptop', 'pri...",101,Alice
1,1002,2023-01-10,"[{'item_id': 'B1', 'item_name': 'Phone', 'pric...",101,Alice


In [61]:
df_items = pd.json_normalize(df_orders['orders_items'].explode())
df_items

Unnamed: 0,item_id,item_name,price
0,A1,Laptop,800
1,A2,Mouse,25
2,B1,Phone,500
3,B2,Charger,30


In [62]:
df = df.drop(columns = ['user_id', 'name', 'orders']).reset_index(drop=True)
df

Unnamed: 0,contact_email,contact_phone,address_street,address_city,address_zip
0,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345


In [63]:
df_orders = df_orders.drop(columns=['orders_items']).reset_index(drop=True)

In [66]:
df_final = pd.concat([df_orders, df_items, df], axis=1)

In [70]:
df_final.ffill()

Unnamed: 0,orders_order_id,orders_order_date,user_id,name,item_id,item_name,price,contact_email,contact_phone,address_street,address_city,address_zip
0,1001.0,2023-01-01,101,Alice,A1,Laptop,800,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345
1,1002.0,2023-01-10,101,Alice,A2,Mouse,25,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345
2,1002.0,2023-01-10,101,Alice,B1,Phone,500,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345
3,1002.0,2023-01-10,101,Alice,B2,Charger,30,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345


## **Hardcoding recursive exploding and flattening the JSON file's content**

In [3]:
import pandas as pd

In [8]:
# JSON
nested_json = {
    "user_id": 101,
    "name": "Alice",
    "contact": {
        "email": "alice@example.com",
        "phone": "123-456-7890"
    },
    "address": {
        "street": "123 Maple St.",
        "city": "Springfield",
        "zip": "12345"
    },
    "orders": [
        {
            "order_id": 1001,
            "order_date": "2023-01-01",
            "items": [
                {"item_id": "A1", "item_name": "Laptop", "price": 800},
                {"item_id": "A2", "item_name": "Mouse", "price": 25}
            ]
        },
        {
            "order_id": 1002,
            "order_date": "2023-01-10",
            "items": [
                {"item_id": "B1", "item_name": "Phone", "price": 500},
                {"item_id": "B2", "item_name": "Charger", "price": 30}
            ]
        }
    ]
}

In [9]:
# def recursive_flatten(data, parent=None):
#     """
#     Recursively flatten nested JSON into a list of flat dicts,
#     generating one row per leaf (especially from lists).
#     """
#     rows = []
#     def _flatten(obj, prefix={}):
#         if isinstance(obj, dict):
#             for k, v in obj.items(): _flatten(v, {**prefix, k: v} if not isinstance(v, (dict, list)) else prefix)
#         elif isinstance(obj, list):
#             for i in obj: _flatten(i, prefix.copy())
#         else: rows.append(prefix)
#     _flatten(data, parent or {})
#     return rows

# Custom version to support better key tracking
def flatten_json_rows(data, parent_key='', sep='_'):
    """
    Recursive fn flattens nested JSON into multiple rows. Handles lists as row generators, dicts as column expansions.
    """
    if isinstance(data, dict):
        rows = [{}]
        for key, value in data.items():
            new_key = f"{parent_key}{sep}{key}" if parent_key else key
            flattened = flatten_json_rows(value, new_key, sep=sep)
            # Combine with current rows
            new_rows = []
            for row in rows:
                for flat in flattened: new_rows.append({**row, **flat})
            rows = new_rows
        return rows
    elif isinstance(data, list):
        rows = []
        for item in data: rows.extend(flatten_json_rows(item, parent_key, sep=sep))
        return rows
    else: return [{parent_key: data}]   # base case

# Applying the custom functions on our JSON data in var 'nested_json'

# First flatten user + order level
order_rows = flatten_json_rows(nested_json)

# Now extract and flatten each item into its own row
final_rows = []
for row in order_rows:
    # Find keys that point to individual items
    item_keys = [k for k in row.keys() if 'items' in k and isinstance(row[k], dict)]
    if not item_keys:
        final_rows.append(row)
    else:
        for item_key in item_keys:
            item_data = row[item_key]
            item_row = {k: v for k, v in row.items() if not k.startswith('orders_') or 'items' not in k}
            # Flatten the item dictionary
            for k, v in item_data.items():
                item_row[f'item_{k}'] = v
            final_rows.append(item_row)

# Convert to DataFrame
df = pd.DataFrame(final_rows)

In [10]:
# Show output
df

Unnamed: 0,user_id,name,contact_email,contact_phone,address_street,address_city,address_zip,orders_order_id,orders_order_date,orders_items_item_id,orders_items_item_name,orders_items_price
0,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1001,2023-01-01,A1,Laptop,800
1,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1001,2023-01-01,A2,Mouse,25
2,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1002,2023-01-10,B1,Phone,500
3,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1002,2023-01-10,B2,Charger,30


## **Using  ```json_normalize()``` and ```explode()``` functions to recursively flatten JSON content**

In [11]:
# Run the 'nested_json' code cell above first and then proceed

In [12]:
import pandas as pd

In [15]:
# Recursively flattens a DataFrame that may contain nested dicts or lists of dicts. Uses 'pd.json_normalize()' and 'df.explode()'

def recursive_normalize(df, sep='_'):
    while True:
        # Step 1: Find cols that are lists of dicts → need explode + normalize
        list_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, list)).any()]
        dict_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, dict)).any()]

        # Step 2: Explode list columns
        for col in list_cols: df = df.explode(col, ignore_index = True)

        # Step 3: Normalize dict columns
        for col in dict_cols:
            # Normalize nested dict
            norm = pd.json_normalize(df[col].dropna().tolist(), sep = sep)
            norm.columns = [f"{col}{sep}{subcol}" for subcol in norm.columns]
            df = df.drop(columns=[col]).join(norm)

        if not list_cols and not dict_cols: break  # Done
    return df


In [16]:
# Step 1: Normalize top-level JSON
df = pd.json_normalize(nested_json, sep='_')

# Step 2: Recursively flatten it
flat_df = recursive_normalize(df)

In [17]:
# Step 3: Show result
flat_df

Unnamed: 0,user_id,name,contact_email,contact_phone,address_street,address_city,address_zip,orders_order_id,orders_order_date,orders_items_item_id,orders_items_item_name,orders_items_price
0,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1001,2023-01-01,A1,Laptop,800
1,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1001,2023-01-01,A2,Mouse,25
2,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1002,2023-01-10,B1,Phone,500
3,101,Alice,alice@example.com,123-456-7890,123 Maple St.,Springfield,12345,1002,2023-01-10,B2,Charger,30
