In [1]:
import csv
import os
import json
from tqdm import tqdm

# Load every metadata record into memory first so tqdm can report a total.
with open('metadata.csv', encoding='utf-8') as f_in:
    reader = csv.DictReader(f_in)
    rows = list(reader)

# Enrich each record with the concatenated body text of its parsed PDF JSON files.
for row in tqdm(rows, desc="Processing rows"):
    path_field = row['pdf_json_files']

    # An empty field means no full text is available for this record.
    json_paths = path_field.split('; ') if path_field else []

    paragraphs = []
    for json_path in json_paths:
        with open(json_path, encoding='utf-8') as f_json:
            parsed_doc = json.load(f_json)

        # Collect the 'text' of every entry under 'body_text', in file order.
        paragraphs.extend(entry['text'] for entry in parsed_doc['body_text'])

    # Store all paragraphs as one space-separated string (empty if none found).
    row['body_text'] = " ".join(paragraphs)

# Write the results to a new CSV file.
# Single source of truth for the destination, so the confirmation message
# below always names the file that was actually written.
output_path = 'output4.csv'

# Column order follows the first row's keys. 'body_text' is stored on every
# row before this point, so it is normally already among those keys; append it
# only when missing. Adding it unconditionally duplicates the column in the
# header and in every data row.
# With no rows at all, fall back to a header containing only 'body_text'
# instead of failing with IndexError on rows[0].
fieldnames = list(rows[0].keys()) if rows else []
if 'body_text' not in fieldnames:
    fieldnames.append('body_text')

# newline='' lets the csv module control line endings itself.
with open(output_path, 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)

    # Write the header
    writer.writeheader()

    # Write the rows
    for row in tqdm(rows, desc="Writing to CSV"):
        writer.writerow(row)

print("Data has been successfully written to '{}'.".format(output_path))


Processing rows: 100%|█████████████████████████████████████████████████████████| 35094/35094 [00:08<00:00, 3925.89it/s]
Writing to CSV: 100%|██████████████████████████████████████████████████████████| 35094/35094 [00:07<00:00, 4805.70it/s]

Data has been successfully written to 'output.csv'.



