In [1]:
import os
import pandas as pd
import csv
import re

import unicodedata

# Input directory that is scanned for files ending in '.parquet'.
# NOTE(review): the timestamped folder name looks specific to one run —
# confirm it points at the intended artifacts before executing.
parquet_dir = './output/20240727-225047/artifacts'
# Output directory; each converted file is written here with a '.csv' name.
csv_dir = './csv'

# Matches either a UTF-16 surrogate pair spelled as two literal \uXXXX escapes
# (groups 1 and 2) or a single literal \uXXXX escape (group 3). The pair
# alternative comes first so that both halves are consumed together.
_UNICODE_ESCAPE_RE = re.compile(
    r'\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})'
    r'|\\u([0-9a-fA-F]{4})'
)


def _decode_unicode_escape(match):
    r"""Return the text that replaces one matched \uXXXX escape (or pair).

    A high/low surrogate pair is combined into the single code point it
    represents. A lone surrogate is left in its escaped form, because a lone
    surrogate character cannot be encoded as UTF-8 when the CSV is written.
    """
    high, low, single = match.groups()
    if high is not None:
        code_point = (
            0x10000
            + ((int(high, 16) - 0xD800) << 10)
            + (int(low, 16) - 0xDC00)
        )
        return chr(code_point)

    code_point = int(single, 16)
    if 0xD800 <= code_point <= 0xDFFF:
        return match.group(0)
    return chr(code_point)


# Function to clean and properly format the string fields
def clean_quotes(value):
    r"""Clean a single cell value before it is written to CSV.

    Non-string values are returned unchanged. For strings, in order:

    1. Literal \uXXXX escape sequences are decoded to the characters they
       denote (surrogate pairs are merged into one character).
    2. Leading/trailing whitespace is stripped and every double quote is
       removed.
    3. A single trailing backslash is dropped.
    4. If the result contains a comma, it is wrapped in double quotes.

    Args:
        value: The cell value; any type is accepted.

    Returns:
        The cleaned string, or ``value`` itself when it is not a ``str``.
    """
    if not isinstance(value, str):
        return value

    # Convert Unicode escape sequences to the actual characters
    value = _UNICODE_ESCAPE_RE.sub(_decode_unicode_escape, value)

    # Strip surrounding whitespace and remove all double quotes
    value = value.strip().replace('"', '')

    # Drop a trailing backslash *before* quoting; once the value is wrapped
    # it ends with a quote and the backslash could no longer be detected.
    if value.endswith('\\'):
        value = value[:-1]

    # Wrap fields containing commas in double quotes. All quotes were removed
    # above, so a comma is the only remaining trigger.
    # NOTE(review): a CSV writer that quotes fields itself will escape these
    # added quotes, so they end up as literal characters in the parsed value —
    # confirm the downstream consumer expects that.
    if ',' in value:
        value = f'"{value}"'

    return value

# Convert all Parquet files to CSV

# Make sure the output directory exists; to_csv does not create it.
os.makedirs(csv_dir, exist_ok=True)

# Sort the listing so files are processed in a reproducible order.
for file_name in sorted(os.listdir(parquet_dir)):
    # Split off the real extension instead of replacing '.parquet' anywhere
    # in the name, so only the suffix is changed.
    base_name, extension = os.path.splitext(file_name)
    if extension != '.parquet':
        continue

    parquet_file = os.path.join(parquet_dir, file_name)
    csv_file = os.path.join(csv_dir, base_name + '.csv')

    # Load the Parquet file
    df = pd.read_parquet(parquet_file)

    # Clean quotes in string fields (clean_quotes passes non-strings through)
    for column in df.select_dtypes(include=['object']).columns:
        df[column] = df[column].apply(clean_quotes)

    # Save to CSV, quoting every non-numeric field
    df.to_csv(csv_file, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding='utf-8')
    print(f"Converted {parquet_file} to {csv_file} successfully.")

print("All Parquet files have been converted to CSV.")

Converted ./output/20240727-225047/artifacts\create_base_documents.parquet to ./csv\create_base_documents.csv successfully.
Converted ./output/20240727-225047/artifacts\create_base_entity_graph.parquet to ./csv\create_base_entity_graph.csv successfully.
Converted ./output/20240727-225047/artifacts\create_base_extracted_entities.parquet to ./csv\create_base_extracted_entities.csv successfully.
Converted ./output/20240727-225047/artifacts\create_base_text_units.parquet to ./csv\create_base_text_units.csv successfully.
Converted ./output/20240727-225047/artifacts\create_final_communities.parquet to ./csv\create_final_communities.csv successfully.
Converted ./output/20240727-225047/artifacts\create_final_community_reports.parquet to ./csv\create_final_community_reports.csv successfully.
Converted ./output/20240727-225047/artifacts\create_final_documents.parquet to ./csv\create_final_documents.csv successfully.
Converted ./output/20240727-225047/artifacts\create_final_entities.parquet to ./