In [9]:
import pandas as pd
import numpy as np
import json
import glob
import os

def to_serializable(obj):
    """Recursively convert NumPy containers and scalars into plain Python objects.

    The result is intended to be passed to ``json.dumps``.

    Args:
        obj: Any value. NumPy arrays, NumPy scalars, dicts, lists and tuples
            are converted recursively; everything else is returned unchanged.

    Returns:
        A structure in which arrays, lists and tuples have become lists,
        NumPy scalars have become native Python scalars, and dict values
        have been converted the same way (dict keys are left untouched).
    """
    if isinstance(obj, np.ndarray):
        # tolist() does not convert the elements of object-dtype arrays, so
        # nested arrays would survive it. Recurse into the result instead;
        # this also covers 0-d arrays, where tolist() returns a bare scalar.
        return to_serializable(obj.tolist())
    elif isinstance(obj, np.generic):
        # NumPy scalars such as np.int64 are not JSON serializable as-is.
        return obj.item()
    elif isinstance(obj, dict):
        return {k: to_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        # json.dumps encodes tuples as arrays anyway, so a list is equivalent.
        return [to_serializable(i) for i in obj]
    else:
        return obj
    
def remove_none_keys(value):
    """Return a copy of a dict without the entries whose value is None.

    Only the top level of the dict is filtered; nested containers are kept
    as they are. Any non-dict input is returned unchanged.
    """
    if not isinstance(value, dict):
        # Lists, strings, None, etc. pass straight through.
        return value

    kept = {}
    for key, item in value.items():
        if item is not None:
            kept[key] = item
    return kept

# Glob pattern matching the input parquet chunks
file_pattern = "dataset/articles_entertainment_step_*.parquet"

# Folder that receives the cleaned copies (created if it does not exist)
output_folder = "cleaned_parquet"
os.makedirs(output_folder, exist_ok=True)

# Columns that get their top-level None-valued dict entries dropped and are
# then stored as JSON strings
json_columns = ["coref_clusters", "person_descriptions"]

# Process each matching file
for filepath in glob.glob(file_pattern):
    print(f"Processing: {filepath}")

    # Load the parquet file
    df = pd.read_parquet(filepath)

    # Strip None entries, then JSON-encode. Both steps sit behind the same
    # membership check, so a chunk that lacks one of the columns is skipped
    # for that column instead of raising KeyError.
    for col in json_columns:
        if col in df.columns:
            df[col] = df[col].apply(
                lambda x: json.dumps(
                    to_serializable(remove_none_keys(x)), ensure_ascii=False
                )
            )

    # Save the updated DataFrame under the same file name in the output folder
    output_path = os.path.join(output_folder, os.path.basename(filepath))
    df.to_parquet(output_path, index=False)
    print(f"Saved cleaned file to: {output_path}")


Processing: dataset/articles_entertainment_step_1.parquet
Saved cleaned file to: cleaned_parquet/articles_entertainment_step_1.parquet
Processing: dataset/articles_entertainment_step_2.parquet
Saved cleaned file to: cleaned_parquet/articles_entertainment_step_2.parquet
Processing: dataset/articles_entertainment_step_3.parquet
Saved cleaned file to: cleaned_parquet/articles_entertainment_step_3.parquet
Processing: dataset/articles_entertainment_step_4.parquet
Saved cleaned file to: cleaned_parquet/articles_entertainment_step_4.parquet
Processing: dataset/articles_entertainment_step_5.parquet
Saved cleaned file to: cleaned_parquet/articles_entertainment_step_5.parquet
Processing: dataset/articles_entertainment_step_6.parquet
Saved cleaned file to: cleaned_parquet/articles_entertainment_step_6.parquet
Processing: dataset/articles_entertainment_step_7.parquet
Saved cleaned file to: cleaned_parquet/articles_entertainment_step_7.parquet
Processing: dataset/articles_entertainment_step_8.parqu

In [10]:
import pandas as pd
import glob
import os

# Folder holding the per-chunk cleaned parquet files
cleaned_dir = "cleaned_parquet"


def _step_number(path):
    """Return the integer that follows the last "_step_" in a chunk path."""
    tail = path.split("_step_")[-1]
    return int(tail.split(".")[0])


# Collect the cleaned chunks and order them numerically by step, so that
# step 10 comes after step 9 rather than after step 1
chunk_pattern = os.path.join(cleaned_dir, "articles_entertainment_step_*.parquet")
cleaned_files = sorted(glob.glob(chunk_pattern), key=_step_number)

# Read every chunk, in step order
dfs = []
for f in cleaned_files:
    print(f"Loading cleaned file: {f}")
    df_chunk = pd.read_parquet(f)
    dfs.append(df_chunk)

# Stack the chunks into one DataFrame with a fresh index
df_combined = pd.concat(dfs, ignore_index=True)

# Write the combined dataset next to the chunks
combined_path = os.path.join(cleaned_dir, "articles_entertainment_nlp.parquet")
df_combined.to_parquet(combined_path)
print(f"✅ Combined cleaned DataFrame saved to: {combined_path}")


Loading cleaned file: cleaned_parquet/articles_entertainment_step_1.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_2.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_3.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_4.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_5.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_6.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_7.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_8.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_9.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_10.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_11.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_12.parquet
Loading cleaned file: cleaned_parquet/articles_entertainment_step_13.parquet
Loading 