In [1]:
import json
import os
import re
from datetime import datetime, timezone

import pandas as pd

In [2]:
def clean_text(text):
    """Return ``text`` without URLs or punctuation and with whitespace collapsed.

    Anything that is not a ``str`` (for example ``None``) yields ``""``.
    Word characters, including underscores, are kept; every run of
    whitespace becomes a single space and the result is trimmed.
    """
    if isinstance(text, str):
        # Drop URL-like tokens first, while their punctuation is still intact.
        without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Keep only word characters and whitespace.
        words_only = re.sub(r'[^\w\s]', '', without_urls)
        # Squash whitespace runs (including newlines) and trim both ends.
        return re.sub(r'\s+', ' ', words_only).strip()
    return ""

In [3]:
def process_json_to_csv(json_path, output_folder):
    """Flatten one thread JSON file (post plus nested comments) into a CSV.

    The CSV is written to ``<output_folder>/<post id>.csv``. Its first row
    describes the post; every following row is one comment, in depth-first
    order, carrying ``parent_id`` (``None`` for top-level comments),
    ``depth`` and ``post_id``.

    Args:
        json_path: Path to a JSON file holding a ``post`` object and an
            optional ``comments`` list; comments may nest via ``replies``.
        output_folder: Directory that receives the CSV. It is not created
            here.

    Raises:
        KeyError: If ``post``, or an ``id`` / ``created_utc`` field of the
            post or of a kept comment, is missing.
    """
    # Read as UTF-8 explicitly: the platform default codec (e.g. cp1252 on
    # Windows) can fail on, or garble, non-ASCII text in the thread.
    with open(json_path, 'r', encoding='utf-8') as f:
        thread_data = json.load(f)

    def to_utc_string(epoch_seconds):
        # fromtimestamp(..., tz=timezone.utc) replaces utcfromtimestamp(),
        # which is deprecated since Python 3.12; the formatted text is the same.
        moment = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        return moment.strftime('%Y-%m-%d %H:%M:%S')

    post = thread_data['post']
    post_data = {
        'type': 'post',
        'id': post['id'],
        'author': post.get('author', '[deleted]'),
        'title': post.get('title', ''),
        'score': post.get('score', 0),
        'created_utc': to_utc_string(post['created_utc']),
        'url': post.get('url', ''),
        'num_comments': post.get('num_comments', 0),
        'subreddit': post.get('subreddit', ''),
        'body': clean_text(post.get('selftext', ''))
    }

    comments_list = []

    def flatten_comments(comments, parent_id=None, depth=0):
        """Append each kept comment to ``comments_list``, then recurse into its replies."""
        for comment in comments:
            # Deleted/removed comments are skipped together with their whole
            # reply subtree: `continue` bypasses the recursion below.
            if comment.get('author') == '[deleted]' or comment.get('body') in ('[deleted]', '[removed]'):
                continue
            comments_list.append({
                'type': 'comment',
                'id': comment['id'],
                'author': comment.get('author', '[deleted]'),
                'body': clean_text(comment.get('body', '')),
                'score': comment.get('score', 0),
                'created_utc': to_utc_string(comment['created_utc']),
                'parent_id': parent_id,
                'depth': depth,
                'post_id': post['id']
            })
            # A missing or empty 'replies' value ends this branch.
            if comment.get('replies'):
                flatten_comments(comment['replies'], parent_id=comment['id'], depth=depth + 1)

    flatten_comments(thread_data.get('comments', []))

    # Post row first, then comments; columns are the union of both dicts, so
    # fields that exist only on one kind of row are empty on the other.
    df = pd.DataFrame([post_data] + comments_list)
    csv_filename = os.path.join(output_folder, f"{post['id']}.csv")
    df.to_csv(csv_filename, index=False)
    print(f"Saved: {csv_filename}")

In [4]:
def batch_process_json_folder(input_folder, output_folder):
    """Convert every ``.json`` file in ``input_folder`` to a CSV in ``output_folder``.

    Args:
        input_folder: Directory scanned (non-recursively) for files whose
            name ends with ``.json``; other entries are ignored.
        output_folder: Directory for the CSVs; created, including missing
            parents, if it does not exist yet.

    Any error raised while processing a file propagates and stops the batch.
    """
    # exist_ok=True replaces the separate exists() check, avoiding the
    # check-then-create race; it is a no-op when the folder is already there.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so processing and log order do not depend on the arbitrary
    # order in which os.listdir() returns entries.
    for filename in sorted(os.listdir(input_folder)):
        if filename.endswith('.json'):
            json_path = os.path.join(input_folder, filename)
            process_json_to_csv(json_path, output_folder)
    print(f"Processing complete! CSVs saved to: {output_folder}")


In [5]:
# ======== UPDATE THE BASE PATH BELOW ======== #
# Base data folder: replace with your own path (e.g., "C:/Users/Name/folder_data").
folder_data = "D:/AIML/reddit mental health/tf_env/data"
# Input folder (where the JSON files are stored).
folder_json = os.path.join(folder_data, "raw")
# Output folder (where the CSVs will be saved). Components are passed
# separately so os.path.join inserts one consistent separator between them.
folder_csv = os.path.join(folder_data, "processed", "clean_csv")
# ============================================ #

batch_process_json_folder(folder_json, folder_csv)

Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1ggfmyg.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1ggjfl9.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1ggnkgn.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1gh41pn.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1gh6wk4.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1ghc9ti.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1ghcfvs.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1ghchxr.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1ghcjv5.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1gik043.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1gin7az.csv
Saved: D:/AIML/reddit mental health/tf_env/data\processed/clean_csv\1gjer2l.csv
Saved: D:/AIML/reddit mental health/tf_e