In [None]:
from google.colab import drive
drive.mount('/content/drive')

import json
import os


In [None]:
# Root folder of the historical Reddit data on the Drive mounted at /content/drive.
drive_base_path = '/content/drive/MyDrive/IS450 Project/Data/Historical Reddit/'

# Raw dumps are read from one subfolder of the root; filtered results are
# written to a sibling subfolder.
input_dir, output_dir = (
    os.path.join(drive_base_path, subfolder)
    for subfolder in ('Raw Submissions', 'Filtered Posts')
)


In [None]:
def filter_reddit_posts(input_file, output_file, min_score=10):
    """
    Filter a line-delimited JSON dump of Reddit posts into a JSON array file.

    A post is kept only when its score is strictly greater than ``min_score``
    and its selftext is meaningful: non-empty after stripping whitespace and
    not the "[removed]" / "[deleted]" placeholder (compared case-insensitively).
    Each kept post is reduced to the fields id, created_utc, score, title,
    selftext (unstripped) and a full permalink URL.

    Args:
        input_file: Path to the input file, one JSON object per line.
            Blank lines are ignored.
        output_file: Path of the JSON file to write. Its parent directory is
            created when missing; a bare filename is written to the current
            working directory.
        min_score: Exclusive lower bound on a post's score.

    Returns:
        None. Progress, per-line warnings and errors are reported with
        ``print``; failures are reported the same way instead of being raised,
        so a caller looping over many files keeps going.
    """
    placeholder_texts = ("[removed]", "[deleted]")
    filtered_posts = []
    posts_read = 0

    print(f"Filtering {input_file}...")
    if not os.path.exists(input_file):
        print(f"Error: Input file not found: {input_file}")
        return

    # Phase 1: read and filter. Kept separate from the write phase so that an
    # error raised while writing is never reported as an input-file problem.
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # A blank line is not a post and not corruption.
                posts_read += 1
                try:
                    post = json.loads(line)  # Each line is one JSON document
                    score = post.get("score", 0)
                    raw_selftext = post.get("selftext", "")
                    # A JSON null selftext arrives as None; treat it as empty
                    # instead of failing on None.strip().
                    selftext = (raw_selftext or "").strip()

                    if (score is not None and score > min_score and
                            selftext and
                            selftext.lower() not in placeholder_texts):
                        filtered_posts.append({
                            "id": post.get("id", "N/A"),
                            "created_utc": post.get("created_utc", 0),
                            "score": score,
                            "title": post.get("title", ""),
                            "selftext": raw_selftext,  # Keep original (unstripped) selftext
                            "permalink": f"https://www.reddit.com{post.get('permalink', '')}"
                        })
                except json.JSONDecodeError:
                    print(f"Warning: Skipping corrupted JSON line {line_number} in {input_file}")
                except Exception as e:
                    # Best effort: one malformed record (e.g. a non-object
                    # line or a non-numeric score) must not abort the file.
                    print(f"Warning: Error processing line {line_number} in {input_file}: {e}")
    except FileNotFoundError:
        # The file disappeared between the existence check and the open.
        print(f"Error: Input file not found during open: {input_file}")
        return
    except Exception as e:
        print(f"An unexpected error occurred while processing {input_file}: {e}")
        return

    # Phase 2: write the result.
    try:
        parent_dir = os.path.dirname(output_file)
        # dirname is '' for a bare filename; os.makedirs('') would raise, and
        # the current directory already exists, so only create real parents.
        if parent_dir and not os.path.exists(parent_dir):
            print(f"Creating output directory: {parent_dir}")
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_file, "w", encoding="utf-8") as f_out:
            # Save as a JSON array (list of objects) instead of line-delimited
            json.dump(filtered_posts, f_out, indent=2, ensure_ascii=False)

        print(f"✅ Read {posts_read} posts. Filtered {len(filtered_posts)} meaningful posts saved to {output_file}\n")
    except Exception as e:
        print(f"Error: Could not write filtered posts to {output_file}: {e}")


In [None]:
# Ensure output directory exists. exist_ok=True keeps the call from failing if
# the folder appears between the check and the creation.
if not os.path.exists(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created output directory: {output_dir}")

# Add the base names of your raw submission files here
files_to_process = [
    "stocks_submissions",
    "StockMarket_submissions",
    "ValueInvesting_submissions",
    "investing_submissions",
    "Bogleheads_submissions",
    "options_submissions",
    "CryptoCurrency_submissions"
]

# You can adjust min_score per subreddit if needed, or use a default
default_min_score = 10
score_overrides = {
    "investing_submissions": 50,
    "CryptoCurrency_submissions": 50
}

# Extensions tried, in order, when locating each raw file; '' matches a file
# stored without any extension. Add more if needed.
possible_extensions = ['', '.json', '.ndjson', '.jsonl']

print("Starting filtering process...")
print(f"Input directory: {input_dir}")
print(f"Output directory: {output_dir}")

for base_filename in files_to_process:
    input_file_path = None
    for ext in possible_extensions:
        potential_path = os.path.join(input_dir, f"{base_filename}{ext}")
        # isfile rather than exists: with the '' extension, a directory named
        # like the base filename would otherwise be picked as the input and
        # hide a real data file next to it.
        if os.path.isfile(potential_path):
            input_file_path = potential_path
            break  # Found the file

    if input_file_path is None:
        print(f"Warning: Input file for '{base_filename}' not found in {input_dir}. Skipping.")
        continue

    # Construct output path; output is standardized to .json whatever the input extension was
    output_file_path = os.path.join(output_dir, f"filtered_{base_filename}.json")

    # Per-file threshold when one is configured, otherwise the default
    min_score = score_overrides.get(base_filename, default_min_score)

    # Run the filter function
    filter_reddit_posts(input_file_path, output_file_path, min_score=min_score)

print("Filtering complete.")
