# Reddit Data Exploration

Explore the Reddit/Pushshift domain data for NER training.

**Goals:**
- Load and inspect submissions and comments
- Analyze discourse patterns by subreddit
- Test entity extraction on political/news content

In [None]:
import sys
from pathlib import Path

sys.path.insert(0, str(Path('../corpus-core/src').resolve()))
sys.path.insert(0, str(Path('../pipelines/src').resolve()))

from corpus_core.loaders import ParquetLoader
import pandas as pd
import matplotlib.pyplot as plt

## Load Reddit Data

In [None]:
# Loader rooted at the datasets directory one level above the notebook.
loader = ParquetLoader(Path('../datasets'))

# Keep only the catalogue entries whose 'domain' field is 'reddit'.
reddit_datasets = [
    entry
    for entry in loader.list_datasets()
    if entry['domain'] == 'reddit'
]

print("Reddit datasets:")
for entry in reddit_datasets:
    dataset_name = entry['name']
    print(f"  - {dataset_name}")

In [None]:
# Load submissions (skipped with a hint when the dataset is not on disk).
if not loader.exists('reddit', 'reddit_submissions'):
    print("Run the Dagster pipeline first: dagster dev")
else:
    submissions_df = loader.read_pandas('reddit', 'reddit_submissions')
    print(f"Submissions: {len(submissions_df)} records")

    # Show the first ten rows, restricted to the columns used below.
    preview_columns = ['subreddit', 'title', 'score', 'num_comments']
    display(submissions_df[preview_columns].head(10))

In [None]:
# Load comments
if loader.exists('reddit', 'reddit_comments'):
    comments_df = loader.read_pandas('reddit', 'reddit_comments')
    print(f"Comments: {len(comments_df)} records")
    display(comments_df[['subreddit', 'body', 'score']].head())
else:
    # Mirror the submissions cell: say why nothing was loaded instead of
    # silently leaving comments_df undefined.
    print("Run the Dagster pipeline first: dagster dev")

## Analyze Subreddit Distribution

In [None]:
# Horizontal bar chart of the 20 subreddits with the most submissions.
# Skipped when the submissions cell did not define submissions_df.
if 'submissions_df' in dir():
    per_subreddit = submissions_df['subreddit'].value_counts()
    subreddit_counts = per_subreddit.head(20)

    subreddit_counts.plot(kind='barh', title='Submissions by Subreddit')
    plt.xlabel('Count')

    plt.tight_layout()
    plt.show()

In [None]:
# Score distribution
# Two side-by-side histograms; values above the cap are clipped to the cap
# before binning.
if 'submissions_df' in dir():
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # (column, clip ceiling, panel title, x-axis label), left panel first.
    panels = [
        ('score', 1000, 'Score Distribution (capped at 1000)', 'Score'),
        ('num_comments', 500, 'Comments per Post (capped at 500)', 'Number of Comments'),
    ]
    for ax, (column, cap, panel_title, x_label) in zip(axes, panels):
        submissions_df[column].clip(upper=cap).hist(ax=ax, bins=50)
        ax.set_title(panel_title)
        ax.set_xlabel(x_label)

    plt.tight_layout()
    plt.show()

## Analyze Content Patterns

In [None]:
# Content length by subreddit category
# Maps a category label to the lowercase subreddit names that belong to it.
TARGET_SUBREDDITS = {
    'political': ['politics', 'news', 'worldnews', 'geopolitics'],
    'finance': ['investing', 'stocks', 'wallstreetbets', 'business'],
    'science': ['science', 'technology', 'programming', 'machinelearning'],
}


def categorize_subreddit(sub):
    """Return the TARGET_SUBREDDITS category that lists *sub*.

    The comparison is case-insensitive. Names that appear in no category,
    and values that are not strings (e.g. ``None`` or ``NaN`` from a missing
    ``subreddit`` cell), map to ``'other'``.
    """
    if not isinstance(sub, str):
        # .lower() would raise AttributeError here and abort a whole
        # DataFrame.apply over one missing value.
        return 'other'

    sub_lower = sub.lower()
    for category, subs in TARGET_SUBREDDITS.items():
        if sub_lower in subs:
            return category
    return 'other'

# Adds 'category' and 'content_length' columns to submissions_df in place,
# then plots the mean content length per category.
if 'submissions_df' in dir():
    submissions_df['category'] = submissions_df['subreddit'].apply(categorize_subreddit)

    # Length of "title + space + body"; a missing body counts as empty text.
    body_text = submissions_df['selftext'].fillna('')
    combined_text = submissions_df['title'] + ' ' + body_text
    submissions_df['content_length'] = combined_text.str.len()

    mean_length = submissions_df.groupby('category')['content_length'].mean()
    mean_length.plot(kind='bar')
    plt.ylabel('Avg Content Length')
    plt.title('Content Length by Category')
    plt.tight_layout()
    plt.show()

## Test Entity Extraction

In [None]:
# Sample political post
# Prints the first submission in the 'political' category and leaves it in
# `sample` for the entity-extraction cell below.
if 'submissions_df' in dir():
    political = submissions_df[submissions_df['category'] == 'political']
    if len(political) > 0:
        sample = political.iloc[0]
        print(f"Subreddit: r/{sample['subreddit']}")
        print(f"Title: {sample['title']}")
        print(f"Score: {sample['score']}")
        print("-" * 50)
        # A missing selftext can surface as NaN, which is truthy but not
        # sliceable, so test for a non-empty string rather than truthiness.
        sample_body = sample['selftext']
        has_body = isinstance(sample_body, str) and sample_body != ''
        print(sample_body[:1000] if has_body else '[No body text]')

In [None]:
# Entity extraction with spaCy
# Runs the small English pipeline over the sampled post (title plus body,
# truncated to 2000 characters) and prints each entity with its label.
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')
except ImportError:
    print("spaCy not installed")
except OSError:
    # spacy.load raises OSError, not ImportError, when the model package
    # itself is missing.
    print("spaCy model 'en_core_web_sm' not found; "
          "install it with: python -m spacy download en_core_web_sm")
else:
    # Only setup failures are caught above; errors during extraction
    # surface normally instead of being reported as "not installed".
    if 'sample' in dir():
        selftext = sample['selftext']
        # A missing body (None/NaN) must not be rendered as the text "nan".
        body = selftext if isinstance(selftext, str) else ''
        text = f"{sample['title']}\n\n{body}"
        doc = nlp(text[:2000])

        print("\nExtracted entities:")
        for ent in doc.ents:
            print(f"  {ent.label_:10} | {ent.text}")

## Entity Type Analysis by Category

Expected entity types by subreddit category:

**Political/News:**
- PERSON: Politicians, public figures
- ORG: Political parties, government agencies
- GPE: Countries, states, cities
- DATE: Event dates, election dates

**Finance:**
- ORG: Companies, exchanges
- MONEY: Stock prices, investments
- PERCENT: Returns, growth rates

**Science/Tech:**
- ORG: Tech companies, research institutions
- PRODUCT: Technologies, software
- PERSON: Researchers, founders

In [None]:
# Compare entity distributions across categories
# For each category, counts spaCy entity labels over its first 50 posts
# (title plus body, truncated to 1000 characters) and shows the counts as a
# label-by-category table.
from collections import Counter

try:
    import spacy
    nlp = spacy.load('en_core_web_sm')
except ImportError:
    print("spaCy not installed")
except OSError:
    # spacy.load raises OSError, not ImportError, when the model package
    # itself is missing.
    print("spaCy model 'en_core_web_sm' not found; "
          "install it with: python -m spacy download en_core_web_sm")
else:
    # Only setup failures are caught above; errors during the analysis
    # surface normally instead of being reported as "not installed".
    if 'submissions_df' in dir():
        entity_stats = {}

        for category in ['political', 'finance', 'science']:
            cat_posts = submissions_df[submissions_df['category'] == category]
            if len(cat_posts) == 0:
                continue

            entity_counts = Counter()
            for _, row in cat_posts.head(50).iterrows():
                selftext = row['selftext']
                # A missing body (None/NaN) must not be rendered as "nan".
                body = selftext if isinstance(selftext, str) else ''
                text = f"{row['title']}\n{body}"
                doc = nlp(text[:1000])
                entity_counts.update(ent.label_ for ent in doc.ents)

            # Plain dict so the DataFrame constructor sees ordinary mappings.
            entity_stats[category] = dict(entity_counts)

        # Display comparison; labels absent from a category show as 0.
        stats_df = pd.DataFrame(entity_stats).fillna(0).astype(int)
        display(stats_df)

## Next Steps

1. Expand data collection (more subreddits, longer time range)
2. Filter high-quality posts (score threshold, length requirements)
3. Create category-specific NER models
4. Handle Reddit-specific entities (usernames, subreddits)