# Data Ingestion Notebook

This notebook ingests data from one of several alternative sources and runs initial quality checks on it. It covers:
- Reddit API extraction
- Loading from local CSV files
- Loading from S3
- Data quality checks

## Objectives
1. Extract data from the Reddit API
2. Load existing data files
3. Perform initial data quality checks
4. Save data in standardized format

In [None]:
# Import libraries
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))

from src.ingestion.reddit_extractor import RedditExtractor
from src.processing.data_validator import DataValidator
from src.utils.config import get_config
from src.utils.logger import get_logger

# Logger for this notebook, obtained from the project's get_logger helper
logger = get_logger(__name__)
# Visible confirmation that every import above completed without raising
print("Libraries imported successfully")

## Option 1: Extract from Reddit API

In [None]:
# Extractor object used for all Reddit API calls in this cell
extractor = RedditExtractor()

# Communities to pull posts from
subreddits = ['science', 'politics', 'technology', 'relationships']

# Options forwarded to the batch extraction call: sort='top' with
# time_filter='all', and a limit of 1000 posts per subreddit
extraction_options = dict(time_filter='all', limit_per_subreddit=1000, sort='top')
df_reddit = extractor.extract_posts_batch(subreddits=subreddits, **extraction_options)

print(f"Extracted {len(df_reddit)} posts")
# Last top-level expression of the cell, so Jupyter renders the preview
df_reddit.head()

## Option 2: Load from Local CSV File

In [None]:
def load_latest_csv(directory, pattern="*.csv"):
    """Load the most recently modified CSV file found directly in ``directory``.

    Args:
        directory: Folder to search (non-recursive glob).
        pattern: Glob pattern selecting the candidate files.

    Returns:
        A DataFrame read from the newest matching file (judged by filesystem
        modification time), or None when no file matches.
    """
    csv_files = list(Path(directory).glob(pattern))
    if not csv_files:
        return None

    # Newest file wins, judged by filesystem modification time
    latest_file = max(csv_files, key=lambda p: p.stat().st_mtime)
    print(f"Loading: {latest_file}")
    frame = pd.read_csv(latest_file)
    print(f"Loaded {len(frame)} rows from {latest_file.name}")
    return frame


# Load from local CSV file
data_path = Path("../data/output")

# Always (re)assign df_local so a re-run with no files cannot leave a stale
# frame from an earlier execution in the kernel namespace.
df_local = load_latest_csv(data_path)

if df_local is None:
    print("No CSV files found in data/output directory")

# Jupyter only renders a bare expression when it is the last top-level
# statement of the cell, so the preview lives here instead of inside a branch.
df_local.head() if df_local is not None else None

## Option 3: Load from S3

In [None]:
def latest_s3_key(pages):
    """Return the key of the most recently modified object across listing pages.

    Args:
        pages: Iterable of ``list_objects_v2`` response pages (dicts). Pages
            without a ``'Contents'`` entry are treated as empty.

    Returns:
        The ``'Key'`` of the object with the greatest ``'LastModified'``, or
        None when no object is listed. Keys ending in ``/`` (folder
        placeholder objects) are skipped because they hold no data.
    """
    candidates = [
        entry
        for page in pages
        for entry in page.get('Contents', [])
        if not entry['Key'].endswith('/')
    ]
    if not candidates:
        return None
    return max(candidates, key=lambda entry: entry['LastModified'])['Key']


# Load from S3 (if configured)
# Reset first so a failed run never leaves a stale frame from an earlier execution.
df_s3 = None
try:
    # boto3 is optional: it is imported here so that a missing package is
    # reported by the handler below instead of breaking the notebook.
    import boto3

    config = get_config()
    s3_client = boto3.client(
        's3',
        aws_access_key_id=config.aws.access_key_id,
        aws_secret_access_key=config.aws.secret_access_key,
        region_name=config.aws.region
    )

    bucket = config.aws.bucket_name
    prefix = "raw/reddit/"

    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page; otherwise the newest object may be missed on large prefixes.
    pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)
    latest_key = latest_s3_key(pages)

    if latest_key is None:
        print("No files found in S3")
    else:
        # Download and load
        obj = s3_client.get_object(Bucket=bucket, Key=latest_key)
        df_s3 = pd.read_csv(obj['Body'])
        print(f"Loaded {len(df_s3)} rows from S3: {latest_key}")
except Exception as e:
    # Deliberate best-effort: S3 is an optional source, so report and continue
    print(f"S3 loading not available: {str(e)}")

# Jupyter only renders a bare expression when it is the last top-level
# statement of the cell, so the preview lives here instead of inside the try.
df_s3.head() if df_s3 is not None else None

## Data Quality Validation

In [None]:
def select_loaded_frame(namespace, names=("df_reddit", "df_local", "df_s3")):
    """Pick the first loaded, non-empty dataframe from a namespace mapping.

    Args:
        namespace: Mapping of variable names to objects (e.g. ``globals()``).
        names: Candidate variable names, checked in priority order.

    Returns:
        The first candidate that exists, is not None and is not empty;
        None when no candidate qualifies.
    """
    for name in names:
        frame = namespace.get(name)
        if frame is not None and not frame.empty:
            return frame
    return None


# Use the first available dataframe, in priority order: df_reddit, df_local, df_s3.
# Names that were never defined (ingestion cell not run) are simply skipped.
df = select_loaded_frame(globals())

if df is not None:
    validator = DataValidator()
    validation_result = validator.validate(df)

    print(validation_result)
    print("\nValidation Statistics:")
    for key, value in validation_result.stats.items():
        print(f"{key}: {value}")
else:
    print("No data loaded. Please run one of the ingestion options above.")

## Save Processed Data

In [None]:
# Persist the selected dataframe as CSV for the next steps; nothing is
# written when no data was loaded (df is None).
if df is not None:
    processed_dir = Path("../data/processed")
    # Create the target folder (and any missing parents) before writing
    processed_dir.mkdir(parents=True, exist_ok=True)

    output_path = processed_dir / "ingested_data.csv"
    df.to_csv(output_path, index=False)

    print(f"Data saved to {output_path}", f"Shape: {df.shape}", sep="\n")