# Stock Sentiment Analysis - Preprocessing Pipeline

This notebook performs the preprocessing pipeline for stock sentiment analysis:
1. Database Connection - Connect to Oracle database and fetch the data
2. Ticker Data Loading - Fetch and load NASDAQ/NYSE ticker symbols
3. Data Harmonization - Merge posts and comments into a unified schema. (This is the approach for now. Linking comments to their posts via primary and foreign keys is not hard, but a ticker mentioned in a post is not necessarily the one discussed in the comments under it. For now the gold standard is therefore to take sentiment (later) only from comments/posts that directly contain the ticker(s). We need a meeting on this.)
4. Data Cleaning - Remove invalid, deleted, and short texts (plain cleanup)
5. Feature Engineering - Add temporal and engagement features (various time slices etc., for future needs)
6. Ticker Detection - Identify stock ticker mentions with high precision 
7. Text Normalization - Prepare text for sentiment analysis
8. Export Results - Save processed data for further analysis

In [1]:
# Imports and Configuration
import pandas as pd
import numpy as np
import logging
import os
import re
import time
from datetime import datetime
from typing import List, Set, Dict, Any
from tqdm import tqdm
from io import StringIO
import urllib.request
import urllib.parse

# Optional dependencies
# Each probe below sets a module-level *_AVAILABLE flag. ORACLE_AVAILABLE gates the
# database import in Step 1; SPACY_AVAILABLE / NLTK_AVAILABLE pick the stopword
# remover in Step 7. DOTENV_AVAILABLE is recorded but not read in the visible cells.

# python-dotenv: load_dotenv() runs as soon as the package is importable, so the
# os.getenv() lookups for the database credentials can be served from a .env file.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    DOTENV_AVAILABLE = False
else:
    DOTENV_AVAILABLE = True

# Oracle driver
try:
    import oracledb
except ImportError:
    ORACLE_AVAILABLE = False
    print("Oracle DB not available. Jeste nejsme cooked -> pip install oracledb")
else:
    ORACLE_AVAILABLE = True

# NLP libraries
try:
    import nltk
except ImportError:
    NLTK_AVAILABLE = False
    print("NLTK not available -> pip install nltk")
else:
    NLTK_AVAILABLE = True
    print("NLTK rdy")

try:
    import spacy
except ImportError:
    SPACY_AVAILABLE = False
    print("spaCy not available -> pip install spacy")
else:
    SPACY_AVAILABLE = True
    print("spaCy rdy")

# Import functions from utils
from utils import (
    get_oracle_connection,
    get_all_us_tickers,
    detect_tickers_in_text,
    apply_ticker_detection,
    harmonize_schema,
    drop_invalid_texts,
    deduplicate_and_normalize_types,
    add_temporal_features,
    add_engagement_features,
    apply_text_normalization,
    remove_financial_stopwords,
    remove_stopwords_spacy
)

# Configure logging
# logging.basicConfig() silently does nothing when the root logger already has a
# handler (for example one installed by an earlier import or a previous run of this
# cell). force=True (Python 3.8+) removes existing root handlers first, so the
# format below is actually applied and the cell is idempotent on re-run.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    force=True,
)
logger = logging.getLogger(__name__)

# Configuration constants
# Passed as min_len to drop_invalid_texts() in Step 4.
MIN_TEXT_LENGTH = 10
# NOTE(review): not referenced by any visible cell; unit is presumably seconds - confirm or remove.
RETRY_DELAY = 10

print("Imports loaded successfully letzgooo!")

NLTK rdy
spaCy rdy
Imports loaded successfully letzgooo!
spaCy rdy
Imports loaded successfully letzgooo!


In [2]:
# Step 1: Database Connection and Data Loading
#
# Produces two module-level frames, df_posts and df_comments. Both are always
# defined after this cell: they stay empty when the driver or credentials are
# missing, when the connection fails, or when either query raises.

print("STEP 1: Importing Reddit Data from Oracle Database")

# Start from empty frames so every failure path below needs no extra bookkeeping.
df_posts = pd.DataFrame()
df_comments = pd.DataFrame()

# The Oracle driver must be importable and all three credential variables set.
# bool() keeps this a plain flag instead of holding the last env value (the DSN).
oracle_credentials_available = bool(
    ORACLE_AVAILABLE
    and os.getenv('db-username')
    and os.getenv('db-password')
    and os.getenv('db-dsn')
)

if oracle_credentials_available:
    print("Oracle credentials found pojdme se pripojit :)")

    conn = get_oracle_connection()

    if conn:
        print("Database connection successful letzgoo 🚀")

        # Import data from database
        print("Importing Reddit data from existing tables...")

        # DBMS_LOB.SUBSTR(body, 4000, 1) reads at most the first 4000 characters of body.
        # NOTE(review): ROWNUM <= 5000 without ORDER BY caps the row count but does not
        # define which rows are returned - confirm an arbitrary sample is intended.
        query_posts = """
                SELECT 
                    author, title, created_utc, id, is_original_content,
                    score, DBMS_LOB.SUBSTR(body, 4000, 1) as body, 
                    subreddit, upvote_ratio, url
                FROM historical_posts 
                WHERE ROWNUM <= 5000
            """
        query_comments = """
                SELECT 
                    author, created_utc, id, parent_post_id, score,
                    DBMS_LOB.SUBSTR(body, 4000, 1) as body,
                    subreddit
                FROM historical_comments 
                WHERE ROWNUM <= 5000
            """

        try:
            try:
                posts_loaded = pd.read_sql_query(query_posts, conn)
                comments_loaded = pd.read_sql_query(query_comments, conn)
            finally:
                # Release the connection right after the reads, on success and on
                # failure alike. A close error is reported instead of being silently
                # swallowed, and it never masks the original query error.
                try:
                    conn.close()
                    print("Database connection closed")
                except Exception as close_error:
                    print(f"Warning: could not close database connection: {close_error}")

            # Publish the frames only once both queries have succeeded, so a failure
            # in the second query cannot leave posts loaded without comments.
            df_posts, df_comments = posts_loaded, comments_loaded

            print(f"\nPosts imported: {df_posts.shape}")
            print(f"Comments imported: {df_comments.shape}")

            if len(df_posts) > 0:
                print(f"Posts columns: {list(df_posts.columns)}")

            if len(df_comments) > 0:
                print(f"Comments columns: {list(df_comments.columns)}")

        except Exception as e:
            print(f"Error importing data from database: {e}")

    else:
        print("Failed to connect to database (tak to je v pici - check logs)")

else:
    print("Oracle database credentials not configured")
    print("Skipping database import...")

print(f"\nStep 1 Complete: Loaded {len(df_posts)} posts and {len(df_comments)} comments. Letzgoo")

STEP 1: Importing Reddit Data from Oracle Database
Oracle credentials found pojdme se pripojit :)
Oracle connection successful!
Database connection successful letzgoo 🚀
Importing Reddit data from existing tables...
Oracle connection successful!
Database connection successful letzgoo 🚀
Importing Reddit data from existing tables...


  df_posts = pd.read_sql_query(query_posts, conn)
  df_comments = pd.read_sql_query(query_comments, conn)
  df_comments = pd.read_sql_query(query_comments, conn)


Database connection closed

Posts imported: (5000, 10)
Comments imported: (5000, 7)
Posts columns: ['AUTHOR', 'TITLE', 'CREATED_UTC', 'ID', 'IS_ORIGINAL_CONTENT', 'SCORE', 'BODY', 'SUBREDDIT', 'UPVOTE_RATIO', 'URL']
Comments columns: ['AUTHOR', 'CREATED_UTC', 'ID', 'PARENT_POST_ID', 'SCORE', 'BODY', 'SUBREDDIT']

Step 1 Complete: Loaded 5000 posts and 5000 comments. Letzgoo


In [3]:
# Step 2: Ticker Data Loading
#
# Produces tickers_df: loaded from the local CSV cache when present, otherwise
# fetched through get_all_us_tickers() and written to the cache. tickers_df is
# always a DataFrame after this cell (possibly empty).
us_tickers_path = "us_tickers.csv"


def load_cached_tickers(path):
    """Read the cached ticker CSV at ``path`` and normalize it.

    Column names are stripped, lower-cased and have spaces replaced by underscores.
    When a ``ticker`` column exists, its values are upper-cased and stripped, and
    rows without a symbol are dropped.

    Only empty fields are treated as missing. pandas' default NA parsing would
    also turn literal symbols such as "NA" into NaN, and a following astype(str)
    would then produce a bogus "NAN" ticker.

    Raises whatever pandas.read_csv raises for an unreadable or malformed file.
    """
    cached = pd.read_csv(path, dtype=str, keep_default_na=False, na_values=[""])
    cached.columns = [c.strip().lower().replace(" ", "_") for c in cached.columns]
    if 'ticker' in cached.columns:
        cached['ticker'] = cached['ticker'].str.upper().str.strip()
        has_symbol = cached['ticker'].notna() & (cached['ticker'] != "")
        cached = cached[has_symbol].reset_index(drop=True)
    return cached


if os.path.exists(us_tickers_path):
    print(f"Loading cached ticker data from {us_tickers_path}")
    try:
        tickers_df = load_cached_tickers(us_tickers_path)
        print(f"Loaded {len(tickers_df)} cached tickers (zatim dobry)")
    except Exception as e:
        print(f"Error loading cached data (ou nou): {e}")
        tickers_df = None
else:
    print("Fetching fresh ticker data")
    tickers_df = None

# No usable cache: fetch fresh data and cache it for the next run.
if tickers_df is None or len(tickers_df) == 0:
    try:
        tickers_df = get_all_us_tickers()
        if len(tickers_df) > 0:
            tickers_df.to_csv(us_tickers_path, index=False)
            print(f"Fetched and cached {len(tickers_df)} US tickers (so far so good)")
        else:
            print("No ticker data retrieved (fuck)")
    except Exception as e:
        print(f"Error fetching ticker data (fuck): {e}")
        tickers_df = pd.DataFrame()

if len(tickers_df) > 0:
    print(f"\nTicker data summary:")
    print(f"  Total tickers: {len(tickers_df)}")

    # The summary must not crash on a cache file that lacks one of these columns.
    if 'exchange' in tickers_df.columns:
        print(f"  Exchanges: {tickers_df['exchange'].value_counts().to_dict()}")
    else:
        print("  Exchanges: n/a (no 'exchange' column)")

    # Sanity check: a handful of well-known symbols should be present.
    major_tickers = {'AAPL', 'TSLA', 'MSFT', 'AMZN', 'GOOGL', 'NVDA', 'META'}
    if 'ticker' in tickers_df.columns:
        found_major = set(tickers_df['ticker']) & major_tickers
    else:
        found_major = set()
    print(f"  Major tickers found: {found_major}")

else:
    print("No ticker data available (well fuck)")

print(f"\nStep 2 Complete: Loaded {len(tickers_df)} ticker symbols letzgoo 🚀")

Loading cached ticker data from us_tickers.csv
Loaded 8022 cached tickers (zatim dobry)

Ticker data summary:
  Total tickers: 8022
  Exchanges: {'NASDAQ': 5142, 'NYSE': 2880}
  Major tickers found: {'AMZN', 'GOOGL', 'TSLA', 'NVDA', 'AAPL', 'META', 'MSFT'}

Step 2 Complete: Loaded 8022 ticker symbols letzgoo 🚀


In [4]:
# Step 3: Data Harmonization - merge posts and comments into one frame
# (flat for now: no post/comment hierarchy is built here)

if len(df_posts) == 0 and len(df_comments) == 0:
    # Nothing was loaded in Step 1; keep df_unified defined for the later steps.
    print("No data to harmonize")
    df_unified = pd.DataFrame()
else:
    # harmonize_schema (from utils) combines both frames into a single one.
    df_unified = harmonize_schema(df_posts, df_comments)
    print(f"Unified dataframe shape: {df_unified.shape}")
    print(f"Unified columns: {list(df_unified.columns)}")

    # Row count per value of the 'type' column
    type_counts = df_unified['type'].value_counts()
    print("\nData distribution:")
    for row_type, n_rows in type_counts.items():
        print(f"  {row_type}: {n_rows} rows")

print(f"\nStep 3 Complete: Unified {len(df_unified)} rows. Fuck yeaaah 🚀")

  unified = pd.concat([posts, comments], ignore_index=True, sort=False)
INFO:utils:Harmonized schema. Unified dataframe shape: (10000, 12)


Unified dataframe shape: (10000, 12)
Unified columns: ['author', 'title', 'created_utc', 'id', 'is_original_content', 'score', 'text', 'subreddit', 'upvote_ratio', 'url', 'parent_post_id', 'type']

Data distribution:
  post: 5000 rows
  comment: 5000 rows

Step 3 Complete: Unified 10000 rows. Fuck yeaaah 🚀


In [5]:
# Step 4: Data Cleaning
# Runs the two utils cleaning passes on df_unified and reports how many rows each
# pass leaves. The result is df_cleaned (an empty frame when there is no input).

if len(df_unified) == 0:
    print("No data to clean (jsme v prdeli)")
    df_cleaned = pd.DataFrame()
else:
    print(f"Starting with {len(df_unified)} rows")

    # 4a: drop invalid texts, using the minimum length from the config cell
    df_cleaned = drop_invalid_texts(df_unified, min_len=MIN_TEXT_LENGTH)
    dropped_invalid = len(df_unified) - len(df_cleaned)
    print(f"    After removing invalid texts: {len(df_cleaned)} rows (-{dropped_invalid})")

    # 4b: deduplicate and normalize column types
    df_cleaned = deduplicate_and_normalize_types(df_cleaned)
    print(f"    After deduplication: {len(df_cleaned)} rows (tohle by melo byt idealne kladny cislo u know)")

    if len(df_cleaned) == 0:
        print("No data remaining after cleaning - tohle neni uplne dobry :)")
    else:
        # Overall effect of both passes
        removed_total = len(df_unified) - len(df_cleaned)
        removed_pct = removed_total / len(df_unified) * 100
        print(f"  Original rows: {len(df_unified)}")
        print(f"  Cleaned rows: {len(df_cleaned)}")
        print(f"  Removed: {removed_total} ({removed_pct:.1f}%)")

        # Row count per value of the 'type' column after cleaning
        print("\nData types after cleaning:")
        type_counts = df_cleaned['type'].value_counts()
        for row_type, n_rows in type_counts.items():
            print(f"  {row_type}: {n_rows} rows")

print(f"\nStep 4 Complete: {len(df_cleaned)} clean rows")

INFO:utils:Dropped invalid/short texts. Remaining rows: 6829
INFO:utils:Deduplicated and normalized types.
INFO:utils:Deduplicated and normalized types.


Starting with 10000 rows
    After removing invalid texts: 6829 rows (-3171)
    After deduplication: 6829 rows (tohle by melo byt idealne kladny cislo u know)
  Original rows: 10000
  Cleaned rows: 6829
  Removed: 3171 (31.7%)

Data types after cleaning:
  comment: 4677 rows
  post: 2152 rows

Step 4 Complete: 6829 clean rows


In [None]:
# Step 5: Feature Engineering
# Applies the temporal and engagement feature helpers from utils to df_cleaned and
# prints summary statistics for whichever of the expected columns are present.

if len(df_cleaned) == 0:
    print("No data for feature engineering (upsis)")
    df_features = pd.DataFrame()
else:
    df_features = add_engagement_features(add_temporal_features(df_cleaned))

    print(f"Enhanced dataframe shape: {df_features.shape}")

    # Columns this step is expected to add; only those actually present are reported.
    new_features = ['date', 'hour', 'day_of_week', 'month', 'is_weekend',
                    'text_length', 'word_count', 'score_log1p']
    present_features = [name for name in new_features if name in df_features.columns]
    print(f"\nNew features added: {present_features}")

    # min / mean / max for the two length-style features
    for column, label in (('text_length', 'Text length'), ('word_count', 'Word count')):
        if column in df_features.columns:
            print(f"  {label}: min={df_features[column].min()}, "
                  f"mean={df_features[column].mean():.1f}, "
                  f"max={df_features[column].max()}")

    if 'day_of_week' in df_features.columns:
        day_counts = df_features['day_of_week'].value_counts()
        print(f"  Day distribution: {day_counts.to_dict()}")

    if 'is_weekend' in df_features.columns:
        # mean of the is_weekend column, shown as a percentage
        weekend_pct = df_features['is_weekend'].mean() * 100
        print(f"  Weekend posts: {weekend_pct:.1f}%")

print(f"\nStep 5 Complete: {len(df_features)} rows with enhanced features")

In [None]:
# Step 6: Ticker Detection
# Runs apply_ticker_detection (utils) over df_features with the symbols in
# tickers_df and reports how many rows mention at least one ticker, based on the
# 'n_tickers' column of the result. The result is df_with_tickers (empty when
# either input is empty).

if len(df_features) > 0 and len(tickers_df) > 0:
    print(f"Detecting tickers in {len(df_features)} texts using {len(tickers_df)} symbols")

    df_with_tickers = apply_ticker_detection(df_features, tickers_df)

    # Summary counts
    total_rows = len(df_with_tickers)
    ticker_stats = df_with_tickers['n_tickers'].value_counts().sort_index()
    total_with_tickers = (df_with_tickers['n_tickers'] > 0).sum()
    # Guard the percentage in case detection returns no rows at all.
    pct_with_tickers = total_with_tickers / total_rows * 100 if total_rows else 0.0
    print(f"  Total rows: {total_rows}")
    print(f"  Rows with tickers: {total_with_tickers} ({pct_with_tickers:.1f}%)")
    print(f"  Rows without tickers: {total_rows - total_with_tickers}")

    print(f"\nTicker count distribution:")
    for count, rows in ticker_stats.head(10).items():
        print(f"  {count} tickers: {rows} rows")

    # Show if any tickers were found
    ticker_examples = df_with_tickers[df_with_tickers['n_tickers'] > 0]
    if len(ticker_examples) > 0:
        print(f"\nTicker detection successful: Found {len(ticker_examples)} rows with ticker mentions")
    else:
        # Zero detections say nothing about precision: the detector may just as well
        # be failing. Report it neutrally and check the detector on known inputs.
        print(f"\nNo tickers detected - the data may contain no mentions, or detection may be failing")
        print("Testing detection with synthetic examples:")

        # Known ticker-rich texts plus one negative example, as a sanity check
        ticker_set = set(tickers_df['ticker'])
        test_texts = [
            "I'm buying $AAPL and TSLA today",
            "MSFT and GOOGL are performing well",
            "Just some random text without tickers"
        ]

        for test_text in test_texts:
            detected = detect_tickers_in_text(test_text, ticker_set)
            print(f"    '{test_text}' → {detected}")

else:
    print("No data or tickers available for detection")
    df_with_tickers = pd.DataFrame()

print(f"\nStep 6 Complete: Processed {len(df_with_tickers)} rows for ticker detection")

In [None]:
# Step 7: Text Normalization and Stopword Removal
# Normalizes df_with_tickers into df_final, then strips stopwords from the
# 'sentiment_ready_text' column with the best remover available.

if len(df_with_tickers) == 0:
    print("No data for text normalization")
    df_final = pd.DataFrame()
else:
    # Basic text normalization (creates the 'sentiment_ready_text' column)
    df_final = apply_text_normalization(df_with_tickers, keep_tickers=True)

    # Pick the stopword remover: spaCy first, then NLTK, then the built-in fallback.
    # The NLTK and built-in cases both go through remove_financial_stopwords.
    if SPACY_AVAILABLE:
        chosen_label, banner, stopword_remover = "spaCy", "Using spaCy", remove_stopwords_spacy
    elif NLTK_AVAILABLE:
        chosen_label, banner, stopword_remover = "NLTK", "Using NLTK", remove_financial_stopwords
    else:
        chosen_label = "Built-in"
        banner = "Using built-in stopword removal (lame as fuck)"
        stopword_remover = remove_financial_stopwords

    print(banner)
    df_final['sentiment_ready_text'] = df_final['sentiment_ready_text'].apply(
        lambda text: stopword_remover(text, preserve_tickers=True)
    )
    stopword_method = chosen_label

    print(f"Text normalization complete using {stopword_method}")
    print(f"Final dataframe shape: {df_final.shape}")
    print(f"Final columns: {list(df_final.columns)}")

    # Final statistics
    rows_with_tickers = (df_final['n_tickers'] > 0).sum()
    print("\nFinal dataset statistics:")
    print(f"  Total rows: {len(df_final)}")
    print(f"  Rows with tickers: {rows_with_tickers}")
    print(f"  Average original text length: {df_final['text_length'].mean():.1f} characters")
    print(f"  Average word count: {df_final['word_count'].mean():.1f} words")

    # Impact of processing: mean character length before vs. after
    avg_original_length = df_final['text'].str.len().mean()
    avg_sentiment_ready_length = df_final['sentiment_ready_text'].str.len().mean()
    reduction_pct = (avg_original_length - avg_sentiment_ready_length) / avg_original_length * 100

    print("\nimpact:")
    print(f"  Original text length: {avg_original_length:.1f} chars")
    print(f"  Sentiment-ready text length: {avg_sentiment_ready_length:.1f} chars")
    print(f"  Reduction from normalization: {reduction_pct:.1f}%")
    print(f"  Stopword removal method: {stopword_method}")

    if 'type' in df_final.columns:
        type_dist = df_final['type'].value_counts()
        print(f"  Content distribution: {type_dist.to_dict()}")

print(f"\nStep 7 Complete: {len(df_final)} rows ready for sentiment analysis (letzgoo 🚀)")

In [None]:
# Step 8: Export Sentiment-Ready Data
# Writes the subset of df_final columns listed below (those that exist) to a CSV
# file and prints a short summary of what was exported.

if len(df_final) == 0:
    print("No data to export (gg well played)")
else:
    print(f"Available columns: {list(df_final.columns)}")

    # Destination file
    output_file = "sentiment_ready_data.csv"

    # Columns wanted in the export, including exchange information
    sentiment_columns = [
        'id', 'text', 'sentiment_ready_text', 'type', 'subreddit',
        'created_utc', 'score', 'mentioned_tickers', 'n_tickers', 'ticker_exchanges',
        'text_length', 'word_count', 'date', 'hour', 'day_of_week',
    ]

    # Restrict to the columns df_final actually has
    available = set(df_final.columns)
    export_columns = [col for col in sentiment_columns if col in available]
    export_df = df_final[export_columns].copy()

    # Save to CSV
    export_df.to_csv(output_file, index=False)
    print(f"Exported {len(export_df)} rows to {output_file}")
    print(f"Exported columns: {export_columns}")

    # Exchange distribution over rows whose ticker_exchanges value is not ''
    if 'ticker_exchanges' in export_df.columns:
        non_blank = export_df['ticker_exchanges'] != ''
        exchange_dist = export_df.loc[non_blank, 'ticker_exchanges'].value_counts()
        print("\nExchange distribution (rows with tickers):")
        for exchange, count in exchange_dist.items():
            print(f"  {exchange}: {count} rows")

    print("\nData is ready for sentiment analysis")
    print("Use the 'sentiment_ready_text' column for sentiment modeling")
    print("Use the 'mentioned_tickers' column for ticker information")
    print("Use the 'ticker_exchanges' column for exchange information (NYSE/NASDAQ/BOTH)")

print("\nPreprocessing Pipeline Complete (letzgoo 🚀)")