In [5]:
from utils import get_oracle_connection, fetch_historical_prices, check_market_moved_before_date, adjust_to_trading_day
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
conn = get_oracle_connection()

if not conn:
    raise ConnectionError("Failed to establish a connection to the Oracle database.")

query = """
SELECT *
FROM sentiment_signals
ORDER BY ticker
"""

df = pd.read_sql_query(query, conn)

conn.close()

Oracle connection successful!


  df = pd.read_sql_query(query, conn)


In [3]:
# Process signals and filter out those where market already moved
filtered_signals = []
current_ticker = None
price_data = None
failed_tickers = set()

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing signals"):
    ticker = row['TICKER']
    signal_date = adjust_to_trading_day(str(row['SIGNAL_DATE'])[:10])

    # Fetch new price data only when ticker changes
    if ticker != current_ticker:
        current_ticker = ticker
        if ticker in failed_tickers:
            price_data = None
        else:
            try:
                price_data = fetch_historical_prices(ticker, "2024-01-01", "2024-12-31")
            except Exception:
                failed_tickers.add(ticker)
                price_data = None

    # Skip if no price data available
    if price_data is None or price_data.empty:
        continue

    # Check if market moved before this signal date
    result = check_market_moved_before_date(price_data, signal_date)

    # If result is None or market did NOT move, keep the signal
    if result is None:
        continue

    if not result['market_moved_flag']:
        # Market didn't move - this is a valid signal to keep
        signal_row = row.to_dict()
        signal_row.update(result)
        filtered_signals.append(signal_row)

# Create final dataframe with filtered signals
df_filtered = pd.DataFrame(filtered_signals)
print(f"\nOriginal signals: {len(df)}")
print(f"Filtered signals (market didn't move): {len(df_filtered)}")
print(f"Failed tickers: {len(failed_tickers)}")

Processing signals:  20%|█▉        | 902/4604 [00:12<01:08, 54.44it/s] ERROR:yfinance:$BRK.A: possibly delisted; no timezone found
Processing signals:  20%|█▉        | 914/4604 [00:13<02:10, 28.24it/s]ERROR:yfinance:$BRK.A: possibly delisted; no timezone found
Processing signals:  20%|█▉        | 914/4604 [00:13<02:10, 28.24it/s]ERROR:yfinance:$BRK.B: possibly delisted; no timezone found
ERROR:yfinance:$BRK.B: possibly delisted; no timezone found
Processing signals:  25%|██▍       | 1148/4604 [00:20<01:38, 35.04it/s]ERROR:yfinance:$COLA: possibly delisted; no price data found  (1d 2024-01-01 -> 2024-12-31) (Yahoo error = "Data doesn't exist for startDate = 1704085200, endDate = 1735621200")
Processing signals:  25%|██▌       | 1158/4604 [00:20<01:54, 30.13it/s]ERROR:yfinance:$COLA: possibly delisted; no price data found  (1d 2024-01-01 -> 2024-12-31) (Yahoo error = "Data doesn't exist for startDate = 1704085200, endDate = 1735621200")
Processing signals:  83%|████████▎ | 3808/4604 [01:


Original signals: 4604
Filtered signals (market didn't move): 1810
Failed tickers: 0





In [6]:
# Export filtered signals to Oracle with quality gates
conn = get_oracle_connection()

if conn:
    cursor = conn.cursor()

    # =========================================================================
    # QUALITY GATES - Validate data before export
    # =========================================================================
    print("=" * 60)
    print("APPLYING QUALITY GATES BEFORE EXPORT")
    print("=" * 60)

    initial_count = len(df_filtered)

    # Gate 1: Remove rows with null/invalid SIGNAL_DATE
    df_export = df_filtered[df_filtered['SIGNAL_DATE'].notna()].copy()
    gate1_removed = initial_count - len(df_export)
    print(f"Gate 1 (valid SIGNAL_DATE): Removed {gate1_removed} rows with null dates")

    # Gate 2: Ensure WINDOW_MENTIONS >= MIN_MENTIONS (5 is the threshold)
    MIN_MENTIONS_THRESHOLD = 5
    before_gate2 = len(df_export)
    df_export = df_export[df_export['WINDOW_MENTIONS'] >= MIN_MENTIONS_THRESHOLD]
    gate2_removed = before_gate2 - len(df_export)
    print(f"Gate 2 (window_mentions >= {MIN_MENTIONS_THRESHOLD}): Removed {gate2_removed} rows with insufficient mentions")

    # Gate 3: Ensure Z_SCORE is valid (not null, not inf)
    before_gate3 = len(df_export)
    df_export = df_export[
        df_export['Z_SCORE'].notna() &
        ~np.isinf(df_export['Z_SCORE'].astype(float))
    ]
    gate3_removed = before_gate3 - len(df_export)
    print(f"Gate 3 (valid Z_SCORE): Removed {gate3_removed} rows with null/inf z-score")

    # Gate 4: Ensure SIGNAL_TYPE is BUY or SELL
    before_gate4 = len(df_export)
    df_export = df_export[df_export['SIGNAL_TYPE'].isin(['BUY', 'SELL'])]
    gate4_removed = before_gate4 - len(df_export)
    print(f"Gate 4 (valid SIGNAL_TYPE): Removed {gate4_removed} rows with invalid signal type")

    # Gate 5: Remove duplicates (ticker, signal_date, signal_type)
    before_gate5 = len(df_export)
    df_export = df_export.drop_duplicates(subset=['TICKER', 'SIGNAL_DATE', 'SIGNAL_TYPE'], keep='first')
    gate5_removed = before_gate5 - len(df_export)
    print(f"Gate 5 (no duplicates): Removed {gate5_removed} duplicate (ticker, date, type) combinations")

    total_removed = initial_count - len(df_export)
    print(f"\n✓ QUALITY GATES COMPLETE: {len(df_export)}/{initial_count} signals passed ({total_removed} removed)")
    print("=" * 60)

    # Drop and recreate table with correct structure
    try:
        cursor.execute("DROP TABLE filtered_signals")
    except:
        pass

    # Create table with all necessary columns
    # Use SIGNAL_DIRECTION to match source data (not SENTIMENT_DIRECTION)
    create_sql = """
    CREATE TABLE filtered_signals (
        TICKER VARCHAR2(20),
        SIGNAL_DATE DATE,
        SIGNAL_TYPE VARCHAR2(10),
        SIGNAL_DIRECTION NUMBER(3,0),
        SENTIMENT_MEAN NUMBER(15,6),
        WINDOW_SENTIMENT NUMBER(15,6),
        WINDOW_MENTIONS NUMBER(15,2),
        Z_SCORE NUMBER(15,6),
        SIGNAL_SCORE NUMBER(15,6),
        TOTAL_UPVOTES NUMBER(15,6),
        TARGET_DATE VARCHAR2(20),
        PCT_CHANGE_3D NUMBER(15,6),
        PCT_3D_Z NUMBER(15,6),
        RET_Z NUMBER(15,6),
        VOL_Z NUMBER(15,6),
        VOL_EXPANSION NUMBER(15,6),
        ATR_14 NUMBER(15,6),
        ATR_MOVE NUMBER(15,6),
        MARKET_MOVED_FLAG NUMBER(1,0)
    )
    """
    cursor.execute(create_sql)
    print("Created filtered_signals table")

    # Map source columns to destination columns
    # Source: SIGNAL_DATE, TICKER, SENTIMENT_MEAN, TOTAL_UPVOTES, WINDOW_SENTIMENT,
    #         WINDOW_MENTIONS, Z_SCORE, SIGNAL_TYPE, SIGNAL_DIRECTION, SIGNAL_SCORE
    # Plus market move columns: target_date, pct_change_3d, pct_3d_z, ret_z, vol_z,
    #                          vol_expansion, atr_14, atr_move, market_moved_flag

    insert_sql = """
    INSERT INTO filtered_signals (
        TICKER, SIGNAL_DATE, SIGNAL_TYPE, SIGNAL_DIRECTION, SENTIMENT_MEAN,
        WINDOW_SENTIMENT, WINDOW_MENTIONS, Z_SCORE, SIGNAL_SCORE, TOTAL_UPVOTES,
        TARGET_DATE, PCT_CHANGE_3D, PCT_3D_Z, RET_Z, VOL_Z, VOL_EXPANSION,
        ATR_14, ATR_MOVE, MARKET_MOVED_FLAG
    ) VALUES (:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14, :15, :16, :17, :18, :19)
    """

    # Prepare data with correct column mapping (using df_export after quality gates)
    insert_data = []
    for _, row in df_export.iterrows():
        row_data = (
            str(row.get('TICKER', '')),
            pd.to_datetime(row.get('SIGNAL_DATE')).date() if pd.notna(row.get('SIGNAL_DATE')) else None,
            str(row.get('SIGNAL_TYPE', '')),
            int(row.get('SIGNAL_DIRECTION', 0)) if pd.notna(row.get('SIGNAL_DIRECTION')) else None,
            float(row.get('SENTIMENT_MEAN', 0)) if pd.notna(row.get('SENTIMENT_MEAN')) else None,
            float(row.get('WINDOW_SENTIMENT', 0)) if pd.notna(row.get('WINDOW_SENTIMENT')) else None,
            float(row.get('WINDOW_MENTIONS', 0)) if pd.notna(row.get('WINDOW_MENTIONS')) else None,
            float(row.get('Z_SCORE', 0)) if pd.notna(row.get('Z_SCORE')) else None,
            float(row.get('SIGNAL_SCORE', 0)) if pd.notna(row.get('SIGNAL_SCORE')) else None,
            float(row.get('TOTAL_UPVOTES', 0)) if pd.notna(row.get('TOTAL_UPVOTES')) else None,
            str(row.get('target_date', '')) if pd.notna(row.get('target_date')) else None,
            float(row.get('pct_change_3d', 0)) if pd.notna(row.get('pct_change_3d')) else None,
            float(row.get('pct_3d_z', 0)) if pd.notna(row.get('pct_3d_z')) else None,
            float(row.get('ret_z', 0)) if pd.notna(row.get('ret_z')) else None,
            float(row.get('vol_z', 0)) if pd.notna(row.get('vol_z')) else None,
            float(row.get('vol_expansion', 0)) if pd.notna(row.get('vol_expansion')) else None,
            float(row.get('atr_14', 0)) if pd.notna(row.get('atr_14')) else None,
            float(row.get('atr_move', 0)) if pd.notna(row.get('atr_move')) else None,
            1 if row.get('market_moved_flag') else 0
        )
        insert_data.append(row_data)

    cursor.executemany(insert_sql, insert_data)
    conn.commit()
    cursor.close()
    conn.close()

    print(f"Exported {len(insert_data)} rows to Oracle table filtered_signals")

    # Show distribution
    print(f"\nSignal type distribution in exported data:")
    print(df_filtered['SIGNAL_TYPE'].value_counts().to_string())
    print(f"\nSignal direction distribution:")
    print(df_filtered['SIGNAL_DIRECTION'].value_counts().to_string())
else:
    print("Database connection failed")

Oracle connection successful!
APPLYING QUALITY GATES BEFORE EXPORT
Gate 1 (valid SIGNAL_DATE): Removed 0 rows with null dates
Gate 2 (window_mentions >= 5): Removed 0 rows with insufficient mentions
Gate 3 (valid Z_SCORE): Removed 0 rows with null/inf z-score
Gate 4 (valid SIGNAL_TYPE): Removed 0 rows with invalid signal type
Gate 5 (no duplicates): Removed 0 duplicate (ticker, date, type) combinations

✓ QUALITY GATES COMPLETE: 1810/1810 signals passed (0 removed)
Created filtered_signals table
Exported 1810 rows to Oracle table filtered_signals

Signal type distribution in exported data:
SIGNAL_TYPE
SELL    1018
BUY      792

Signal direction distribution:
SIGNAL_DIRECTION
-1    1018
 1     792
Exported 1810 rows to Oracle table filtered_signals

Signal type distribution in exported data:
SIGNAL_TYPE
SELL    1018
BUY      792

Signal direction distribution:
SIGNAL_DIRECTION
-1    1018
 1     792
