## Market Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# --- CONFIGURATION ---
# Path of the CSV file handed to analyze_market_regimes() below.
# Replace with your actual file path. A relative path is resolved against the
# current working directory of the running kernel/process, so it may need
# adjusting depending on where the notebook is launched from.
FILE_PATH = "../../../../data/processed/gdelt_ohlcv_join.csv" 

# Candidate source columns for every canonical column, in priority order.
# Only the first candidate present is renamed, so a file carrying both
# 'next_close' and 'Close' never ends up with two 'close' columns.
_COLUMN_CANDIDATES = {
    'date': ('price_date',),
    'close': ('next_close', 'Close'),
    'volume': ('next_volume', 'Volume'),
    'open': ('next_open', 'Open'),
}

# Ordered labels used for the per-ticker sentiment terciles.
_SENTIMENT_LABELS = ["Low", "Neutral", "High"]


def _standardize_columns(df):
    """Return *df* with the first available source column renamed to each canonical name."""
    rename_map = {}
    for target, candidates in _COLUMN_CANDIDATES.items():
        if target in df.columns:
            # Canonical column already present: renaming would create a duplicate.
            continue
        source = next((name for name in candidates if name in df.columns), None)
        if source is not None:
            rename_map[source] = target
    return df.rename(columns=rename_map)


def _bucket_sentiment(df):
    """Return per-ticker sentiment terciles as an ordered categorical aligned to *df*."""
    buckets = pd.Series(np.nan, index=df.index, dtype=object)
    for _, scores in df.groupby('ticker')['sentiment_score']:
        try:
            # qcut splits the ticker's own history into three equal-sized buckets.
            labelled = pd.qcut(scores, 3, labels=_SENTIMENT_LABELS).astype(object)
        except ValueError:
            # Raised when the quantile edges are not unique (e.g. all scores
            # identical); rows with a score are then treated as "Neutral".
            labelled = pd.Series("Neutral", index=scores.index, dtype=object)
            labelled = labelled.where(scores.notna())
        buckets.loc[labelled.index] = labelled
    return pd.Series(
        pd.Categorical(buckets, categories=_SENTIMENT_LABELS, ordered=True),
        index=df.index,
    )


def _format_table(table):
    """Render *table* as markdown, or as plain text when `tabulate` is unavailable."""
    try:
        return table.to_markdown()
    except ImportError:
        # to_markdown() needs the optional 'tabulate' package.
        return table.to_string()


def analyze_market_regimes(file_path):
    """Load a price/sentiment CSV, derive regime features and print a report.

    The CSV must provide a ticker column plus date, close and volume columns
    (either under those names or under the aliases in ``_COLUMN_CANDIDATES``).
    A ``sentiment_score`` column is optional.

    Columns added to the returned frame: ``daily_return``, ``abs_return``,
    ``volatility_proxy`` (20-row rolling std of returns, not annualized),
    ``avg_volume_20d``, ``rvol``, ``volume_bucket`` and, when sentiment is
    available, ``sentiment_bucket``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The enriched DataFrame, or ``None`` when the file is missing or a
        required column cannot be found.
    """
    print(f"Loading data from {file_path}...")

    # 1. Load Data
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found at {file_path}")
        return None

    # Map source-specific column names to the standard names used below.
    df = _standardize_columns(df)

    # VALIDATION: 'ticker' is required too, every groupby below depends on it.
    required_cols = ['date', 'close', 'volume', 'ticker']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        print(f"   CRITICAL ERROR: Could not find columns: {missing}")
        print(f"   Your available columns are: {df.columns.tolist()}")
        return None

    # Standardize Dates
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(['ticker', 'date'])

    print(f" Data Loaded Successfully: {len(df)} rows.")

    # NOTE(review): if the input holds several rows per (ticker, date) -- e.g.
    # one per news article -- consecutive rows share a price and the return /
    # rolling statistics below are distorted. Confirm one row per ticker-day.
    by_ticker = df.groupby('ticker')

    # Close-Close % Return
    df['daily_return'] = by_ticker['close'].pct_change()

    # Absolute Return (Magnitude of move, directionless)
    df['abs_return'] = df['daily_return'].abs()

    # 20-row rolling standard deviation of returns. This is the raw daily
    # figure; it is annualized (x sqrt(252)) only when reported below.
    df['volatility_proxy'] = by_ticker['daily_return'].transform(
        lambda returns: returns.rolling(window=20).std()
    )

    # RVOL = Today's Vol / 20-Day Average Vol.
    # Computed before the warm-up rows are dropped so that the volume window
    # does not discard a further 19 rows per ticker.
    df['avg_volume_20d'] = by_ticker['volume'].transform(
        lambda volume: volume.rolling(20).mean()
    )
    df['rvol'] = df['volume'] / df['avg_volume_20d']

    # Drop rows whose return/volatility could not be computed (warm-up rows).
    df = df.dropna(subset=['daily_return', 'volatility_proxy'])

    if df.empty:
        print(" Warning: Not enough rows per ticker to compute 20-day statistics. Skipping report.")
        return df

    # Sentiment Buckets
    # Logic: "High" is top 33%, "Low" is bottom 33% relative to ticker's history.
    if 'sentiment_score' in df.columns:
        df['sentiment_bucket'] = _bucket_sentiment(df)
    else:
        print(" Warning: 'sentiment_score' column missing. Skipping sentiment buckets.")

    # Volume Buckets: Low (< 0.8x), Normal (0.8x - 1.2x), High (> 1.2x)
    bins = [-np.inf, 0.8, 1.2, np.inf]
    labels = ['Low Vol', 'Normal Vol', 'High Vol']
    df['volume_bucket'] = pd.cut(df['rvol'], bins=bins, labels=labels)

    print("\n==================================================")
    print(" MARKET REGIME REPORT")
    print("==================================================")

    # 1. Volatility Inspection
    avg_vol = df.groupby('ticker')['volatility_proxy'].mean() * np.sqrt(252)  # Annualized
    print("\n1. Average Annualized Volatility:")
    print(_format_table(avg_vol))

    # 2. Return Distribution by Sentiment
    if 'sentiment_bucket' in df.columns:
        print("\n2.  Return Distribution by Sentiment Bucket:")
        # observed=False keeps empty buckets visible in the table.
        sent_stats = df.groupby('sentiment_bucket', observed=False)['daily_return'].agg(
            ['mean', 'std', 'count']
        )
        print(_format_table(sent_stats))

        high_mean = sent_stats['mean'].get('High', np.nan)
        low_mean = sent_stats['mean'].get('Low', np.nan)
        if pd.isna(high_mean) or pd.isna(low_mean):
            # An empty bucket has a NaN mean; comparing it would be meaningless.
            print("\n    Insight: Not enough data in the High/Low buckets to compare returns.")
        elif high_mean > low_mean:
            print("\n    Insight: 'High Sentiment' days have higher average returns.")
        else:
            print("\n    Insight: Sentiment does not clearly differentiate returns.")

    # 3. Return Distribution by Volume
    print("\n3.  Return Distribution by Volume Bucket:")
    vol_stats = df.groupby('volume_bucket', observed=False)['abs_return'].agg(['mean', 'count'])
    vol_stats.columns = ['Avg Absolute Move', 'Count']
    print(_format_table(vol_stats))

    print("\n   (Note: High Volume usually corresponds to larger Absolute Moves/Volatility)")

    # VISUALIZATION
    plot_distributions(df)

    return df

def plot_distributions(df):
    """Show a two-panel figure summarising returns by sentiment and by volume.

    Left panel (only when ``df`` has a ``sentiment_bucket`` column): box plot
    of ``daily_return`` per sentiment bucket, outliers hidden, with a dashed
    zero line. Right panel: bar chart of ``abs_return`` per ``volume_bucket``.

    Args:
        df: Frame providing ``volume_bucket`` and ``abs_return`` and,
            optionally, ``sentiment_bucket`` and ``daily_return``.
    """
    plt.figure(figsize=(14, 6))

    # Panel 1: Sentiment vs Returns
    has_sentiment = 'sentiment_bucket' in df.columns
    if has_sentiment:
        sentiment_ax = plt.subplot(1, 2, 1)
        sns.boxplot(
            data=df,
            x='sentiment_bucket',
            y='daily_return',
            hue='sentiment_bucket',
            palette="coolwarm",
            showfliers=False,
            legend=False,
            ax=sentiment_ax,
        )
        sentiment_ax.set_title("Does Sentiment Impact Daily Returns?")
        # Zero reference line separates up days from down days.
        sentiment_ax.axhline(0, color='black', linestyle='--')
        sentiment_ax.set_ylabel("Daily Return")

    # Panel 2: Volume vs Volatility (Absolute Return)
    volume_ax = plt.subplot(1, 2, 2)
    sns.barplot(
        data=df,
        x='volume_bucket',
        y='abs_return',
        hue='volume_bucket',
        palette="viridis",
        legend=False,
        ax=volume_ax,
    )
    volume_ax.set_title("Does Volume Impact Price Movement Size?")
    volume_ax.set_ylabel("Avg Absolute Return (Magnitude)")

    plt.tight_layout()
    plt.show()

# Entry point: run the analysis on FILE_PATH when executed directly (script or
# notebook cell). df_regime holds the returned frame, or None when
# analyze_market_regimes() returned early (file not found / missing columns).
if __name__ == "__main__":
    df_regime = analyze_market_regimes(FILE_PATH)

Loading data from ../../../data/processed/gdelt_ohlcv_join.csv...
File not found at ../../../data/processed/gdelt_ohlcv_join.csv
