# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# --- CONSOLE SETUP ---
# The final success message contains a non-ASCII check mark, so stdout is
# switched to UTF-8; line buffering makes each progress message appear as soon
# as it is printed rather than when the buffer fills.
# `reconfigure` only exists on io.TextIOWrapper. Notebook kernels and some
# redirected streams replace sys.stdout with objects that lack it, so the call
# is guarded instead of crashing before any work is done.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8', line_buffering=True)

# --- CONFIGURATION ---
# Input: trade log CSV (read below for entry_time, exit_time and pnl_points).
TRADE_LOG = "results/trade_log.csv"
# Input: market feature CSV keyed by a `datetime` column.
FEATURE_FILE = "data/nifty_regimes_5min.csv"
# Output: plain-text insights summary.
OUTPUT_REPORT = "results/outlier_insights.txt"
# Output: directory that receives the generated PNG charts.
PLOT_DIR = "plots"

print("--- STARTING FINAL ANALYSIS (Task 6.1 - 6.3) ---")

# 1. LOAD DATA
print("1. Loading Data...")

# Market features: parse the timestamp column, keep the first row seen for
# each timestamp, then index the frame by time in chronological order
# (merge_asof further down needs a sorted right-hand key).
features = (
    pd.read_csv(FEATURE_FILE)
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .drop_duplicates(subset=['datetime'], keep='first')
    .set_index('datetime')
    .sort_index()
)

# Trade log: convert both timestamp columns to datetimes.
trades = pd.read_csv(TRADE_LOG)
for time_col in ('entry_time', 'exit_time'):
    trades[time_col] = pd.to_datetime(trades[time_col])

# 2. FEATURE ENGINEERING FOR ANALYSIS (Task 6.1)
print("2. Calculating Analysis Features (EMA Gap, Time, etc)...")

# Winning trades at or above this many points are treated as data glitches
# and excluded from the analysis.
MAX_REALISTIC_PNL_POINTS = 2000

# Enrich the trade log with the market state at entry. merge_asof tolerates
# timestamps that do not line up exactly, but requires the left frame to be
# sorted on its key.
trades = trades.sort_values('entry_time')

# Columns to pull from the market data
cols_to_merge = ['average_iv', 'regime', 'delta', 'gamma', 'pcr_oi', 'ema_5', 'ema_15', 'close_spot']

# Each trade receives the most recent feature row at or before its entry time;
# if no such row exists within 15 minutes the merged columns are left as NaN.
merged = pd.merge_asof(
    trades,
    features[cols_to_merge],
    left_on='entry_time',
    right_index=True,
    direction='backward',
    tolerance=pd.Timedelta("15min")
)

# A. Calculate EMA Gap (Task 6.1)
merged['ema_gap'] = merged['ema_5'] - merged['ema_15']

# B. Calculate Time of Day (Task 6.1)
merged['entry_hour'] = merged['entry_time'].dt.hour

# C. Calculate Duration (Task 6.1)
merged['duration_minutes'] = (merged['exit_time'] - merged['entry_time']).dt.total_seconds() / 60

# Keep profitable trades only (Task 6.1), dropping unrealistic wins in the same
# step. Both conditions are applied in one mask and copied once, so that
# `profitable` is an independent frame and later column assignments do not
# trigger SettingWithCopyWarning.
is_win = merged['pnl_points'] > 0
is_realistic = merged['pnl_points'] < MAX_REALISTIC_PNL_POINTS
profitable = merged[is_win & is_realistic].copy()

if profitable.empty:
    print("CRITICAL: No profitable trades found to analyze.")
    # Exit with a non-zero status so callers can tell the analysis did not run.
    sys.exit(1)

# 3. IDENTIFY OUTLIERS (Z-Score > 3)
print("3. Identifying 3-Sigma Outliers...")
Z_SCORE_THRESHOLD = 3

mean_pnl = profitable['pnl_points'].mean()
std_pnl = profitable['pnl_points'].std()

# With a single winning trade the sample std is NaN, and with identical PnLs it
# is 0; dividing by either would make every z-score NaN and silently drop all
# trades from both groups. In that case nothing can be an outlier, so every
# trade gets a z-score of 0.
if np.isfinite(std_pnl) and std_pnl > 0:
    z_scores = (profitable['pnl_points'] - mean_pnl) / std_pnl
else:
    z_scores = pd.Series(0.0, index=profitable.index)

# assign() returns a new frame, so this is safe even if `profitable` is a slice.
profitable = profitable.assign(z_score=z_scores)

# The two groups are exact complements: every winning trade lands in one.
is_outlier = profitable['z_score'] > Z_SCORE_THRESHOLD
outliers = profitable[is_outlier]
normal_wins = profitable[~is_outlier]

print(f"   Total Wins: {len(profitable)}")
print(f"   Outliers Detected: {len(outliers)}")

# 4. GENERATE VISUALIZATIONS (Task 6.2)
print("4. Generating Task 6.2 Visualizations...")

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_DIR, exist_ok=True)

# A. Scatter Plot (PnL vs Duration)
plt.figure(figsize=(10, 6))
plt.scatter(normal_wins['duration_minutes'], normal_wins['pnl_points'], alpha=0.5, label='Normal Wins', color='blue')
plt.scatter(outliers['duration_minutes'], outliers['pnl_points'], alpha=0.9, label='Outliers (>3σ)', color='red', s=100, edgecolor='black')
plt.title("Scatter: PnL vs Duration")
plt.xlabel("Duration (Minutes)")
plt.ylabel("PnL (Points)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(PLOT_DIR, "outlier_scatter_pnl_duration.png"))

# B. Box Plots (Feature Distributions)
# Side-by-side comparison of IV and EMA Gap for normal wins vs outliers.
plt.figure(figsize=(14, 6))
boxplot_panels = [
    ('average_iv', "Feature Dist: Average IV"),
    ('ema_gap', "Feature Dist: EMA Gap at Entry"),
]
for panel_no, (column, panel_title) in enumerate(boxplot_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.boxplot([normal_wins[column].dropna(), outliers[column].dropna()])
    # Boxes are drawn at x = 1, 2 by default. Labelling them through xticks
    # avoids boxplot's `labels` keyword, which Matplotlib 3.9 deprecated in
    # favour of `tick_labels`, and works on older and newer versions alike.
    plt.xticks([1, 2], ['Normal', 'Outliers'])
    plt.title(panel_title)
plt.savefig(os.path.join(PLOT_DIR, "outlier_boxplots.png"))

# C. Correlation Heatmap
plt.figure(figsize=(10, 8))
# Numeric features included in the correlation matrix
corr_cols = ['pnl_points', 'duration_minutes', 'average_iv', 'pcr_oi', 'ema_gap', 'delta', 'entry_hour']
corr_matrix = profitable[corr_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap (Profitable Trades)")
plt.savefig(os.path.join(PLOT_DIR, "outlier_heatmap.png"))

# D. Time Distribution
plt.figure(figsize=(10, 6))
# Bin edges 9..16 give one bar per hour from 9 through 15. The last histogram
# bin is closed on both sides, so stopping the edges at 15 would fold hour-15
# entries into the hour-14 bar.
hour_bins = range(9, 17)
plt.hist([normal_wins['entry_hour'], outliers['entry_hour']],
         bins=hour_bins, stacked=False, color=['blue', 'red'], label=['Normal', 'Outlier'], alpha=0.7)
plt.title("Time of Day Distribution")
plt.xlabel("Hour of Day (9 = 9 AM)")
plt.ylabel("Number of Trades")
plt.legend()
plt.savefig(os.path.join(PLOT_DIR, "outlier_time_distribution.png"))

# 5. GENERATE INSIGHTS SUMMARY (Task 6.3)
print("5. Writing Insights Report...")

avg_norm_pnl = normal_wins['pnl_points'].mean() if not normal_wins.empty else 0
avg_out_pnl = outliers['pnl_points'].mean() if not outliers.empty else 0
outlier_pct = (len(outliers) / len(profitable)) * 100

# Most common regime per group. mode() ignores NaN and returns an empty Series
# both for an empty group and for a group whose regime values are all missing
# (e.g. no feature row matched at entry), so test the result rather than the
# group before taking the first entry.
outlier_regime_modes = outliers['regime'].mode()
normal_regime_modes = normal_wins['regime'].mode()
outlier_regime = outlier_regime_modes.iloc[0] if not outlier_regime_modes.empty else "N/A"
normal_regime = normal_regime_modes.iloc[0] if not normal_regime_modes.empty else "N/A"

# Means of empty or all-NaN columns format as "nan" in the report below.
report = f"""
TASK 6.3: INSIGHTS SUMMARY
==========================

1. OUTLIER PREVALENCE
   - What percentage are outliers? {outlier_pct:.2f}%
   - Count: {len(outliers)} outliers out of {len(profitable)} profitable trades.

2. PnL COMPARISON
   - Average Normal PnL: {avg_norm_pnl:.2f} points
   - Average Outlier PnL: {avg_out_pnl:.2f} points
   - Impact: Outliers generate {avg_out_pnl/avg_norm_pnl if avg_norm_pnl else 0:.1f}x more profit per trade.

3. REGIME PATTERNS
   - Outlier Dominant Regime: {outlier_regime} (+1=Up, -1=Down)
   - Normal Dominant Regime: {normal_regime}
   - Insight: Do outliers happen in the same trend direction as normal trades?

4. TIME-OF-DAY PATTERNS
   - (See outlier_time_distribution.png)
   - Average Entry Hour (Outliers): {outliers['entry_hour'].mean():.1f}
   - Check if outliers cluster around Market Open (9-10 AM) or Close.

5. IV CHARACTERISTICS
   - Avg IV (Outliers): {outliers['average_iv'].mean():.2f}
   - Avg IV (Normal): {normal_wins['average_iv'].mean():.2f}
   - Insight: Higher IV usually implies higher premiums and potential for bigger moves.

6. DISTINGUISHING FEATURES (Correlation Analysis)
   - Duration: Correlation with PnL = {corr_matrix.loc['pnl_points', 'duration_minutes']:.2f}
   - EMA Gap: Correlation with PnL = {corr_matrix.loc['pnl_points', 'ema_gap']:.2f}
"""

# Write with an explicit encoding so the file does not depend on the
# platform's locale default.
with open(OUTPUT_REPORT, "w", encoding="utf-8") as f:
    f.write(report)

print(report)
print(f"✔ SUCCESS! Task 6 Complete. Report saved to {OUTPUT_REPORT}")