In [11]:
"""
Commodities Trading Dashboard - Data Collection & Analysis
Author: Ondrej Marvan
Course: Python and SQL - Final Project

This notebook collects and analyzes data for 8 instruments:
- EUA (European Union Allowance) - EUR
- Crude Oil (WTI) - USD
- Natural Gas - USD
- Gold - USD
- Silver - USD
- Bitcoin - USD
- S&P 500 Index - USD
- NASDAQ 100 Index - USD

Data Period: January 1, 2025 - December 31, 2025 (Daily)
"""

# ============================================================================
# 1. IMPORT LIBRARIES
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import sqlite3
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
# Applies seaborn's "darkgrid" theme and makes 16x10 inches the default size
# for figures that are created without an explicit ``figsize``.
sns.set_style("darkgrid")
plt.rcParams['figure.figsize'] = (16, 10)

# ============================================================================
# 2. DATA COLLECTION
# ============================================================================

# Base path for data files
# The per-instrument file name is appended with plain string concatenation
# further down, so the trailing '/' is required.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only find its CSV files on a machine with this exact directory layout.
base_path = '/home/ondrej-marvan/Documents/GitHub/OBS_DataScience/OBS_DataScience/Autumn 2025/2400-DS1SQL Python and SQL intro, SQL platforms/Project/data/'

# Define instruments with their properties
# Keys are the short symbols stored in the 'Symbol' column of the loaded data.
# Each value holds:
#   'file'     - CSV file name, relative to base_path
#   'name'     - display name used in console tables and chart titles
#   'currency' - quote currency shown next to prices
#   'type'     - asset-class label written to the 'instruments' database table
INSTRUMENTS = {
    'EUA': {'file': 'EUA_daily.csv', 'name': 'EU Allowance', 'currency': 'EUR', 'type': 'Environmental'},
    'OIL': {'file': 'OIL_daily.csv', 'name': 'Crude Oil WTI', 'currency': 'USD', 'type': 'Energy'},
    'NG': {'file': 'NG_daily.csv', 'name': 'Natural Gas', 'currency': 'USD', 'type': 'Energy'},
    'GOLD': {'file': 'GOLD_daily.csv', 'name': 'Gold', 'currency': 'USD', 'type': 'Precious Metal'},
    'SILVER': {'file': 'SILVER_daily.csv', 'name': 'Silver', 'currency': 'USD', 'type': 'Precious Metal'},
    'BTC': {'file': 'BTC_daily.csv', 'name': 'Bitcoin', 'currency': 'USD', 'type': 'Cryptocurrency'},
    'SPX': {'file': 'S&P500_daily.csv', 'name': 'S&P 500', 'currency': 'USD', 'type': 'Equity Index'},
    'NDX': {'file': 'NASDAQ100_daily.csv', 'name': 'NASDAQ 100', 'currency': 'USD', 'type': 'Equity Index'}
}

def _parse_volume(raw):
    """Convert one volume cell (e.g. ``'1.5K'``, ``'2M'``, ``'-'``) to a number.

    Commas are stripped, then a ``B``/``M``/``K`` marker scales the value by
    1e9/1e6/1e3. Placeholders (``'-'``, empty, NaN) and unparseable text
    yield 0 so that the volume column is always numeric.
    """
    text = str(raw).upper().strip().replace(',', '')
    if text in ('-', '', 'NAN'):
        return 0
    try:
        # Checked in this order so the behaviour matches the original chain.
        for marker, factor in (('B', 1000000000), ('M', 1000000), ('K', 1000)):
            if marker in text:
                return float(text.replace(marker, '')) * factor
        return float(text)
    except ValueError:
        # Only a failed float() conversion is expected here; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed.
        return 0


def _to_number(column):
    """Parse a price column to floats after removing ',' thousands separators."""
    return pd.to_numeric(column.astype(str).str.replace(',', ''), errors='coerce')


def load_csv_data(filepath, symbol_name):
    """Load one instrument's CSV file and return it in a standard layout.

    The file is expected to have the columns ``Date``, ``Price``, ``Open``,
    ``High``, ``Low`` and ``Vol.`` (any other columns are ignored).

    Args:
        filepath: Path of the CSV file to read.
        symbol_name: Value written to the ``Symbol`` column of every row.

    Returns:
        A DataFrame sorted by date with the columns
        ``Date, Symbol, Open, High, Low, Price, Vol``. Unparseable dates and
        prices become NaT/NaN; missing Open/High/Low fall back to ``Price``.
        An empty DataFrame is returned (and a message printed) when the file
        is missing, a required column is absent, or any other error occurs.
    """
    try:
        # Read CSV file
        df = pd.read_csv(filepath)

        # Print actual columns found
        print(f"  ðŸ“‹ Columns in {symbol_name}: {list(df.columns)}")

        result_df = pd.DataFrame()
        result_df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

        # Price columns - values such as "1,234.56" use ',' as a thousands
        # separator, so the commas are removed before numeric conversion.
        for column in ('Price', 'Open', 'High', 'Low'):
            result_df[column] = _to_number(df[column])

        # Volume column - "Vol." may carry K/M/B suffixes or '-' placeholders
        result_df['Vol'] = df['Vol.'].apply(_parse_volume)

        result_df['Symbol'] = symbol_name
        result_df = result_df.sort_values('Date')

        # Fill any missing OHLC values from the closing price
        for column in ('Open', 'High', 'Low'):
            result_df[column] = result_df[column].fillna(result_df['Price'])
        result_df['Vol'] = result_df['Vol'].fillna(0)

        # Ensure High is highest, Low is lowest. Low is computed after High
        # has been corrected, so it sees the updated High column.
        ohlc_columns = ['Open', 'High', 'Low', 'Price']
        result_df['High'] = result_df[ohlc_columns].max(axis=1)
        result_df['Low'] = result_df[ohlc_columns].min(axis=1)

        valid_records = int(result_df['Price'].notna().sum())
        print(f"  âœ“ Loaded {valid_records:,} valid records for {symbol_name}")

        return result_df[['Date', 'Symbol', 'Open', 'High', 'Low', 'Price', 'Vol']]

    except FileNotFoundError:
        print(f"  âœ— File not found: {filepath}")
        return pd.DataFrame()
    except KeyError as e:
        # Raised by the column lookups above, i.e. after df has been read.
        print(f"  âœ— Column not found in {symbol_name}: {str(e)}")
        print(f"     Available columns: {list(df.columns)}")
        return pd.DataFrame()
    except Exception as e:
        # Deliberate best-effort boundary: one bad file must not stop the
        # remaining instruments from loading.
        print(f"  âœ— Error loading {symbol_name}: {str(e)}")
        print(f"     File: {filepath}")
        return pd.DataFrame()

# Load all data files
# Every instrument in INSTRUMENTS is read through load_csv_data; instruments
# whose file could not be loaded are remembered and reported, not fatal.
print("="*80)
print("LOADING DATA FROM CSV FILES")
print("Data Period: January 1, 2025 - December 31, 2025")
print("="*80 + "\n")

all_dataframes = []
failed_loads = []

for symbol, info in INSTRUMENTS.items():
    print(f"Loading {info['name']} ({symbol})...")
    df = load_csv_data(base_path + info['file'], symbol)
    if df.empty:
        failed_loads.append(symbol)
    else:
        all_dataframes.append(df)

# Without any data there is nothing to analyse: stop with a non-zero status.
# SystemExit is raised directly because exit() is an interactive convenience
# that is not guaranteed to exist and would report success (status 0).
if not all_dataframes:
    print("\nâœ— ERROR: No data was loaded. Please check file paths and formats.")
    raise SystemExit(1)

# Combine all data
all_data = pd.concat(all_dataframes, ignore_index=True)
print(f"\n{'='*80}")
print(f"âœ“ SUCCESSFULLY LOADED DATA")
print(f"{'='*80}")
print(f"Total records collected: {len(all_data):,}")
print(f"Instruments loaded: {all_data['Symbol'].nunique()}")
print(f"Date range: {all_data['Date'].min().date()} to {all_data['Date'].max().date()}")

if failed_loads:
    print(f"\nâš  Failed to load: {', '.join(failed_loads)}")

print(f"{'='*80}\n")

# ============================================================================
# 3. DATA CLEANING & PREPROCESSING
# ============================================================================

def clean_data(df):
    """Return a cleaned copy of the combined price frame.

    Steps, in order: drop rows without a date, drop repeated
    (Date, Symbol) pairs keeping the first, drop rows without a price,
    coerce column types, drop non-positive prices, then sort by symbol and
    date and renumber the index. Progress is printed along the way.
    """
    print("Cleaning data...")

    # Rows whose date could not be parsed
    rows_in = len(df)
    dated = df.dropna(subset=['Date'])
    print(f"  - Removed {rows_in - len(dated):,} rows with invalid dates")

    # Repeated observations for the same instrument and day
    deduplicated = dated.drop_duplicates(subset=['Date', 'Symbol'])
    duplicates_dropped = len(dated) - len(deduplicated)
    if duplicates_dropped > 0:
        print(f"  - Removed {duplicates_dropped:,} duplicate records")

    # Rows without a closing price
    missing_prices = deduplicated['Price'].isnull().sum()
    priced = deduplicated.dropna(subset=['Price'])
    if missing_prices > 0:
        print(f"  - Removed {missing_prices:,} rows with missing prices")

    # Enforce datetime / numeric dtypes
    conversions = {'Date': pd.to_datetime(priced['Date'])}
    for name in ('Open', 'High', 'Low', 'Price', 'Vol'):
        conversions[name] = pd.to_numeric(priced[name], errors='coerce')
    typed = priced.assign(**conversions)

    # A price of zero or below is treated as a data error
    bad_price_count = int((typed['Price'] <= 0).sum())
    positive = typed[typed['Price'] > 0]
    if bad_price_count > 0:
        print(f"  - Removed {bad_price_count:,} rows with invalid prices (<= 0)")

    cleaned = positive.sort_values(['Symbol', 'Date']).reset_index(drop=True)

    print(f"  âœ“ Cleaning complete: {len(cleaned):,} clean records\n")
    return cleaned

all_data = clean_data(all_data)

# Display sample data by instrument: one row per symbol showing the most
# recent observation (all_data is sorted by Symbol, then Date).
banner = "="*80
print(banner)
print("LATEST DATA BY INSTRUMENT")
print(banner)
print(f"{'Symbol':<8} {'Name':<20} {'Latest Price':<15} {'Currency':<8} {'Date':<12}")
print("-"*80)
for symbol in sorted(all_data['Symbol'].unique()):
    symbol_rows = all_data.loc[all_data['Symbol'] == symbol]
    if symbol_rows.empty:
        continue
    latest = symbol_rows.iloc[-1]
    info = INSTRUMENTS[symbol]
    price_str = f"{latest['Price']:.2f}"
    print(f"{symbol:<8} {info['name']:<20} {price_str:<15} {info['currency']:<8} {latest['Date'].date()}")
print(banner + "\n")

# ============================================================================
# 4. TECHNICAL INDICATORS CALCULATION
# ============================================================================

def calculate_sma(df, periods=None):
    """Add per-symbol simple moving averages of ``Price`` to *df*.

    Args:
        df: Frame with ``Symbol`` and ``Price`` columns; modified in place.
        periods: Iterable of window lengths. Defaults to ``[20, 50, 200]``.
            (``None`` is used as the default instead of a list literal so
            that no mutable object is shared between calls.)

    Returns:
        The same frame, with one ``SMA_<period>`` column per window. Because
        ``min_periods=1`` is used, early rows average over however many
        observations are available rather than being NaN.
    """
    if periods is None:
        periods = [20, 50, 200]
    print(f"Calculating Simple Moving Averages ({periods})...")

    # Group once; the grouping does not change between windows.
    prices_by_symbol = df.groupby('Symbol')['Price']
    for period in periods:
        df[f'SMA_{period}'] = prices_by_symbol.transform(
            lambda x, window=period: x.rolling(window=window, min_periods=1).mean()
        )
    return df

def calculate_bollinger_bands(df, period=20, std_dev=2):
    """Add per-symbol Bollinger Band columns to *df* and return it.

    ``BB_Middle`` is the rolling mean of ``Price`` over *period* rows,
    ``BB_Std`` the rolling standard deviation, and ``BB_Upper``/``BB_Lower``
    lie *std_dev* standard deviations above/below the middle band.
    """
    print(f"Calculating Bollinger Bands (period={period}, std_dev={std_dev})...")

    def _window(series):
        # min_periods=1: bands start on the first row instead of after `period` rows
        return series.rolling(window=period, min_periods=1)

    prices_by_symbol = df.groupby('Symbol')['Price']
    df['BB_Middle'] = prices_by_symbol.transform(lambda s: _window(s).mean())
    df['BB_Std'] = prices_by_symbol.transform(lambda s: _window(s).std())

    band_offset = std_dev * df['BB_Std']
    df['BB_Upper'] = df['BB_Middle'] + band_offset
    df['BB_Lower'] = df['BB_Middle'] - band_offset
    return df

def calculate_rsi(df, period=14):
    """Add a per-symbol ``RSI`` column (rolling-mean variant) to *df*.

    Average gain and average loss are plain rolling means of the positive
    and negative price changes over *period* rows; RSI = 100 - 100/(1 + RS)
    with RS = average gain / average loss.
    """
    print(f"Calculating RSI (period={period})...")

    def _rsi_for_symbol(prices):
        change = prices.diff()
        # Changes that are not gains (including the first, undefined one) count as 0
        up_moves = change.where(change > 0, 0)
        down_moves = change.where(change < 0, 0)
        avg_gain = up_moves.rolling(window=period, min_periods=1).mean()
        avg_down = down_moves.rolling(window=period, min_periods=1).mean()
        avg_loss = -avg_down  # flip sign so losses are positive magnitudes
        relative_strength = avg_gain / avg_loss
        return 100 - (100 / (1 + relative_strength))

    df['RSI'] = df.groupby('Symbol')['Price'].transform(_rsi_for_symbol)
    return df

def calculate_daily_returns(df):
    """Add ``Daily_Return``: the per-symbol row-over-row price change in percent."""
    print("Calculating daily returns...")
    fractional_change = df.groupby('Symbol')['Price'].pct_change()
    df['Daily_Return'] = fractional_change * 100
    return df

# Apply technical indicators
print("\n" + "="*80)
print("CALCULATING TECHNICAL INDICATORS")
print("="*80 + "\n")

# Each step receives the frame, adds its columns and returns the frame,
# so the steps chain in the same order as before.
all_data = (
    all_data
    .pipe(calculate_sma, [20, 50, 200])
    .pipe(calculate_bollinger_bands)
    .pipe(calculate_rsi)
    .pipe(calculate_daily_returns)
)

print("âœ“ All technical indicators calculated\n")

# ============================================================================
# 5. BACKWARDATION ANALYSIS
# ============================================================================

def calculate_contango_backwardation(df):
    """
    Add a contango/backwardation classification to *df* and return it.

    Columns added:
      Futures_Spread   - (Price - Open) / Open, in percent
      Spread_MA10      - per-symbol 10-row rolling mean of Futures_Spread
      Market_Structure - 'Contango' if Spread_MA10 > 0.5,
                         'Backwardation' if Spread_MA10 < -0.5, else 'Normal'

    NOTE(review): the close-vs-open move is used as a stand-in for the
    futures curve; no futures prices are involved in this calculation.
    """
    print("Calculating backwardation/contango indicators...")

    # Close vs Open, as a percentage of the open
    intraday_move = df['Price'] - df['Open']
    df['Futures_Spread'] = (intraday_move / df['Open']) * 100

    # Smooth the spread per instrument
    spread_by_symbol = df.groupby('Symbol')['Futures_Spread']
    df['Spread_MA10'] = spread_by_symbol.transform(
        lambda s: s.rolling(window=10, min_periods=1).mean()
    )

    def _classify(spread):
        # Anything inside the +/-0.5% band (or not comparable) is 'Normal'
        if spread > 0.5:
            return 'Contango'
        if spread < -0.5:
            return 'Backwardation'
        return 'Normal'

    df['Market_Structure'] = df['Spread_MA10'].apply(_classify)

    print("âœ“ Backwardation analysis complete\n")
    return df

section_rule = "="*80
print(section_rule)
print("BACKWARDATION/CONTANGO ANALYSIS")
print(section_rule + "\n")

# Adds Futures_Spread, Spread_MA10 and Market_Structure to the frame.
all_data = all_data.pipe(calculate_contango_backwardation)

# ============================================================================
# 6. SAVE TO SQL DATABASE
# ============================================================================

def save_to_database(df, db_name='commodities_trading.db'):
    """Write the analysis results to a SQLite database.

    Tables written (all but ``orders`` are replaced on every call):
      prices      - every row of *df*
      instruments - symbol, name, currency and type of each loaded instrument
      statistics  - per-symbol price/volume/return aggregates
      users       - three demo accounts used by the login screen
      orders      - created empty if it does not exist yet

    Args:
        df: Price frame containing at least ``Symbol``, ``Price``, ``Vol``
            and ``Daily_Return``.
        db_name: Path of the SQLite file to create or update.

    The connection is always closed, even when one of the writes fails.
    """
    print("="*80)
    print("SAVING DATA TO DATABASE")
    print("="*80 + "\n")

    conn = sqlite3.connect(db_name)
    try:
        # Main prices table
        print("Creating 'prices' table...")
        df.to_sql('prices', conn, if_exists='replace', index=False)
        print(f"  âœ“ Saved {len(df):,} records to 'prices' table")

        # Create instrument metadata table (only instruments present in df).
        # The symbol set is computed once instead of once per instrument, and
        # explicit columns keep the frame well-formed when nothing matches.
        print("\nCreating 'instruments' table...")
        loaded_symbols = set(df['Symbol'].unique())
        instruments_df = pd.DataFrame(
            [
                {'Symbol': k, 'Name': v['name'], 'Currency': v['currency'], 'Type': v['type']}
                for k, v in INSTRUMENTS.items() if k in loaded_symbols
            ],
            columns=['Symbol', 'Name', 'Currency', 'Type'],
        )
        instruments_df.to_sql('instruments', conn, if_exists='replace', index=False)
        print(f"  âœ“ Saved metadata for {len(instruments_df)} instruments")

        # Create aggregated statistics table
        print("\nCreating 'statistics' table...")
        stats_df = df.groupby('Symbol').agg({
            'Price': ['mean', 'std', 'min', 'max'],
            'Vol': 'sum',
            'Daily_Return': ['mean', 'std']
        }).reset_index()
        # Flatten the two-level column index produced by agg(); the order
        # follows the aggregation spec above.
        stats_df.columns = ['Symbol', 'Avg_Price', 'Price_Std', 'Min_Price', 'Max_Price',
                            'Total_Volume', 'Avg_Return', 'Return_Volatility']

        # Merge with instrument info
        stats_df = stats_df.merge(instruments_df[['Symbol', 'Currency', 'Type']], on='Symbol', how='left')
        stats_df.to_sql('statistics', conn, if_exists='replace', index=False)
        print(f"  âœ“ Saved statistics for {len(stats_df)} instruments")

        # Create users table for login
        # SECURITY NOTE(review): passwords are stored in plain text, and the
        # table is replaced (reset to these defaults) on every run. Left
        # unchanged here because the code that checks logins is not visible
        # from this file; hashing must be introduced on both sides together.
        print("\nCreating 'users' table...")
        users_df = pd.DataFrame({
            'username': ['admin', 'trader1', 'demo_user'],
            'password': ['admin123', 'trader123', 'demo123'],
            'role': ['admin', 'trader', 'demo']
        })
        users_df.to_sql('users', conn, if_exists='replace', index=False)
        print(f"  âœ“ Created {len(users_df)} user accounts")

        # Create orders table (kept across runs thanks to IF NOT EXISTS)
        print("\nCreating 'orders' table...")
        conn.execute('''
        CREATE TABLE IF NOT EXISTS orders (
            order_id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
            username TEXT,
            symbol TEXT,
            order_type TEXT,
            quantity REAL,
            price REAL,
            total_value REAL,
            status TEXT DEFAULT 'Pending'
        )
    ''')
        print("  âœ“ Orders table created")

        conn.commit()
    finally:
        # Release the file handle/lock whether or not the writes succeeded.
        conn.close()

    print(f"\n{'='*80}")
    print(f"âœ“ All data saved to '{db_name}'")
    print(f"{'='*80}\n")

# Persist the enriched frame (prices, indicators, market structure) using the
# default database file name, 'commodities_trading.db'.
save_to_database(all_data)

# ============================================================================
# 7. EXPLORATORY DATA ANALYSIS & VISUALIZATIONS
# ============================================================================

print("\n" + "="*80)
print("EXPLORATORY DATA ANALYSIS")
print("="*80)

# Split the frame once; the three tables below all need the same per-symbol
# slices. groupby() yields symbols in sorted order and keeps row order.
frames_by_symbol = {symbol: frame for symbol, frame in all_data.groupby('Symbol')}

# Summary statistics by instrument
print("\n1. SUMMARY STATISTICS BY INSTRUMENT:")
print("-" * 80)
for symbol in sorted(frames_by_symbol):
    symbol_data = frames_by_symbol[symbol]
    info = INSTRUMENTS[symbol]
    print(f"\n{symbol} ({info['name']}) - {info['currency']}:")
    print(f"  Records: {len(symbol_data):,}")
    print(f"  Price Range: {symbol_data['Price'].min():.2f} - {symbol_data['Price'].max():.2f}")
    print(f"  Avg Price: {symbol_data['Price'].mean():.2f}")
    print(f"  Avg Daily Return: {symbol_data['Daily_Return'].mean():.3f}%")
    print(f"  Volatility (Std Dev): {symbol_data['Daily_Return'].std():.3f}%")
    print(f"  Current RSI: {symbol_data['RSI'].iloc[-1]:.1f}")

# Recent market structure (last row per symbol = most recent date)
print("\n2. CURRENT MARKET STATUS:")
print("-" * 80)
print(f"{'Symbol':<8} {'Name':<20} {'Price':<12} {'Currency':<8} {'RSI':<8} {'Market Structure':<15}")
print("-" * 80)
for symbol in sorted(frames_by_symbol):
    latest = frames_by_symbol[symbol].iloc[-1]
    info = INSTRUMENTS[symbol]
    print(f"{symbol:<8} {info['name']:<20} {latest['Price']:<12.2f} {info['currency']:<8} "
          f"{latest['RSI']:<8.1f} {latest['Market_Structure']:<15}")

# Data availability
print("\n3. DATA AVAILABILITY BY INSTRUMENT:")
print("-" * 80)
print(f"{'Symbol':<8} {'Name':<20} {'Start Date':<12} {'End Date':<12} {'Records':<10} {'Trading Days':<12}")
print("-" * 80)
for symbol in sorted(frames_by_symbol):
    symbol_data = frames_by_symbol[symbol]
    info = INSTRUMENTS[symbol]
    trading_days = len(symbol_data)
    # Dates are converted to text and padded to the header's column width;
    # previously they were printed unpadded and the columns drifted.
    start_date = str(symbol_data['Date'].min().date())
    end_date = str(symbol_data['Date'].max().date())
    print(f"{symbol:<8} {info['name']:<20} {start_date:<12} {end_date:<12} "
          f"{len(symbol_data):<10,} {trading_days:<12,}")

print("\n" + "="*80)
print("GENERATING VISUALIZATIONS (10+ charts)")
print("="*80 + "\n")

# Get symbols list (shared by all charts below)
symbols_list = sorted(all_data['Symbol'].unique())

# Visualization 1: Price Trends (2x4 grid)
# One panel per instrument: price with its 20- and 50-row moving averages.
print("1. Creating price trends chart...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
trend_lines = (
    ('Price', 'Price', {'linewidth': 2, 'color': '#2E86AB'}),
    ('SMA_20', 'SMA 20', {'alpha': 0.7, 'color': '#A23B72', 'linestyle': '--'}),
    ('SMA_50', 'SMA 50', {'alpha': 0.7, 'color': '#F18F01', 'linestyle': '--'}),
)
for ax, symbol in zip(axes, symbols_list):
    data = all_data[all_data['Symbol'] == symbol]
    if data.empty:
        continue
    for column, label, line_style in trend_lines:
        ax.plot(data['Date'], data[column], label=label, **line_style)
    info = INSTRUMENTS[symbol]
    ax.set_title(f'{symbol} - {info["name"]} ({info["currency"]})', fontsize=11, fontweight='bold')
    ax.set_xlabel('Date', fontsize=9)
    ax.set_ylabel(f'Price ({info["currency"]})', fontsize=9)
    ax.legend(fontsize=8, loc='best')
    ax.tick_params(axis='x', rotation=45, labelsize=8)
    ax.grid(True, alpha=0.3)
plt.suptitle('Price Trends with Moving Averages (2025)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('price_trends.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: price_trends.png")
plt.close()

# Visualization 2: Bollinger Bands (2x4 grid)
# One panel per instrument: price, the three bands and a shaded band area.
print("2. Creating Bollinger Bands chart...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
band_lines = (
    ('Price', 'Price', {'linewidth': 2, 'color': 'black'}),
    ('BB_Upper', 'Upper Band', {'linestyle': '--', 'alpha': 0.7, 'color': 'red'}),
    ('BB_Middle', 'Middle', {'alpha': 0.7, 'color': 'blue'}),
    ('BB_Lower', 'Lower Band', {'linestyle': '--', 'alpha': 0.7, 'color': 'green'}),
)
for ax, symbol in zip(axes, symbols_list):
    data = all_data[all_data['Symbol'] == symbol]
    if data.empty:
        continue
    dates = data['Date']
    for column, label, line_style in band_lines:
        ax.plot(dates, data[column], label=label, **line_style)
    ax.fill_between(dates, data['BB_Lower'], data['BB_Upper'], alpha=0.15, color='lightblue')
    info = INSTRUMENTS[symbol]
    ax.set_title(f'{symbol} - Bollinger Bands', fontsize=11, fontweight='bold')
    ax.set_xlabel('Date', fontsize=9)
    ax.set_ylabel(f'Price ({info["currency"]})', fontsize=9)
    ax.legend(fontsize=8, loc='best')
    ax.tick_params(axis='x', rotation=45, labelsize=8)
    ax.grid(True, alpha=0.3)
plt.suptitle('Bollinger Bands Analysis (2025)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('bollinger_bands.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: bollinger_bands.png")
plt.close()

# Visualization 3: RSI Comparison (2x4 grid)
# One panel per instrument: RSI line with the 70/30 reference levels.
print("3. Creating RSI analysis chart...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
rsi_levels = (
    (70, 'r', 'Overbought (70)'),
    (30, 'g', 'Oversold (30)'),
)
for ax, symbol in zip(axes, symbols_list):
    data = all_data[all_data['Symbol'] == symbol]
    if data.empty:
        continue
    ax.plot(data['Date'], data['RSI'], linewidth=2, color='purple')
    for level, level_color, level_label in rsi_levels:
        ax.axhline(y=level, color=level_color, linestyle='--', label=level_label, alpha=0.7, linewidth=1.5)
    # Shade the neutral zone between the two thresholds
    ax.fill_between(data['Date'], 30, 70, alpha=0.1, color='gray')
    ax.set_title(f'{symbol} - RSI', fontsize=11, fontweight='bold')
    ax.set_xlabel('Date', fontsize=9)
    ax.set_ylabel('RSI', fontsize=9)
    ax.legend(fontsize=8, loc='best')
    ax.set_ylim(0, 100)
    ax.tick_params(axis='x', rotation=45, labelsize=8)
    ax.grid(True, alpha=0.3)
plt.suptitle('RSI (Relative Strength Index) Analysis (2025)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('rsi_analysis.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: rsi_analysis.png")
plt.close()

# Visualization 4: Daily Returns Distribution (2x4 grid)
# One histogram per instrument with mean, median and zero reference lines.
print("4. Creating returns distribution chart...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
for ax, symbol in zip(axes, symbols_list):
    data = all_data[all_data['Symbol'] == symbol]['Daily_Return'].dropna()
    if data.empty:
        continue
    mean_return = data.mean()
    median_return = data.median()
    ax.hist(data, bins=40, edgecolor='black', alpha=0.7, color='skyblue')
    ax.axvline(mean_return, color='r', linestyle='--', linewidth=2, label=f'Mean: {mean_return:.2f}%')
    ax.axvline(median_return, color='g', linestyle='--', linewidth=2, label=f'Median: {median_return:.2f}%')
    ax.axvline(0, color='black', linestyle='-', linewidth=1, alpha=0.3)
    ax.set_title(f'{symbol} - Daily Returns', fontsize=11, fontweight='bold')
    ax.set_xlabel('Daily Return (%)', fontsize=9)
    ax.set_ylabel('Frequency', fontsize=9)
    ax.legend(fontsize=8, loc='best')
    ax.grid(True, alpha=0.3, axis='y')
plt.suptitle('Daily Returns Distribution (2025)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('returns_distribution.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: returns_distribution.png")
plt.close()

# Visualization 5: Correlation Heatmap
# Pairwise correlation of price levels across instruments (dates as rows,
# symbols as columns); the upper triangle is masked as it mirrors the lower.
print("5. Creating correlation matrix...")
plt.figure(figsize=(12, 10))
pivot_data = all_data.pivot_table(values='Price', index='Date', columns='Symbol')
correlation = pivot_data.corr()
mask = np.triu(np.ones(correlation.shape, dtype=bool))
heatmap_options = {
    'mask': mask,
    'annot': True,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {"shrink": 0.8},
    'fmt': '.2f',
    'annot_kws': {'size': 11},
    'vmin': -1,
    'vmax': 1,
}
sns.heatmap(correlation, **heatmap_options)
plt.title('Price Correlation Matrix - All Instruments (2025)', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: correlation_matrix.png")
plt.close()

# Visualization 6: Backwardation/Contango Analysis (2x4 grid)
# One bar chart per instrument of the smoothed spread; bars below zero are
# green, all others red, with dashed guides at the +/-0.5% thresholds.
print("6. Creating backwardation/contango chart...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
for ax, symbol in zip(axes, symbols_list):
    data = all_data[all_data['Symbol'] == symbol]
    if data.empty:
        continue
    spread = data['Spread_MA10']
    colors = ['green' if value < 0 else 'red' for value in spread]
    ax.bar(data['Date'], spread, color=colors, alpha=0.6, width=1)
    ax.axhline(y=0, color='black', linestyle='-', linewidth=2)
    ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5, linewidth=1)
    ax.axhline(y=-0.5, color='green', linestyle='--', alpha=0.5, linewidth=1)
    ax.set_title(f'{symbol} - Market Structure', fontsize=11, fontweight='bold')
    ax.set_xlabel('Date', fontsize=9)
    ax.set_ylabel('Spread (%)', fontsize=9)
    ax.tick_params(axis='x', rotation=45, labelsize=8)
    ax.grid(True, alpha=0.3, axis='y')
plt.suptitle('Backwardation/Contango Analysis (2025)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('market_structure.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: market_structure.png")
plt.close()

# Visualization 7: Volume Analysis (2x4 grid)
# One bar chart per instrument; a bar is red on a down day and green
# otherwise. Instruments without any volume get a placeholder message.
print("7. Creating volume analysis chart...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
for ax, symbol in zip(axes, symbols_list):
    data = all_data[all_data['Symbol'] == symbol]
    has_volume = len(data) > 0 and data['Vol'].sum() > 0
    if not has_volume:
        ax.text(0.5, 0.5, f'No volume data\navailable for {symbol}',
                ha='center', va='center', transform=ax.transAxes, fontsize=10)
        ax.set_title(f'{symbol} - Trading Volume', fontsize=11, fontweight='bold')
        continue
    day_returns = data['Daily_Return'].fillna(0)
    colors = ['red' if ret < 0 else 'green' for ret in day_returns]
    ax.bar(data['Date'], data['Vol'], alpha=0.6, color=colors, width=1)
    ax.set_title(f'{symbol} - Trading Volume', fontsize=11, fontweight='bold')
    ax.set_xlabel('Date', fontsize=9)
    ax.set_ylabel('Volume', fontsize=9)
    ax.tick_params(axis='x', rotation=45, labelsize=8)
    ax.grid(True, alpha=0.3, axis='y')
    # Format y-axis for large numbers (no scientific notation)
    ax.ticklabel_format(style='plain', axis='y')
plt.suptitle('Trading Volume Analysis (2025)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('volume_analysis.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: volume_analysis.png")
plt.close()

# Visualization 8: Cumulative Returns
# All instruments on one chart: daily returns compounded from the first row.
print("8. Creating cumulative returns chart...")
plt.figure(figsize=(16, 9))
for symbol in symbols_list:
    data = all_data[all_data['Symbol'] == symbol]
    if data.empty:
        continue
    growth_factor = (1 + data['Daily_Return'] / 100).cumprod()
    cumulative_return = growth_factor - 1
    plt.plot(data['Date'], cumulative_return * 100, label=symbol, linewidth=2.5)

plt.title('Cumulative Returns Comparison - All Instruments (2025)', fontsize=16, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Cumulative Return (%)', fontsize=12)
plt.legend(fontsize=11, loc='best', ncol=2)
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='black', linestyle='-', linewidth=1, alpha=0.5)
plt.tight_layout()
plt.savefig('cumulative_returns.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: cumulative_returns.png")
plt.close()

# Visualization 9: Risk-Return Scatter
# One labelled marker per instrument: x = std dev of daily returns,
# y = mean daily return.
print("9. Creating risk-return profile chart...")
plt.figure(figsize=(12, 9))
risk_return = all_data.groupby('Symbol')['Daily_Return'].agg(['mean', 'std']).reset_index()
risk_return.columns = ['Symbol', 'Avg_Return', 'Volatility']

# Define colors for each instrument type
colors_map = {
    'EUA': '#9B59B6', 'OIL': '#E74C3C', 'NG': '#3498DB',
    'GOLD': '#F39C12', 'SILVER': '#95A5A6', 'BTC': '#E67E22',
    'SPX': '#27AE60', 'NDX': '#16A085'
}

for point in risk_return.itertuples(index=False):
    position = (point.Volatility, point.Avg_Return)
    marker_color = colors_map.get(point.Symbol, 'gray')
    plt.scatter(position[0], position[1], s=400, alpha=0.7, color=marker_color,
                edgecolors='black', linewidth=2)
    plt.annotate(point.Symbol, position,
                 fontsize=12, ha='center', va='center', fontweight='bold', color='white')

plt.axhline(y=0, color='black', linestyle='--', linewidth=1.5, alpha=0.5)
plt.axvline(x=risk_return['Volatility'].mean(), color='gray', linestyle='--', linewidth=1, alpha=0.3, label='Avg Volatility')
plt.title('Risk-Return Profile - All Instruments (2025)', fontsize=16, fontweight='bold')
plt.xlabel('Volatility (Std Dev of Daily Returns %)', fontsize=12)
plt.ylabel('Average Daily Return (%)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.legend(fontsize=10)
plt.tight_layout()
plt.savefig('risk_return.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: risk_return.png")
plt.close()

# Visualization 10: Price Performance Comparison (Normalized)
# Every instrument is rebased so that its first price equals 100, which
# makes instruments with very different price levels comparable.
print("10. Creating normalized performance chart...")
plt.figure(figsize=(16, 9))
for symbol in symbols_list:
    data = all_data[all_data['Symbol'] == symbol].copy()
    if len(data) > 0 and data['Price'].iloc[0] > 0:
        data['Normalized_Price'] = (data['Price'] / data['Price'].iloc[0]) * 100
        plt.plot(data['Date'], data['Normalized_Price'], label=symbol, linewidth=2.5)

plt.title('Normalized Price Performance - All Instruments (Base = 100, 2025)', fontsize=16, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Normalized Price (Index)', fontsize=12)
# The baseline must be drawn before legend() is called: the legend only
# lists labelled artists that already exist, so drawing it afterwards left
# 'Baseline' out of the legend.
plt.axhline(y=100, color='black', linestyle='--', linewidth=1.5, alpha=0.5, label='Baseline')
plt.legend(fontsize=11, loc='best', ncol=2)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('normalized_performance.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: normalized_performance.png")
plt.close()

# Visualization 11: Monthly Returns (2x4 grid of bar charts)
# One panel per instrument with one bar per calendar month.
print("11. Creating monthly returns heatmap...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for i, symbol in enumerate(symbols_list):
    data = all_data[all_data['Symbol'] == symbol].copy()
    if len(data) > 0:
        data['Month'] = data['Date'].dt.to_period('M')
        # Compound the daily returns within each month. Simply adding daily
        # percentage returns misstates the monthly figure and disagrees with
        # the compounding used for the cumulative-returns chart. Missing
        # daily returns (the first row of each instrument) are skipped.
        monthly_returns = (
            data.groupby('Month')['Daily_Return']
            .agg(lambda r: ((1 + r / 100).prod() - 1) * 100)
            .reset_index()
        )
        monthly_returns['Month_Str'] = monthly_returns['Month'].astype(str)

        colors = ['red' if x < 0 else 'green' for x in monthly_returns['Daily_Return']]
        axes[i].bar(range(len(monthly_returns)), monthly_returns['Daily_Return'],
                   color=colors, alpha=0.7, edgecolor='black')
        axes[i].set_title(f'{symbol} - Monthly Returns', fontsize=11, fontweight='bold')
        axes[i].set_xlabel('Month', fontsize=9)
        axes[i].set_ylabel('Return (%)', fontsize=9)
        axes[i].axhline(y=0, color='black', linestyle='-', linewidth=1)
        axes[i].set_xticks(range(len(monthly_returns)))
        # Period strings look like 'YYYY-MM'; only the month part is shown.
        axes[i].set_xticklabels([m.split('-')[1] for m in monthly_returns['Month_Str']],
                               rotation=45, fontsize=8)
        axes[i].grid(True, alpha=0.3, axis='y')

plt.suptitle('Monthly Returns by Instrument (2025)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('monthly_returns.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: monthly_returns.png")
plt.close()

# Visualization 12: Volatility Over Time
# One panel per instrument: standard deviation of daily returns over a
# 20-row rolling window (undefined until the window is full).
print("12. Creating volatility over time chart...")
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for ax, symbol in zip(axes, symbols_list):
    data = all_data[all_data['Symbol'] == symbol]
    if data.empty:
        continue
    rolling_vol = data['Daily_Return'].rolling(window=20).std()
    ax.plot(data['Date'], rolling_vol, linewidth=2, color='darkred')
    ax.fill_between(data['Date'], rolling_vol, alpha=0.3, color='red')
    ax.set_title(f'{symbol} - Volatility (20-day)', fontsize=11, fontweight='bold')
    ax.set_xlabel('Date', fontsize=9)
    ax.set_ylabel('Volatility (%)', fontsize=9)
    ax.tick_params(axis='x', rotation=45, labelsize=8)
    ax.grid(True, alpha=0.3)

plt.suptitle('Rolling Volatility Analysis (2025)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('volatility_analysis.png', dpi=300, bbox_inches='tight')
print("  âœ“ Saved: volatility_analysis.png")
plt.close()

# List the chart files written above, in the order they were generated.
chart_files = [
    'price_trends.png',
    'bollinger_bands.png',
    'rsi_analysis.png',
    'returns_distribution.png',
    'correlation_matrix.png',
    'market_structure.png',
    'volume_analysis.png',
    'cumulative_returns.png',
    'risk_return.png',
    'normalized_performance.png',
    'monthly_returns.png',
    'volatility_analysis.png',
]

print("\n" + "="*80)
print("âœ“ ALL VISUALIZATIONS COMPLETE!")
print("="*80)
print("\nTotal charts generated: 12 PNG files")
for chart_number, chart_file in enumerate(chart_files, start=1):
    print(f"  {chart_number}. {chart_file}")

print("\n" + "="*80)
print("ANALYSIS COMPLETE!")
print("="*80)
print("\nâœ“ Database created: commodities_trading.db")
print("âœ“ Total visualizations saved: 12 PNG files")
print("\nðŸ“Š Summary by Instrument:")
print("-" * 80)
print(f"{'Symbol':<8} {'Name':<20} {'Records':<10} {'Avg Price':<12} {'Volatility':<12} {'Avg Return':<12}")
print("-" * 80)

# One row per instrument: record count, mean price, and the standard
# deviation and mean of the daily returns.
for symbol in sorted(all_data['Symbol'].unique()):
    symbol_data = all_data[all_data['Symbol'] == symbol]
    if symbol_data.empty:
        continue
    info = INSTRUMENTS[symbol]
    daily_returns = symbol_data['Daily_Return']
    avg_price = symbol_data['Price'].mean()
    volatility = daily_returns.std()
    avg_return = daily_returns.mean()

    print(f"{symbol:<8} {info['name']:<20} {len(symbol_data):<10,} "
          f"{avg_price:<12.2f} {volatility:<12.3f} {avg_return:<12.3f}")

print(f"\n{'='*80}")
print("ðŸ“ˆ KEY INSIGHTS:")
print("-" * 80)

# Calculate some key statistics: per-symbol std dev and mean of daily returns
all_symbols = sorted(all_data['Symbol'].unique())
returns_by_symbol = {
    symbol: all_data[all_data['Symbol'] == symbol]['Daily_Return']
    for symbol in all_symbols
}
volatilities = {symbol: series.std() for symbol, series in returns_by_symbol.items()}
returns = {symbol: series.mean() for symbol, series in returns_by_symbol.items()}

most_volatile = max(volatilities, key=volatilities.get)
least_volatile = min(volatilities, key=volatilities.get)
best_performer = max(returns, key=returns.get)
worst_performer = min(returns, key=returns.get)

print(f"\nðŸ”¥ Most Volatile: {most_volatile} ({volatilities[most_volatile]:.2f}% std dev)")
print(f"ðŸ˜Œ Least Volatile: {least_volatile} ({volatilities[least_volatile]:.2f}% std dev)")
print(f"ðŸ“ˆ Best Performer: {best_performer} ({returns[best_performer]:.3f}% avg daily return)")
print(f"ðŸ“‰ Worst Performer: {worst_performer} ({returns[worst_performer]:.3f}% avg daily return)")

# Market structure summary: share of rows in each classification per symbol
print(f"\nðŸ”„ Market Structure Summary:")
print("-" * 80)
for symbol in all_symbols:
    symbol_data = all_data[all_data['Symbol'] == symbol]
    structure_counts = symbol_data['Market_Structure'].value_counts()
    total = len(symbol_data)

    shares = {}
    for label in ('Contango', 'Backwardation', 'Normal'):
        if total > 0:
            shares[label] = structure_counts.get(label, 0) / total * 100
        else:
            shares[label] = 0
    contango_pct = shares['Contango']
    backwardation_pct = shares['Backwardation']
    normal_pct = shares['Normal']

    print(f"{symbol:<8} | Contango: {contango_pct:>5.1f}% | Backwardation: {backwardation_pct:>5.1f}% | Normal: {normal_pct:>5.1f}%")

# Closing banner: how to start the dashboard and which demo accounts exist.
# NOTE(review): this echoes the default passwords to the console.
closing_rule = "="*80
closing_lines = [
    f"\n{closing_rule}",
    "ðŸš€ NEXT STEP: Run the Streamlit app",
    closing_rule,
    "\nCommand:",
    "  streamlit run streamlit_app.py",
    "\nLogin credentials:",
    "  Username: admin    | Password: admin123",
    "  Username: trader1  | Password: trader123",
    "  Username: demo_user | Password: demo123",
    closing_rule,
]
for closing_line in closing_lines:
    print(closing_line)

LOADING DATA FROM CSV FILES
Data Period: January 1, 2025 - December 31, 2025

Loading EU Allowance (EUA)...
  ðŸ“‹ Columns in EUA: ['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
  âœ“ Loaded 256 valid records for EUA
Loading Crude Oil WTI (OIL)...
  ðŸ“‹ Columns in OIL: ['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
  âœ“ Loaded 259 valid records for OIL
Loading Natural Gas (NG)...
  ðŸ“‹ Columns in NG: ['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
  âœ“ Loaded 259 valid records for NG
Loading Gold (GOLD)...
  ðŸ“‹ Columns in GOLD: ['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
  âœ“ Loaded 259 valid records for GOLD
Loading Silver (SILVER)...
  ðŸ“‹ Columns in SILVER: ['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
  âœ“ Loaded 258 valid records for SILVER
Loading Bitcoin (BTC)...
  ðŸ“‹ Columns in BTC: ['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
  âœ“ Loaded 365 valid records for BTC
Loading S&P 50