In [None]:
# Imports for data manipulation, random number generation, and file operations
# (pandas and numpy are third-party packages; the rest are standard library)
import pandas as pd              # Data manipulation and analysis
import numpy as np              # Numerical computing and random number generation
from datetime import datetime, timedelta  # Date and time handling
import random                   # Additional random number generation
import os                      # Operating system interface for file operations
from typing import Tuple, List, Dict  # Type hints for better code documentation

# Notebook-wide pandas display options, applied in one pass:
#   - display.max_columns = None -> no cap on the number of columns shown
#   - display.width = None       -> no fixed output width
#   - display.max_colwidth = 50  -> cell text is cut off at 50 characters
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)

In [None]:
class FinancialDataGenerator:
    """
    Mock financial data generator for FinTech projects.

    A class is a blueprint for objects (instances) that share the same data
    attributes and behavior (methods). Instances of this class carry the
    symbol universes used when generating datasets; the generation methods
    themselves are defined outside this class body.

    Datasets the generator is designed to simulate:
    1. Stock market behavior (geometric Brownian motion)
    2. Cryptocurrency volatility (higher volatility, 24/7 trading)
    3. Economic indicators (mean-reverting time series)
    4. Portfolio allocations (risk-based asset allocation)
    5. Customer demographics (realistic distributions)

    Design note: one class acts as a factory for several related kinds of
    financial dataset.

    Attributes:
        stock_symbols: List of 30 stock ticker strings, grouped by sector.
        crypto_symbols: List of 15 cryptocurrency ticker strings.
    """

    def __init__(self, seed: int = 42):
        """
        Seed the global random number generators and set up symbol lists.

        Args:
            seed: Value passed to both ``np.random.seed`` and ``random.seed``.

        Why seed?
        - Makes generated data reproducible between runs
        - Helps debugging and validation
        - Lets team members regenerate identical datasets

        Note: seeding affects the process-wide NumPy and ``random`` state,
        not a generator private to this instance.
        """
        # Seed both global RNGs (NumPy and the standard-library module)
        for seed_rng in (np.random.seed, random.seed):
            seed_rng(seed)

        # Stock tickers, one inner list per sector; flattened below so the
        # resulting order is sector by sector, left to right.
        sector_groups = (
            ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'TSLA', 'META', 'NVDA'],  # Technology giants
            ['JPM', 'BAC', 'V', 'MA'],                                  # Financial services
            ['JNJ', 'PG', 'UNH', 'PFE', 'KO', 'WMT'],                   # Healthcare & consumer goods
            ['DIS', 'HD', 'NKE', 'COST'],                               # Entertainment & retail
            ['ADBE', 'CRM', 'ORCL', 'CSCO'],                            # Software & cloud
            ['T', 'VZ', 'XOM', 'CVX', 'IBM'],                           # Traditional industries
        )
        self.stock_symbols = [
            ticker for sector in sector_groups for ticker in sector
        ]

        # Cryptocurrency tickers (crypto markets are more volatile and
        # trade 24/7):
        #   BTC Bitcoin, ETH Ethereum, BNB Binance Coin, XRP Ripple,
        #   ADA Cardano, DOGE Dogecoin, SOL Solana, TRX Tron, DOT Polkadot,
        #   MATIC Polygon, SHIB Shiba Inu, AVAX Avalanche, LTC Litecoin,
        #   UNI Uniswap, LINK Chainlink
        self.crypto_symbols = [
            'BTC', 'ETH', 'BNB', 'XRP', 'ADA',
            'DOGE', 'SOL', 'TRX', 'DOT', 'MATIC',
            'SHIB', 'AVAX', 'LTC', 'UNI', 'LINK',
        ]

# Test the class initialization: build the generator instance (seed=42)
# that the following cells attach methods to and call.
generator = FinancialDataGenerator(seed=42)

In [None]:
def generate_stock_prices(self, 
                         symbols: List[str] = None,
                         start_date: str = '2020-01-01',
                         end_date: str = '2024-12-31',
                         initial_price_ranges: Dict[str, Tuple[float, float]] = None,
                         default_price_range: Tuple[float, float] = (50.0, 500.0)) -> pd.DataFrame:
    """
    Generate mock daily stock prices using a geometric-Brownian-motion style walk.

    For every symbol a chain of daily closes is simulated and then expanded
    into OHLCV rows. The simulation incorporates:
    1. Random daily log-returns (drift noise plus a volatility shock)
    2. A pull toward the trailing 30-day average close after the first 30 days
    3. A per-day clamp on the close (at most -30% / +50% vs. the previous close)
    4. Trading volume that scales with the size of the day's volatility shock

    Args:
        symbols: Stock symbols to generate (default: all of ``self.stock_symbols``)
        start_date: Start date for the price series (inclusive)
        end_date: End date for the price series (inclusive)
        initial_price_ranges: Mapping ``symbol -> (low, high)``; the starting
            price of each symbol is drawn uniformly from its range.
            ``None`` is treated as an empty mapping.
        default_price_range: ``(low, high)`` range used for any symbol that
            is missing from ``initial_price_ranges``.

    Returns:
        DataFrame with columns: Date, Symbol, Open, High, Low, Close, Volume.
        One row per symbol per weekday; an empty frame (with these columns)
        when there are no symbols or no weekdays in the date range.

    Notes:
        - Only Saturdays and Sundays are excluded; market holidays are not.
        - Random numbers come from the global ``np.random`` state, so seed it
          beforehand for reproducible output.
    """
    columns = ['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume']

    if symbols is None:
        symbols = self.stock_symbols
    if initial_price_ranges is None:
        initial_price_ranges = {}

    # Create business day range (exclude weekends - Sat/Sun)
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    business_days = date_range[date_range.weekday < 5]  # Monday=0, Friday=4
    
    print(f"📅 Generating stock data for {len(business_days)} trading days")
    print(f"📈 Creating price series for {len(symbols)} symbols")
    
    all_stock_data = []
    
    for i, symbol in enumerate(symbols):
        print(f"  📊 Processing {symbol} ({i+1}/{len(symbols)})")

        # Set initial price for this stock; symbols without an explicit
        # range fall back to default_price_range instead of failing.
        range_low, range_high = initial_price_ranges.get(symbol, default_price_range)
        initial_price = np.random.uniform(range_low, range_high)
        
        # Initialize stock-specific parameters
        annual_volatility = np.random.uniform(0.15, 0.45)  # 15-45% annual volatility
        daily_volatility = annual_volatility / np.sqrt(252)  # Convert to daily
        
        # prices[0] is the initial price; prices[k + 1] is the close of day k
        prices = [initial_price]
        volumes = []
        
        # Generate daily price evolution
        for day_idx, _ in enumerate(business_days):
            # Base daily return components
            base_drift = np.random.normal(0.0008, 0.002)  # ~20% annual drift with variation
            volatility_shock = np.random.normal(0, daily_volatility)
            
            # Add mean reversion after 30 days
            if day_idx > 30:
                # Pull toward the average of the last 30 stored prices.
                # NOTE(review): the force is proportional to the absolute
                # price gap, so it is stronger for higher-priced symbols -
                # confirm this scaling is intended.
                recent_avg = np.mean(prices[-30:])
                mean_reversion_force = (recent_avg - prices[-1]) * 0.001
                base_drift += mean_reversion_force
            
            # Apply the log-return to the previous close
            price_multiplier = np.exp(base_drift + volatility_shock)
            new_price = prices[-1] * price_multiplier
            
            # Circuit breakers: clamp the daily move to [-30%, +50%]
            new_price = max(new_price, prices[-1] * 0.70)  # Max 30% daily drop
            new_price = min(new_price, prices[-1] * 1.50)  # Max 50% daily gain
            
            prices.append(new_price)
            
            # Volume: log-normal base, scaled up with the size of the shock
            base_volume = np.random.lognormal(15, 1)
            volatility_multiplier = abs(volatility_shock) * 5 + 1
            daily_volume = int(base_volume * volatility_multiplier)
            volumes.append(daily_volume)
        
        # Convert daily close prices to OHLCV format
        for date, previous_close, close_price, daily_volume in zip(
                business_days, prices, prices[1:], volumes):
            # Generate intraday price range
            intraday_volatility = abs(np.random.normal(0, daily_volatility * close_price))
            
            # High/low are offsets around the close; open is a small gap
            # around the previous close
            high_price = close_price + np.random.uniform(0, 1) * intraday_volatility
            low_price = close_price - np.random.uniform(0, 1) * intraday_volatility  
            open_price = previous_close + np.random.normal(0, daily_volatility * previous_close * 0.3)
            
            # Ensure OHLC logical consistency: Low <= Open,Close <= High
            high_price = max(high_price, open_price, close_price)
            low_price = min(low_price, open_price, close_price)
            
            # Add to dataset
            all_stock_data.append({
                'Date': date,
                'Symbol': symbol,
                'Open': round(open_price, 2),
                'High': round(high_price, 2), 
                'Low': round(low_price, 2),
                'Close': round(close_price, 2),
                'Volume': daily_volume
            })
    
    # Passing columns keeps the schema stable even when no rows were produced
    stock_df = pd.DataFrame(all_stock_data, columns=columns)
    return stock_df

# Add the method to our generator class
FinancialDataGenerator.generate_stock_prices = generate_stock_prices

# Demo run: generate prices for three stocks.
stocks_symbols = ['AAPL', 'GOOGL', 'TSLA']

# Starting-price ranges, keyed by the same symbols requested above so that
# every symbol has an entry. The numbers are illustrative starting points
# for the simulation, not historical quotes.
stocks_initial_price_ranges = {
    'AAPL': (150, 200),
    'GOOGL': (100, 150),
    'TSLA': (200, 300),
}
stocks = generator.generate_stock_prices(
    symbols=stocks_symbols,
    initial_price_ranges=stocks_initial_price_ranges,
)

In [None]:
def generate_crypto_prices(self, 
                         symbols: List[str] = None,
                         start_date: str = '2020-01-01',
                         end_date: str = '2024-12-31',
                         initial_price_ranges: Dict[str, Tuple[float, float]] = None,
                         default_price_range: Tuple[float, float] = (1.0, 100.0)) -> pd.DataFrame:
    """
    Generate mock cryptocurrency prices on a continuous 6-hour grid.

    Compared with the stock generator this simulation uses:
    - Higher volatility (50-120% annualized)
    - Every calendar day (no weekend gaps)
    - Wider per-period clamps on price moves (-50% / +100%)
    - Damped drift noise on weekends and during late-night/early-morning hours

    Args:
        symbols: Crypto symbols to generate (default: the first 10 entries
            of ``self.crypto_symbols``)
        start_date: Start of the timestamp grid (inclusive)
        end_date: End of the timestamp grid (inclusive)
        initial_price_ranges: Mapping ``symbol -> (low, high)``; the starting
            price of each symbol is drawn uniformly from its range.
            ``None`` is treated as an empty mapping.
        default_price_range: ``(low, high)`` range used for any symbol that
            is missing from ``initial_price_ranges``.

    Returns:
        DataFrame with columns: Timestamp, Symbol, Open, High, Low, Close, Volume.
        One row per symbol per 6-hour period; an empty frame (with these
        columns) when there are no symbols or no periods.

    Technical Implementation:
        - 6-hour intervals counted from ``start_date`` (4 data points per day)
        - Volatility converted to a 6-hour scale via sqrt(365 * 4)
        - Prices below 10 are rounded to 6 decimals, others to 2
        - Random numbers come from the global ``np.random`` state
    """
    columns = ['Timestamp', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume']

    if symbols is None:
        symbols = self.crypto_symbols[:10]  # First 10 cryptocurrencies
    if initial_price_ranges is None:
        initial_price_ranges = {}
    
    # Crypto trades 24/7 - build the 6-hour grid directly
    # (00:00, 06:00, 12:00, 18:00 when start_date falls on midnight)
    crypto_timestamps = pd.date_range(start=start_date, end=end_date, freq='6h')

    all_crypto_data = []
    
    for i, symbol in enumerate(symbols):
        print(f"  💰 Processing {symbol} ({i+1}/{len(symbols)})")
        
        # Set initial price for this crypto; symbols without an explicit
        # range fall back to default_price_range instead of failing.
        range_low, range_high = initial_price_ranges.get(symbol, default_price_range)
        initial_price = np.random.uniform(range_low, range_high)
        
        # Crypto volatility is much higher than stocks
        annual_volatility = np.random.uniform(0.5, 1.2)  # 50-120% annual volatility
        six_hour_volatility = annual_volatility / np.sqrt(365 * 4)  # Convert to 6-hour periods
        
        # prices[0] is the initial price; prices[k + 1] is the close of period k
        prices = [initial_price]
        volumes = []
        
        # Generate price evolution for each 6-hour period
        for timestamp in crypto_timestamps:
            # Base price movement: zero-mean drift noise plus volatility shock
            base_drift = np.random.normal(0, 0.001)
            volatility_shock = np.random.normal(0, six_hour_volatility)
            
            # Weekend effect: damp the drift noise on Saturday/Sunday
            if timestamp.weekday() >= 5:  # Saturday=5, Sunday=6
                base_drift *= 0.7
            
            # Night-time effect: damp the drift noise before 06:00 / after 22:00.
            # NOTE(review): the hour is read from the generated timestamp
            # as-is; which timezone that represents is not defined here.
            if timestamp.hour < 6 or timestamp.hour > 22:
                base_drift *= 0.5
            
            # Apply the log-return to the previous close
            price_multiplier = np.exp(base_drift + volatility_shock)
            new_price = prices[-1] * price_multiplier
            
            # Crypto circuit breakers (more lenient than the stock limits)
            new_price = max(new_price, prices[-1] * 0.5)   # Max 50% period drop
            new_price = min(new_price, prices[-1] * 2.0)   # Max 100% period gain
            
            prices.append(new_price)
            
            # Volume: log-normal base, scaled up with the size of the shock
            base_volume = np.random.lognormal(12, 1.5)
            volatility_multiplier = abs(volatility_shock) * 10 + 1
            period_volume = int(base_volume * volatility_multiplier)
            volumes.append(period_volume)
        
        # Convert to OHLCV format for each 6-hour period
        # TODO: aggregate to daily data instead of the 6-hour window
        for timestamp, previous_close, close_price, period_volume in zip(
                crypto_timestamps, prices, prices[1:], volumes):
            # Generate the high/low spread for this 6-hour period
            period_volatility = abs(np.random.normal(0, six_hour_volatility * close_price * 2))
            
            # Calculate OHLC
            high_price = close_price + np.random.uniform(0, 1) * period_volatility
            low_price = close_price - np.random.uniform(0, 1) * period_volatility
            open_price = previous_close + np.random.normal(0, six_hour_volatility * previous_close)
            
            # Ensure OHLC consistency: Low <= Open,Close <= High
            high_price = max(high_price, open_price, close_price)
            low_price = min(low_price, open_price, close_price)
            
            # Low-priced coins need more decimal places to stay meaningful
            decimal_places = 6 if close_price < 10 else 2
            
            all_crypto_data.append({
                'Timestamp': timestamp,
                'Symbol': symbol,
                'Open': round(open_price, decimal_places),
                'High': round(high_price, decimal_places),
                'Low': round(low_price, decimal_places), 
                'Close': round(close_price, decimal_places),
                'Volume': period_volume
            })
    
    # Passing columns keeps the schema stable even when no rows were produced
    crypto_df = pd.DataFrame(all_crypto_data, columns=columns)
    return crypto_df

# Add method to the generator class
FinancialDataGenerator.generate_crypto_prices = generate_crypto_prices

# Demo run: generate prices for three cryptocurrencies. Every requested
# symbol has an entry in the starting-price table below.
crypto_symbols = ['BTC', 'ETH', 'SOL']
crypto_initial_price_ranges = {
    'BTC': (30000, 60000),    # Bitcoin: $30k-60k range
    'ETH': (2000, 4000),      # Ethereum: $2k-4k range
    'BNB': (300, 600),        # Binance Coin: $300-600
    'XRP': (0.5, 1.5),        # Ripple: $0.50-1.50
    'ADA': (0.3, 1.2),        # Cardano: $0.30-1.20
    'DOGE': (0.05, 0.3),      # Dogecoin: $0.05-0.30
    'SOL': (50, 200),         # Solana: $50-200
    'TRX': (0.06, 0.12),      # Tron: $0.06-0.12
    'DOT': (5, 30),           # Polkadot: $5-30
    'MATIC': (0.5, 2.5),      # Polygon: $0.50-2.50
}
# Call the crypto generator and keep the result in its own variable so it
# does not replace any previously generated stock data.
crypto = generator.generate_crypto_prices(
    symbols=crypto_symbols,
    initial_price_ranges=crypto_initial_price_ranges,
)