# Part 1: Enhanced Data Collection (Top 10 Cryptos)
Dataset: BTC, ETH, BNB, SOL, XRP, ADA, AVAX, DOGE, DOT, MATIC, each quoted against USDT (2 years of hourly data)

### CELL 1: Setup & Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem so that everything written under
# /content/drive/MyDrive is stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

# Create directories
# (`!` runs the line as a shell command; `mkdir -p` also creates missing
# parents and does not fail when the directory already exists.)
!mkdir -p /content/drive/MyDrive/crypto_bot/data
!mkdir -p /content/drive/MyDrive/crypto_bot/models
!mkdir -p /content/drive/MyDrive/crypto_bot/charts

# NOTE(review): this message is printed unconditionally; it does not check
# the result of the mount or of the mkdir commands.
print("‚úÖ Google Drive mounted successfully!")

### CELL 2: Install Dependencies

In [None]:
# Install the third-party packages for this notebook via a shell command
# (`-q` reduces pip's output). scikit-learn is installed here but is not
# imported by any of the cells visible in this notebook.
!pip install -q ccxt pandas numpy pandas-ta mplfinance pillow scikit-learn tqdm

# NOTE(review): printed unconditionally; pip's exit status is not checked.
print("‚úÖ Dependencies installed!")

### CELL 3: Fetch Top 10 Crypto Data (2 Years)

In [None]:
import ccxt
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import time

# Top 10 cryptocurrencies by market cap
# Each entry is a "BASE/USDT" market symbol; the code below strips the
# "/USDT" suffix to derive the short name used in file names and in the
# DataFrame 'symbol' column.
# NOTE(review): the list is hard-coded. Confirm every pair is still listed on
# the exchange before running - MATIC/USDT in particular may no longer be
# available following Polygon's token migration (TODO verify).
TOP_10_SYMBOLS = [
    'BTC/USDT',   # Bitcoin
    'ETH/USDT',   # Ethereum
    'BNB/USDT',   # Binance Coin
    'SOL/USDT',   # Solana
    'XRP/USDT',   # Ripple
    'ADA/USDT',   # Cardano
    'AVAX/USDT',  # Avalanche
    'DOGE/USDT',  # Dogecoin
    'DOT/USDT',   # Polkadot
    'MATIC/USDT'  # Polygon
]

def fetch_historical_data(symbol, timeframe='1h', days=730, max_retries=5):
    """
    Fetch historical OHLCV candles for one symbol from Binance (spot market).

    Pages forward through ``exchange.fetch_ohlcv`` in batches of up to 1000
    candles, starting ``days`` days before the current time, until the
    exchange returns an empty batch or a candle whose timestamp is at or past
    the time the function was called.
    With the defaults: 730 days * 24 hours = 17,520 candles per symbol.

    Args:
        symbol: Market symbol such as 'BTC/USDT'.
        timeframe: Candle interval passed to ``fetch_ohlcv``.
        days: How many days back from now to start fetching.
        max_retries: Maximum number of *consecutive* failed requests before
            giving up. The counter resets after every successful request.

    Returns:
        DataFrame with columns timestamp, open, high, low, close, volume and
        symbol (the symbol with '/USDT' removed), de-duplicated on timestamp.
        The timestamp column is parsed from epoch milliseconds.

    Raises:
        RuntimeError: If ``max_retries`` requests in a row fail. Without this
            bound a permanent error (for example an unsupported symbol) would
            be retried forever. Candles fetched so far are discarded.
    """
    exchange = ccxt.binance({
        'enableRateLimit': True,
        'options': {'defaultType': 'spot'}
    })

    # Fetch window in epoch milliseconds: [now - days, now]
    end_time = datetime.now()
    start_time = end_time - timedelta(days=days)
    since = int(start_time.timestamp() * 1000)
    end_ms = int(end_time.timestamp() * 1000)

    all_data = []
    current_since = since
    consecutive_failures = 0

    print(f"üìä Fetching {symbol}...")

    # The progress total assumes one candle per hour; it is only an estimate
    # for other timeframes.
    with tqdm(total=days*24, desc=f"{symbol}", unit="candle") as pbar:
        while True:
            # Only the network call is guarded, so that programming errors in
            # the bookkeeping below are not retried.
            try:
                ohlcv = exchange.fetch_ohlcv(
                    symbol,
                    timeframe,
                    since=current_since,
                    limit=1000
                )
            except Exception as e:
                consecutive_failures += 1
                print(f"‚ùå Error fetching {symbol}: {e}")
                if consecutive_failures >= max_retries:
                    raise RuntimeError(
                        f"Giving up on {symbol} after {max_retries} "
                        f"consecutive failed requests"
                    ) from e
                time.sleep(5)
                continue

            consecutive_failures = 0

            # An empty batch means there is nothing newer to fetch
            if not ohlcv:
                break

            all_data.extend(ohlcv)
            pbar.update(len(ohlcv))

            # Check if we've reached current time
            last_timestamp = ohlcv[-1][0]
            if last_timestamp >= end_ms:
                break

            # Move to next batch: start just after the last candle received
            current_since = last_timestamp + 1

            # Rate limiting (exchange.rateLimit is in milliseconds)
            time.sleep(exchange.rateLimit / 1000)

    # Convert to DataFrame
    df = pd.DataFrame(
        all_data,
        columns=['timestamp', 'open', 'high', 'low', 'close', 'volume']
    )
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df['symbol'] = symbol.replace('/USDT', '')

    # Remove duplicates
    df = df.drop_duplicates(subset='timestamp').reset_index(drop=True)

    print(f"‚úÖ {symbol}: {len(df)} candles ({df['timestamp'].min()} to {df['timestamp'].max()})")

    return df

# Download every symbol in TOP_10_SYMBOLS, keeping each DataFrame in
# all_crypto_data (keyed by the full 'XXX/USDT' symbol) and writing a raw CSV
# per symbol to Drive. A failure on one symbol is reported and the loop moves
# on to the next symbol.
print("üöÄ Fetching 2 years of data for Top 10 cryptocurrencies...")
print("   Expected: ~17,520 candles per symbol")
print("   Total: ~175,200 data points\n")

all_crypto_data = dict()

for symbol in TOP_10_SYMBOLS:
    try:
        df = fetch_historical_data(symbol, days=730, timeframe='1h')
        all_crypto_data[symbol] = df

        # Raw CSV is named after the lower-cased base asset, e.g. btc_2y_raw.csv
        crypto_name = symbol.replace('/USDT', '').lower()
        df.to_csv(
            '/content/drive/MyDrive/crypto_bot/data/' + crypto_name + '_2y_raw.csv',
            index=False,
        )

        # Brief pause before requesting the next symbol
        time.sleep(2)

    except Exception as e:
        print(f"‚ùå Failed to fetch {symbol}: {e}")

print(f"\n‚úÖ Successfully fetched {len(all_crypto_data)} cryptocurrencies")
print(f"   Total data points: {sum(map(len, all_crypto_data.values())):,}")

### CELL 4: Add Technical Indicators (All Symbols)

In [None]:
import pandas_ta as ta

def add_technical_indicators(df):
    """
    Add comprehensive technical indicators for ML features.

    Works on a copy of ``df``; the input frame is not modified. The input
    must provide 'open'/'high'/'low'/'close'/'volume' style OHLCV columns
    (this function reads 'high', 'low', 'close' and 'volume').

    Several features are ratios (Bollinger %B and width, volume ratio, ATR
    percent, price position). When their denominator is zero they evaluate
    to +/-inf, which ``dropna`` does not remove, so infinite values in numeric
    columns are converted to NaN first.

    Returns:
        A new DataFrame containing the original columns plus the indicator
        columns, with every row that has a NaN (or infinite) value in any
        column removed and the index reset. This includes the initial rows
        for which the longest rolling windows are not yet filled.
    """
    df = df.copy()

    # Momentum Indicators
    df['rsi'] = ta.rsi(df['close'], length=14)
    df['rsi_ma'] = df['rsi'].rolling(14).mean()
    df['rsi_std'] = df['rsi'].rolling(14).std()

    # MACD
    macd = ta.macd(df['close'], fast=12, slow=26, signal=9)
    df['macd'] = macd['MACD_12_26_9']
    df['macd_signal'] = macd['MACDs_12_26_9']
    df['macd_hist'] = macd['MACDh_12_26_9']

    # Moving Averages
    df['sma_7'] = ta.sma(df['close'], length=7)
    df['sma_20'] = ta.sma(df['close'], length=20)
    df['sma_50'] = ta.sma(df['close'], length=50)
    df['sma_100'] = ta.sma(df['close'], length=100)
    df['sma_200'] = ta.sma(df['close'], length=200)

    df['ema_12'] = ta.ema(df['close'], length=12)
    df['ema_26'] = ta.ema(df['close'], length=26)
    df['ema_50'] = ta.ema(df['close'], length=50)

    # Bollinger Bands
    # NOTE(review): the 'BBU_20_2.0'-style column names depend on the
    # installed pandas-ta version - confirm they match.
    bbands = ta.bbands(df['close'], length=20, std=2)
    df['bb_upper'] = bbands['BBU_20_2.0']
    df['bb_middle'] = bbands['BBM_20_2.0']
    df['bb_lower'] = bbands['BBL_20_2.0']
    df['bb_width'] = (df['bb_upper'] - df['bb_lower']) / df['bb_middle']
    df['bb_percent'] = (df['close'] - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'])

    # Volume indicators
    df['volume_sma'] = ta.sma(df['volume'], length=20)
    df['volume_ratio'] = df['volume'] / df['volume_sma']
    df['volume_std'] = df['volume'].rolling(20).std()

    # ATR (volatility)
    df['atr'] = ta.atr(df['high'], df['low'], df['close'], length=14)
    df['atr_percent'] = (df['atr'] / df['close']) * 100

    # Stochastic
    stoch = ta.stoch(df['high'], df['low'], df['close'], k=14, d=3)
    df['stoch_k'] = stoch['STOCHk_14_3_3']
    df['stoch_d'] = stoch['STOCHd_14_3_3']

    # ADX (trend strength)
    adx = ta.adx(df['high'], df['low'], df['close'], length=14)
    df['adx'] = adx['ADX_14']
    df['di_plus'] = adx['DMP_14']
    df['di_minus'] = adx['DMN_14']

    # Price momentum
    # NOTE(review): 'momentum' and 'rate_of_change' both look like a
    # 10-period percentage change - confirm whether both are wanted.
    df['momentum'] = df['close'].pct_change(periods=10) * 100
    df['rate_of_change'] = ta.roc(df['close'], length=10)

    # Price position relative to highs/lows
    df['high_20'] = df['high'].rolling(20).max()
    df['low_20'] = df['low'].rolling(20).min()
    df['price_position'] = (df['close'] - df['low_20']) / (df['high_20'] - df['low_20'])

    # Williams %R
    df['williams_r'] = ta.willr(df['high'], df['low'], df['close'], length=14)

    # CCI (Commodity Channel Index)
    df['cci'] = ta.cci(df['high'], df['low'], df['close'], length=20)

    # Returns
    df['returns_1h'] = df['close'].pct_change(1)
    df['returns_24h'] = df['close'].pct_change(24)
    df['returns_7d'] = df['close'].pct_change(168)  # 7 days * 24 hours

    # Volatility
    df['volatility_24h'] = df['returns_1h'].rolling(24).std()
    df['volatility_7d'] = df['returns_1h'].rolling(168).std()

    # Division by a zero denominator above yields +/-inf rather than NaN;
    # turn those into NaN so the dropna below removes them as well.
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    # Drop NaN rows
    df = df.dropna().reset_index(drop=True)

    return df

print("üîÑ Adding technical indicators to all symbols...\n")

processed_data = {}

for symbol, df in all_crypto_data.items():
    print(f"Processing {symbol}...")
    processed_df = add_technical_indicators(df)
    processed_data[symbol] = processed_df
    
    # Save processed data
    crypto_name = symbol.replace('/USDT', '').lower()
    processed_df.to_csv(
        f'/content/drive/MyDrive/crypto_bot/data/{crypto_name}_2y_processed.csv', 
        index=False
    )
    
    print(f"‚úÖ {symbol}: {len(processed_df.columns)} features, {len(processed_df)} rows")

# Combine all into one master dataset
master_df = pd.concat(processed_data.values(), ignore_index=True)
master_df.to_csv('/content/drive/MyDrive/crypto_bot/data/master_top10_2y.csv', index=False)

print(f"\n‚úÖ Master dataset created:")
print(f"   Total rows: {len(master_df):,}")
print(f"   Total features: {len(master_df.columns)}")
print(f"   Symbols: {master_df['symbol'].nunique()}")

### CELL 5: Generate Chart Images (50,000 Images)

In [None]:
import mplfinance as mpf
from PIL import Image
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend for faster generation
import matplotlib.pyplot as plt

def generate_chart_images_batch(df, symbol, num_images=5000, lookback=100):
    """
    Generate labelled candlestick chart images from randomly sampled rows.

    For each sampled row ``idx`` the chart shows the ``lookback`` rows before
    it (``idx`` itself is not drawn). The label is derived from the percentage
    change between the close at ``idx`` and the close 24 rows later (24 hours
    when the rows are contiguous hourly candles):

    - 0: Bearish (< -2%)
    - 1: Neutral (-2% to +2%)
    - 2: Bullish (> +2%)

    Args:
        df: Frame with 'timestamp', 'open', 'high', 'low', 'close' and
            'volume' columns.
        symbol: Name used for progress output and stored in the metadata.
        num_images: Requested number of images; reduced automatically when
            the frame does not contain enough candidate rows.
        lookback: Number of candles shown in each chart.

    Returns:
        Tuple ``(images, labels, metadata)``: a uint8 array of shape
        (N, 224, 224, 3), an integer array of N labels, and a list of N dicts
        describing each sample. N can be smaller than ``num_images`` (charts
        that fail to render are skipped and counted), including zero.
    """

    images = []
    labels = []
    metadata = []

    # Candidate rows are lookback <= idx < max_start, so the population has
    # max_start - lookback entries. Clamp the request to that size (never
    # below zero); sampling without replacement cannot draw more than that.
    max_start = len(df) - lookback - 24
    available = max(max_start - lookback, 0)
    num_images = max(min(num_images, available), 0)

    # Random sampling for diversity
    if num_images > 0:
        indices = np.random.choice(
            np.arange(lookback, max_start),
            size=num_images,
            replace=False
        )
    else:
        indices = np.array([], dtype=int)

    print(f"üìä Generating {num_images} images for {symbol}...")

    skipped = 0
    first_error = None

    for idx in tqdm(indices, desc=symbol):
        fig = None
        try:
            # Get chart window (the lookback rows immediately before idx)
            window = df.iloc[idx-lookback:idx]
            chart_data = window[['open', 'high', 'low', 'close', 'volume']].copy()
            chart_data.index = pd.DatetimeIndex(window['timestamp'])

            # Calculate future return
            current_price = df.iloc[idx]['close']
            future_price = df.iloc[idx + 24]['close']
            future_return = (future_price - current_price) / current_price * 100

            # Assign label (adjusted thresholds for more balance)
            if future_return < -2:
                label = 0  # Bearish
            elif future_return > 2:
                label = 2  # Bullish
            else:
                label = 1  # Neutral

            # Generate chart
            fig, axes = mpf.plot(
                chart_data,
                type='candle',
                style='charles',
                volume=True,
                mav=(20, 50),
                figsize=(6, 4),
                returnfig=True,
                tight_layout=True,
                warn_too_much_data=10000
            )

            # Convert to array. buffer_rgba() is used because
            # canvas.tostring_rgb() is not available in recent Matplotlib
            # releases; the alpha channel is dropped and the pixels are copied
            # so they do not reference the canvas after the figure is closed.
            fig.canvas.draw()
            rgba = np.asarray(fig.canvas.buffer_rgba())
            img = rgba[..., :3].copy()

            # Resize to 224x224
            img = Image.fromarray(img).resize((224, 224), Image.LANCZOS)
            img_array = np.array(img)

        except Exception as e:
            # Skip problematic charts, but keep count so that a systematic
            # failure is visible instead of silently producing no images.
            skipped += 1
            if first_error is None:
                first_error = e
            continue

        finally:
            # Always release the figure, including when rendering failed
            if fig is not None:
                plt.close(fig)

        images.append(img_array)
        labels.append(label)
        metadata.append({
            'symbol': symbol,
            'index': int(idx),
            'timestamp': str(df.iloc[idx]['timestamp']),
            'current_price': float(current_price),
            'future_return': float(future_return),
            'label': int(label)
        })

    if skipped:
        print(f"   Skipped {skipped} charts due to errors (first error: {first_error!r})")

    # Fixed trailing dimensions keep an empty result shaped (0, 224, 224, 3),
    # so it can still be concatenated with batches from other symbols.
    images = np.array(images, dtype=np.uint8).reshape(-1, 224, 224, 3)
    labels = np.array(labels, dtype=int)

    print(f"‚úÖ Generated {len(images)} images")
    print(f"   Distribution: Bearish={sum(labels==0)}, Neutral={sum(labels==1)}, Bullish={sum(labels==2)}")

    return images, labels, metadata

# Generate 5,000 images per symbol (50,000 total)
print("\nüé® Generating 50,000 chart images across all symbols...")
print("   This will take 30-45 minutes...\n")

all_images = []
all_labels = []
all_metadata = []

for symbol, df in processed_data.items():
    try:
        images, labels, metadata = generate_chart_images_batch(
            df,
            symbol,
            num_images=5000,
            lookback=100
        )
    except Exception as e:
        print(f"‚ùå Error generating charts for {symbol}: {e}")
        continue

    # A symbol that produced no images may come back as an empty array
    # without the (224, 224, 3) trailing dimensions; appending it would make
    # np.concatenate fail against the non-empty batches, so skip it.
    if len(images) == 0:
        print(f"   No images generated for {symbol}; skipping")
        continue

    all_images.append(images)
    all_labels.append(labels)
    all_metadata.extend(metadata)

# Fail with a clear message instead of a ZeroDivisionError in the percentage
# report below (or saving empty arrays) when nothing was generated at all.
if not all_images:
    raise RuntimeError("No chart images were generated for any symbol; nothing to save")

# Combine all images
# NOTE: np.concatenate builds a new array while the per-symbol arrays are
# still referenced, so peak memory is about twice the final dataset size
# (50,000 images * 224 * 224 * 3 bytes is roughly 7 GiB).
all_images = np.concatenate(all_images)
all_labels = np.concatenate(all_labels)

print(f"\n‚úÖ Total chart images generated: {len(all_images):,}")
print(f"   Shape: {all_images.shape}")
print(f"   File size: ~{all_images.nbytes / (1024**3):.2f} GB")
print(f"\nüìä Label Distribution:")
total_labels = len(all_labels)
for label_name, label_value in (("Bearish", 0), ("Neutral", 1), ("Bullish", 2)):
    label_count = int((all_labels == label_value).sum())
    print(f"   {label_name} ({label_value}): {label_count:,} ({label_count/total_labels*100:.1f}%)")

# Save to Drive
np.save('/content/drive/MyDrive/crypto_bot/data/chart_images_top10.npy', all_images)
np.save('/content/drive/MyDrive/crypto_bot/data/chart_labels_top10.npy', all_labels)

# Save metadata
import json
with open('/content/drive/MyDrive/crypto_bot/data/chart_metadata_top10.json', 'w') as f:
    json.dump(all_metadata, f, indent=2)

print("\n‚úÖ All images saved to Google Drive!")

### CELL 6: Data Statistics & Summary

In [None]:
# Final report: overall dataset sizes, then a per-symbol breakdown, then the
# list of files written to Drive by the earlier cells.
print("\n" + "="*60)
print("üìä DATA COLLECTION SUMMARY - TOP 10 CRYPTOCURRENCIES")
print("="*60)

# Tabular (OHLCV + indicator) data
print("\nüíæ OHLCV Data:")
print(f"   Symbols: {len(processed_data)}")
print(f"   Total rows: {master_df.shape[0]:,}")
print(f"   Features: {master_df.shape[1]}")
print(f"   Date range: {master_df['timestamp'].min()} to {master_df['timestamp'].max()}")

# Image data (nbytes converted to GiB)
print("\nüé® Chart Images:")
print(f"   Total images: {len(all_images):,}")
print(f"   Image shape: {all_images.shape[1:]}")
print(f"   Dataset size: {all_images.nbytes / (1024**3):.2f} GB")

# master_df stores the short name ('BTC'), while the image metadata stores
# the full market symbol ('BTC/USDT'), hence the two different match keys.
print("\nüìà Per-Symbol Breakdown:")
for symbol in TOP_10_SYMBOLS:
    crypto_name = symbol.replace('/USDT', '')
    symbol_rows = int((master_df['symbol'] == crypto_name).sum())
    symbol_images = len([m for m in all_metadata if m['symbol'] == symbol])
    print(f"   {crypto_name:6s}: {symbol_rows:,} rows, {symbol_images:,} images")

print("\n‚úÖ DATA PREPARATION COMPLETE!")
print("\nFiles saved to Google Drive:")
for saved_file_line in (
    "   ‚Ä¢ Individual CSVs: data/{symbol}_2y_processed.csv",
    "   ‚Ä¢ Master dataset: data/master_top10_2y.csv",
    "   ‚Ä¢ Chart images: data/chart_images_top10.npy",
    "   ‚Ä¢ Chart labels: data/chart_labels_top10.npy",
    "   ‚Ä¢ Metadata: data/chart_metadata_top10.json",
):
    print(saved_file_line)

print("\nüöÄ Next: Run notebook 02_train_lstm_top10.ipynb")