# Importing the Dependencies

In [1]:
pip install yfinance pandas numpy requests fredapi tqdm finnhub-python sec-api


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Axya Quant Platform - Data Pipeline v4.0 (TwelveData Optimized)
import datetime
import logging
import math
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from fredapi import Fred
from tqdm import tqdm

# --- Configuration ---
# API credentials are read from the environment so real keys never have to be
# pasted into (and saved with) the notebook. An unset variable falls back to
# the empty string, which is what the placeholders used to contain.
API_KEYS = {
    'twelvedata': os.environ.get('TWELVEDATA_API_KEY', ''),
    'fred': os.environ.get('FRED_API_KEY', '')
}

# Inclusive date window shared by the OHLCV and macro downloads.
DATE_RANGE = {
    'start': '2022-01-01',
    'end': datetime.datetime.today().strftime('%Y-%m-%d')
}

# All artefacts (parquet files, log, failed-ticker list) are written here.
DATA_PATH = os.path.abspath('./data')
os.makedirs(DATA_PATH, exist_ok=True)

# Configure logging: everything at INFO and above goes to data/pipeline.log,
# which is truncated on each fresh configuration (filemode='w').
logging.basicConfig(
    filename=os.path.join(DATA_PATH, 'pipeline.log'),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filemode='w'
)
logger = logging.getLogger()

# Mirror log records to the console, but only once: re-running this cell in a
# live kernel would otherwise stack another StreamHandler on the root logger
# and print every message twice. The exact type check is deliberate because
# FileHandler is a StreamHandler subclass.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

# --- Global Credit Tracker ---
class APICreditManager:
    """Thread-safe counter of API credits consumed during this run.

    Every request should be reported through ``increment``. Once usage reaches
    ``max_credits * safety_ratio`` the manager latches ``lock`` to True and
    raises ``SystemExit``. Every later call raises as well, so callers cannot
    keep spending credits unnoticed after the cutoff.

    Attributes:
        credits_used: Total credits recorded so far.
        max_credits: Credit budget (defaults to 800, the TwelveData daily limit).
        safety_ratio: Fraction of ``max_credits`` at which the cutoff triggers.
        lock: False while credits may still be spent, True after the cutoff.
    """

    def __init__(self, max_credits=800, safety_ratio=0.95):
        self.credits_used = 0
        self.max_credits = max_credits  # TwelveData daily limit
        self.safety_ratio = safety_ratio  # 95% safety cutoff by default
        self.lock = False
        # increment() is called from worker threads; this guards the
        # read-modify-write of credits_used and the latch.
        self._mutex = threading.Lock()

    def increment(self, count=1):
        """Record ``count`` consumed credits.

        Args:
            count: Number of credits to add (default 1).

        Raises:
            SystemExit: When this call reaches the safety cutoff, and on every
                call made after the cutoff has been reached.
        """
        with self._mutex:
            if self.lock:
                # Already past the cutoff: refuse instead of silently ignoring.
                raise SystemExit("API credit limit exceeded")
            self.credits_used += count
            if self.credits_used >= self.max_credits * self.safety_ratio:
                logger.critical(
                    f"API limit reached at {self.credits_used}/{self.max_credits} credits"
                )
                self.lock = True
                raise SystemExit("API credit limit exceeded")


credit_manager = APICreditManager()

# --- Ticker Management ---
def fetch_tickers_with_retry(url: str) -> list:
    """Download a newline-separated ticker list and keep the plausible symbols.

    The URL is requested up to three times with exponential backoff (1s, 2s)
    between attempts. No sleep follows the final failed attempt, since there
    is nothing left to wait for.

    Args:
        url: Location of a plain-text file with one ticker per line.

    Returns:
        Symbols (whitespace-stripped) that are purely alphabetic and 2 to 5
        characters long, in file order. An empty list if every attempt failed.
    """
    headers = {'User-Agent': 'Axya Quant Platform (research@axya.com)'}
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            # Strip each line once, then filter on the stripped value.
            symbols = (line.strip() for line in response.text.splitlines())
            return [s for s in symbols if 2 <= len(s) <= 5 and s.isalpha()]
        except Exception as e:
            logger.warning(f"Attempt {attempt+1} failed: {str(e)}")
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
    return []

def get_validated_tickers(limit: int = 500) -> list:
    """Collect a de-duplicated, alphabetically sorted ticker universe.

    The NASDAQ and NYSE symbol lists are fetched concurrently through
    ``fetch_tickers_with_retry``, merged, de-duplicated and sorted. The first
    ``limit`` symbols in alphabetical order are kept.

    Args:
        limit: Maximum number of tickers to return (default 500). Pass
            ``None`` to return every symbol found.

    Returns:
        Sorted list of at most ``limit`` unique ticker symbols. It may be
        shorter, or empty, if the sources could not be fetched.
    """
    sources = {
        'nasdaq': 'https://raw.githubusercontent.com/rreichel3/US-Stock-Symbols/main/nasdaq/nasdaq_tickers.txt',
        'nyse': 'https://raw.githubusercontent.com/rreichel3/US-Stock-Symbols/main/nyse/nyse_tickers.txt'
    }

    unique_tickers = set()
    with ThreadPoolExecutor(max_workers=len(sources)) as executor:
        futures = {executor.submit(fetch_tickers_with_retry, url): name for name, url in sources.items()}
        for future in as_completed(futures):
            found = future.result()
            if not found:
                # The fetch helper returns [] after exhausting its retries;
                # name the exchange so the gap in the universe is visible.
                logger.warning(f"No tickers retrieved from {futures[future]}")
            unique_tickers.update(found)

    valid_tickers = sorted(unique_tickers)[:limit]
    logger.info(f"Selected {len(valid_tickers)} tickers")
    return valid_tickers

# --- Optimized OHLCV Pipeline ---
class OHLCVDownloader:
    """Download daily OHLCV bars from the TwelveData ``time_series`` endpoint.

    ``fetch_ohlcv`` retrieves one ticker with throttling, retry and credit
    accounting. ``run`` fans the tickers out over a small thread pool, then
    writes the combined frame to ``ohlcv.parquet`` and the tickers that
    produced no data to ``failed_tickers.txt`` (both under ``DATA_PATH``).
    """

    # Columns of every non-empty frame returned by fetch_ohlcv(), in order.
    OUTPUT_COLUMNS = ['date', 'open', 'high', 'low', 'close', 'volume', 'ticker']
    # Fields converted to numeric dtypes before a frame is returned.
    NUMERIC_COLUMNS = ['open', 'high', 'low', 'close', 'volume']

    def __init__(self):
        self.base_url = "https://api.twelvedata.com/time_series"
        self.params = {
            'apikey': API_KEYS['twelvedata'],
            'interval': '1day',
            'outputsize': '5000',
            'timezone': 'America/New_York'
        }
        # Minimum gap, in seconds, since the most recently completed request.
        # The timestamp is shared by all workers, so the workers tend to fire
        # together once per gap (2 workers x 4 gaps/minute ~= 8 requests/minute).
        self.delay = 15
        self.max_retries = 2  # Reduced retries to conserve credits
        self.last_request = 0
        self.workers = 2

    def fetch_ohlcv(self, ticker: str) -> pd.DataFrame:
        """Fetch the daily bars of one ticker for ``DATE_RANGE``.

        Each HTTP request is reported to ``credit_manager``. HTTP 429 responses
        and exceptions are retried up to ``max_retries`` times.

        Args:
            ticker: Symbol to request.

        Returns:
            A frame with the columns ``OUTPUT_COLUMNS`` (``date`` as
            ``datetime.date``, prices and volume numeric). An empty DataFrame
            when no data could be obtained; ``None`` is never returned.

        Raises:
            SystemExit: Propagated from ``credit_manager.increment`` when the
                credit cutoff is reached.
        """
        params = {
            **self.params,
            'symbol': ticker,
            'start_date': DATE_RANGE['start'],
            'end_date': DATE_RANGE['end'],
        }

        for attempt in range(self.max_retries + 1):  # first try + retries
            # Do not spend further credits once the budget cutoff was hit.
            if credit_manager.lock:
                logger.warning(f"Skipping {ticker}: API credit limit reached")
                return pd.DataFrame()

            try:
                # Rate limit enforcement
                elapsed = time.time() - self.last_request
                if elapsed < self.delay:
                    time.sleep(self.delay - elapsed)

                response = requests.get(self.base_url, params=params, timeout=20)
                self.last_request = time.time()
                credit_manager.increment()

                # Handle rate limits
                if response.status_code == 429:
                    retry_after = int(response.headers.get('Retry-After', 60))
                    remaining = int(response.headers.get('X-Ratelimit-Requests-Remaining', 800))
                    logger.warning(f"Rate limited. Retry after {retry_after}s | Remaining: {remaining}")
                    time.sleep(retry_after + 5)
                    continue

                response.raise_for_status()
                data = response.json()

                if data.get('status') == 'ok' and 'values' in data:
                    df = pd.DataFrame(data['values']).rename(columns={'datetime': 'date'})
                    # Assumption: the API delivers prices and volume as strings.
                    # Convert them so the parquet file holds numeric columns.
                    df[self.NUMERIC_COLUMNS] = df[self.NUMERIC_COLUMNS].apply(
                        pd.to_numeric, errors='coerce'
                    )
                    df['ticker'] = ticker
                    df['date'] = pd.to_datetime(df['date']).dt.date
                    return df[self.OUTPUT_COLUMNS]

                logger.warning(f"No data for {ticker}: {data.get('message', 'Unknown error')}")
                return pd.DataFrame()

            except Exception as e:
                if attempt < self.max_retries:
                    logger.warning(f"Attempt {attempt+1} failed for {ticker}: {str(e)}")
                    time.sleep(2 ** attempt)
                else:
                    logger.error(f"Final failure for {ticker}")
                    return pd.DataFrame()

        # Only reached when the last allowed attempt was answered with 429.
        logger.error(f"Final failure for {ticker}")
        return pd.DataFrame()

    def run(self, tickers: list):
        """Download all ``tickers`` in parallel and persist the result.

        If the credit manager aborts the run with ``SystemExit``, tasks that
        have not started are cancelled and the frames collected so far are
        still written to disk. The ``SystemExit`` is re-raised afterwards.

        Args:
            tickers: Symbols to download.

        Returns:
            The combined frame restricted to ``DATE_RANGE`` with a fresh
            0..n-1 index (the same data that is written to ``ohlcv.parquet``),
            or an empty DataFrame when nothing was downloaded.

        Raises:
            SystemExit: Re-raised after saving partial results when the API
                credit cutoff was reached during the run.
        """
        results = []
        failed_tickers = []
        credit_stop = None

        with ThreadPoolExecutor(max_workers=self.workers) as executor:
            futures = {executor.submit(self.fetch_ohlcv, t): t for t in tickers}

            with tqdm(total=len(tickers), desc="Downloading OHLCV", unit="ticker") as pbar:
                for future in as_completed(futures):
                    ticker = futures[future]
                    pbar.update(1)

                    if future.cancelled():
                        # Cancelled below after the credit cutoff; never ran.
                        failed_tickers.append(ticker)
                        continue

                    try:
                        result = future.result()
                    except SystemExit as stop:
                        if credit_stop is None:
                            credit_stop = stop
                            # Drop everything still queued so that no further
                            # credits are spent; running tasks finish normally.
                            for pending in futures:
                                pending.cancel()
                        failed_tickers.append(ticker)
                        logger.warning(f"Failed: {ticker}")
                        continue

                    if not result.empty:
                        results.append(result)
                        logger.info(f"Success: {ticker} | Credits: {credit_manager.credits_used}/800")
                    else:
                        failed_tickers.append(ticker)
                        logger.warning(f"Failed: {ticker}")

        final_df = pd.DataFrame()
        if results:
            combined = pd.concat(results, ignore_index=True)
            start_date = pd.to_datetime(DATE_RANGE['start']).date()
            end_date = pd.to_datetime(DATE_RANGE['end']).date()

            in_range = (combined['date'] >= start_date) & (combined['date'] <= end_date)
            final_df = combined[in_range].reset_index(drop=True)

            final_df.to_parquet(os.path.join(DATA_PATH, 'ohlcv.parquet'))
            logger.info(f"Saved {len(final_df)} records | Failed: {len(failed_tickers)}")

        # Record failures even when no ticker succeeded at all.
        if failed_tickers:
            with open(os.path.join(DATA_PATH, 'failed_tickers.txt'), 'w') as f:
                f.write('\n'.join(failed_tickers))

        if credit_stop is not None:
            raise credit_stop
        return final_df

# --- Macro Data Pipeline ---
def fetch_macro_data():
    """Download the configured FRED series and store them as one table.

    VIXCLS, T10Y2Y and FEDFUNDS are requested from ``DATE_RANGE['start']``
    onwards and outer-joined on ``date``. A series that fails to download is
    logged and skipped. The merged table is restricted to ``DATE_RANGE``,
    sorted by date, forward-filled and written to ``macro_data.parquet``
    under ``DATA_PATH``.

    Returns:
        The merged frame (columns ``date``, ``vix``, ``yield_curve``,
        ``fed_rate`` for the series that succeeded). An empty DataFrame when
        nothing could be fetched or an unexpected error occurred.
    """
    try:
        fred = Fred(api_key=API_KEYS['fred'])
        series = {
            'VIXCLS': 'vix',
            'T10Y2Y': 'yield_curve',
            'FEDFUNDS': 'fed_rate'
        }

        macro_df = pd.DataFrame()
        for series_id in series:
            col_name = series[series_id]
            try:
                observations = fred.get_series(series_id, observation_start=DATE_RANGE['start'])
                series_frame = pd.DataFrame({'date': observations.index, col_name: observations.values})
                if macro_df.empty:
                    # Nothing usable collected yet: start from this series.
                    macro_df = series_frame
                else:
                    macro_df = macro_df.merge(series_frame, on='date', how='outer')
            except Exception as e:
                logger.error(f"FRED {series_id} failed: {str(e)}")

        # Without any rows there is nothing to filter or persist.
        if macro_df.empty:
            return macro_df

        macro_df['date'] = pd.to_datetime(macro_df['date']).dt.date
        lower_bound = pd.to_datetime(DATE_RANGE['start']).date()
        upper_bound = pd.to_datetime(DATE_RANGE['end']).date()

        within_range = (macro_df['date'] >= lower_bound) & (macro_df['date'] <= upper_bound)
        macro_df = macro_df[within_range].sort_values('date').ffill()

        output_file = os.path.join(DATA_PATH, 'macro_data.parquet')
        macro_df.to_parquet(output_file)
        logger.info(f"Saved {len(macro_df)} macro records")
        return macro_df
    except Exception as e:
        logger.error(f"Macro data failed: {str(e)}")
    return pd.DataFrame()

# --- Main Execution ---
if __name__ == "__main__":
    # Entry point: ticker universe -> OHLCV download -> macro download -> summary.
    logger.info("Starting Axya Data Pipeline v4.0")

    try:
        # Stage 1: build the ticker universe
        tickers = get_validated_tickers()
        logger.info(f"Processing {len(tickers)} tickers")

        # Stage 2: price history for every ticker
        ohlcv_data = OHLCVDownloader().run(tickers)

        # Stage 3: macro-economic series
        macro_data = fetch_macro_data()

        # Stage 4: summary, emitted one log record per line
        for report_line in (
            "\n=== Pipeline Report ===",
            f"OHLCV Records: {len(ohlcv_data) if not ohlcv_data.empty else 0}",
            f"Macro Data Points: {len(macro_data) if not macro_data.empty else 0}",
            f"API Credits Used: {credit_manager.credits_used}/800",
            f"Data saved to: {DATA_PATH}",
        ):
            logger.info(report_line)

    except Exception as e:
        # Log the failure, then let the exception surface in the notebook.
        # SystemExit from the credit manager is not an Exception subclass and
        # therefore bypasses this handler.
        logger.critical(f"Pipeline failed: {str(e)}")
        raise

Starting Axya Data Pipeline v4.0
Selected 500 tickers
Processing 500 tickers
Downloading OHLCV:   0%|                            | 0/500 [00:00<?, ?ticker/s]Success: AA | Credits: 1/800
Downloading OHLCV:   0%|                    | 1/500 [00:00<05:07,  1.62ticker/s]Success: AACB | Credits: 2/800
Downloading OHLCV:   0%|                    | 2/500 [00:01<04:31,  1.84ticker/s]Success: AACBR | Credits: 3/800
Downloading OHLCV:   1%|                  | 3/500 [00:16<1:00:40,  7.33s/ticker]Success: AACBU | Credits: 4/800
Downloading OHLCV:   1%|▏                   | 4/500 [00:17<38:31,  4.66s/ticker]Success: AACG | Credits: 5/800
Downloading OHLCV:   1%|▏                 | 5/500 [00:32<1:09:40,  8.45s/ticker]Success: AACT | Credits: 6/800
Downloading OHLCV:   1%|▏                   | 6/500 [00:32<47:02,  5.71s/ticker]Success: AAL | Credits: 7/800
Downloading OHLCV:   1%|▎                 | 7/500 [00:48<1:13:40,  8.97s/ticker]Success: AAM | Credits: 8/800
Downloading OHLCV:   2%|▎            