<a href="https://colab.research.google.com/github/SiMori92/pre-market-DL/blob/main/Step_0_Data_Sourcing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install alpha-vantage

Collecting alpha-vantage
  Downloading alpha_vantage-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading alpha_vantage-3.0.0-py3-none-any.whl (35 kB)
Installing collected packages: alpha-vantage
Successfully installed alpha-vantage-3.0.0


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import uuid
from alpha_vantage.timeseries import TimeSeries
import time

# Clock times (HH:MM) in the opening half hour at which a close price and a
# volume are sampled.
_OPENING_CHECKPOINTS = ('09:31', '09:35', '09:40', '09:45', '09:50', '09:55', '10:00')

# (column suffix, first minute, last minute) of each high/low window.
# ``between_time`` includes both end points.
_OPENING_WINDOWS = (
    ('0930-0931', '09:30', '09:31'),
    ('0931-0935', '09:31', '09:35'),
    ('0936-0940', '09:36', '09:40'),
    ('0941-0945', '09:41', '09:45'),
    ('0946-0950', '09:46', '09:50'),
    ('0951-0955', '09:51', '09:55'),
    ('0956-1000', '09:56', '10:00'),
)


def _previous_session_fields(daily_data, current_date):
    """Return the T-1 fields from the last daily bar dated strictly before ``current_date``."""
    earlier = daily_data.loc[daily_data.index < current_date]
    if earlier.empty:
        return {}
    last_bar = earlier.iloc[-1]
    return {
        'T-1 Close Price': last_bar['close'],
        'T-1 High Price': last_bar['high'],
        'T-1 Low Price': last_bar['low'],
        'T-1 Volume': last_bar['volume'],
    }


def _load_intraday_slice(client, ticker, slice_name):
    """Request one 1-minute intraday slice and return it indexed by ascending timestamp."""
    raw, _ = client.get_intraday_extended(
        symbol=ticker,
        interval='1min',
        slice=slice_name,
        adjusted=True
    )
    # Work on a copy so the object handed back by the client is not mutated.
    frame = raw.copy()
    frame.columns = ['time', 'open', 'high', 'low', 'close', 'volume']
    frame['time'] = pd.to_datetime(frame['time'])
    # The first/last-bar lookups below rely on chronological order.
    return frame.set_index('time').sort_index()


def _premarket_fields(day_bars, date_str):
    """Summarise the 04:00-09:29 bars of one day; returns only the fields that could be computed."""
    pre_market = day_bars.between_time('04:00', '09:29')
    if pre_market.empty:
        return {}
    fields = {
        'Pre-market/ Futures Open Price': pre_market.iloc[0]['open'],
        'Pre-market/ Futures Close Price': pre_market.iloc[-1]['close'],
        'Pre-market/ Futures High Price': pre_market['high'].max(),
        'Pre-market/ Futures Low Price': pre_market['low'].min(),
        'Pre-market/ Futures Volume': pre_market['volume'].sum(),
    }
    at_0800 = pd.Timestamp(f"{date_str} 08:00:00")
    if at_0800 in pre_market.index:
        fields['Pre-market/ Futures Price at 08:00'] = pre_market.loc[at_0800, 'close']
        fields['Pre-market/ Futures Volume at 08:00'] = pre_market.loc[at_0800, 'volume']
    return fields


def _opening_fields(day_bars, date_str):
    """Summarise the 09:30-10:00 bars of one day; returns only the fields that could be computed."""
    market_data = day_bars.between_time('09:30', '10:00')
    if market_data.empty:
        return {}
    fields = {'Price at market open': market_data.iloc[0]['open']}

    for clock in _OPENING_CHECKPOINTS:
        # Column names carry no leading zero ('Price at 9:31'), while the
        # timestamp lookup needs the zero-padded form ('09:31').
        label = clock.lstrip('0')
        stamp = pd.Timestamp(f"{date_str} {clock}:00")
        if stamp in market_data.index:
            fields[f'Price at {label}'] = market_data.loc[stamp, 'close']
            fields[f'Volume at {label}'] = market_data.loc[stamp, 'volume']

    for suffix, first_minute, last_minute in _OPENING_WINDOWS:
        window = market_data.between_time(first_minute, last_minute)
        if not window.empty:
            fields[f'High Price {suffix}'] = window['high'].max()
            fields[f'Low Price {suffix}'] = window['low'].min()
    return fields


def fetch_alpha_vantage_data(ticker, start_date, end_date, api_key, client=None, pause_seconds=12):
    """Build one row per calendar day with T-1, pre-market and opening-half-hour fields.

    Parameters
    ----------
    ticker : str
        Symbol passed to the Alpha Vantage client and written to the 'Ticker' column.
    start_date, end_date : str
        Inclusive date range in ``'%Y-%m-%d'`` format. Every calendar day in the
        range yields a row; days without intraday bars keep NaN in the
        intraday columns.
    api_key : str
        Key used to construct the ``TimeSeries`` client when ``client`` is None.
    client : object, optional
        Pre-built client exposing ``get_daily_adjusted`` and
        ``get_intraday_extended``. Mainly useful for offline testing.
    pause_seconds : float, optional
        Sleep applied after each intraday request. The default of 12 s is the
        original author's setting for a 5 requests/minute limit.

    Returns
    -------
    pandas.DataFrame
        One row per day with the columns listed in ``columns`` below. Fields
        that cannot be computed are NaN.

    Notes
    -----
    * T-1 values come from the most recent daily bar strictly before the row's
      date, so Mondays and days after holidays use the previous session.
    * Each intraday slice is requested once and reused for all days that map
      to it.
    * A failure while fetching or processing intraday data is reported with
      ``print`` and leaves that day's intraday fields as NaN.
    """
    # Use a dedicated name for the API client: the original reused ``ts`` for a
    # timestamp inside the loop, which broke every request after the first day.
    if client is None:
        client = TimeSeries(key=api_key, output_format='pandas')
    columns = [
        'ID', 'Date', 'Ticker', 'T-1 Close Price', 'T-1 High Price', 'T-1 Low Price', 'T-1 Volume',
        'Pre-market/ Futures Open Price', 'Pre-market/ Futures Price at 08:00',
        'Pre-market/ Futures Volume at 08:00', 'Pre-market/ Futures Close Price',
        'Pre-market/ Futures High Price', 'Pre-market/ Futures Low Price', 'Pre-market/ Futures Volume',
        'Price at market open',
        'High Price 0930-0931', 'Low Price 0930-0931', 'Price at 9:31',
        'High Price 0931-0935', 'Low Price 0931-0935', 'Price at 9:35',
        'High Price 0936-0940', 'Low Price 0936-0940', 'Price at 9:40',
        'High Price 0941-0945', 'Low Price 0941-0945', 'Price at 9:45',
        'High Price 0946-0950', 'Low Price 0946-0950', 'Price at 9:50',
        'High Price 0951-0955', 'Low Price 0951-0955', 'Price at 9:55',
        'High Price 0956-1000', 'Low Price 0956-1000', 'Price at 10:00',
        'Volume at 9:31', 'Volume at 9:35', 'Volume at 9:40', 'Volume at 9:45',
        'Volume at 9:50', 'Volume at 9:55', 'Volume at 10:00'
    ]

    # Daily bars for the T-1 fields, sorted so "last bar before date" is iloc[-1].
    daily_raw, _ = client.get_daily_adjusted(symbol=ticker, outputsize='full')
    daily_data = daily_raw.copy()
    daily_data.columns = ['open', 'high', 'low', 'close', 'adjusted_close', 'volume', 'dividend', 'split']
    daily_data.index = pd.to_datetime(daily_data.index)
    daily_data = daily_data.sort_index()

    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')

    slice_cache = {}  # slice name -> prepared intraday frame
    records = []
    current_date = start
    while current_date <= end:
        date_str = current_date.strftime('%Y-%m-%d')

        # Start from an all-NaN row so every branch only fills what it found.
        record = dict.fromkeys(columns, np.nan)
        record.update({'ID': str(uuid.uuid4()), 'Date': date_str, 'Ticker': ticker})
        record.update(_previous_session_fields(daily_data, current_date))

        # NOTE(review): formula kept exactly as the original author wrote it.
        # Alpha Vantage slice names appear to be counted backwards from the
        # request date rather than tied to calendar months - verify that this
        # mapping selects the intended window.
        slice_name = f'year{(current_date.year-2016)//2+1}month{(current_date.month-1)%12+1}'

        made_request = False
        try:
            intraday = slice_cache.get(slice_name)
            if intraday is None:
                made_request = True
                intraday = _load_intraday_slice(client, ticker, slice_name)
                slice_cache[slice_name] = intraday

            day_bars = intraday[intraday.index.normalize() == pd.Timestamp(date_str)]
            if not day_bars.empty:
                # Collect first, then merge, so a failure half-way through
                # leaves the whole intraday part of the row as NaN.
                intraday_fields = {}
                intraday_fields.update(_premarket_fields(day_bars, date_str))
                intraday_fields.update(_opening_fields(day_bars, date_str))
                record.update(intraday_fields)
        except Exception as exc:
            # Best effort: keep the row (with NaN intraday fields) but say why.
            print(f"Warning: intraday data unavailable for {ticker} on {date_str}: {exc!r}")

        records.append(record)
        current_date += timedelta(days=1)

        # Only throttle when an API call was actually made for this day.
        if made_request and pause_seconds > 0:
            time.sleep(pause_seconds)

    return pd.DataFrame(records, columns=columns)

def main():
    """Prompt for a ticker, obtain the API key securely, fetch May 2025 data and write it to CSV.

    The ticker defaults to TQQQ when the prompt is left empty. The API key is
    read from the ``ALPHAVANTAGE_API_KEY`` environment variable, falling back
    to a hidden ``getpass`` prompt, so the key is never stored in the
    notebook source or echoed in its output.

    Raises
    ------
    ValueError
        If no API key is supplied.
    """
    # Local imports keep this block self-contained.
    import getpass
    import os

    # Configurable parameters
    default_ticker = 'TQQQ'
    typed_ticker = input(f"Ticker symbol [{default_ticker}]: ").strip()
    ticker = (typed_ticker or default_ticker).upper()

    api_key = os.environ.get('ALPHAVANTAGE_API_KEY', '').strip()
    if not api_key:
        api_key = getpass.getpass("Alpha Vantage API key: ").strip()
    if not api_key:
        raise ValueError("An Alpha Vantage API key is required.")

    start_date = '2025-05-01'
    end_date = '2025-05-31'

    # Fetch and save data
    df = fetch_alpha_vantage_data(ticker, start_date, end_date, api_key)
    df.to_csv(f'{ticker}_finance_data.csv', index=False)
    print(f"Data saved to {ticker}_finance_data.csv")

if __name__ == "__main__":
    main()