In [147]:
# Standard library
import os
import sys

# Third-party
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv

# Verify kernel path: show which Python interpreter is running this notebook
print(sys.executable)

# Load variables from a local .env file (if any) into the process environment
load_dotenv()

"""
Downloads data for every day in the date range for the given stock symbol. Saves it as a parquet file in data/raw.
"""
def dowload_one_file_of_raw_data(symbol, start_date, end_date):
    """
    Download daily aggregate bars for one stock symbol and save them as parquet.

    Requests the 1-day aggregates between ``start_date`` and ``end_date`` from
    the Polygon REST API, renames the short column codes to descriptive names,
    prints the resulting frame and writes it to
    ``../data/raw/prices_{start_date}-{end_date}.parquet``.

    Args:
        symbol: Ticker symbol to download, e.g. ``'AAPL'``.
        start_date: First day of the range, e.g. ``'2022-08-02'``.
        end_date: Last day of the range, e.g. ``'2024-08-02'``.

    Returns:
        None. When the response contains no ``results`` a message is printed
        and no file is written.

    Raises:
        RuntimeError: If the ``POLYGON_API_KEY`` environment variable is unset.
        requests.RequestException: If the request times out, fails, or the API
            answers with an HTTP error status.
    """
    api_key = os.getenv('POLYGON_API_KEY')
    if not api_key:
        # Without this check the literal string "None" would be sent as the key
        # and the resulting API error would be reported as "No data available."
        raise RuntimeError(
            "POLYGON_API_KEY is not set; export it or add it to the .env file."
        )

    url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}"
    # Pass the key via params (keeps it out of the formatted URL string) and
    # bound the wait so a stalled connection cannot hang the notebook forever.
    response = requests.get(url, params={'apiKey': api_key}, timeout=30)
    # Surface auth / rate-limit / server errors instead of treating them as "no data"
    response.raise_for_status()
    data = response.json()

    # Print keys to inspect the structure of the data
    print(dict(data).keys())

    # Extract the time series data
    time_series = data.get('results', [])

    # Nothing to save when the API returned no bars
    if not time_series:
        print("No data available.")
        return

    # Build the frame in one chain: add a 'datetime' column converted from the
    # millisecond timestamp 't', drop 't', then give the columns readable names.
    df = (
        pd.DataFrame(time_series)
        .assign(datetime=lambda frame: pd.to_datetime(frame['t'], unit='ms'))
        .drop(columns=['t'])
        .rename(columns={
            'o': 'open_price',
            'h': 'high_price',
            'l': 'low_price',
            'c': 'close_price',
            'v': 'volume',
            'n': 'num_transactions',
            'vw': 'vw_avr_price'
        })
    )
    print(df)

    # NOTE: the file name does not include the symbol, so two symbols downloaded
    # for the same date range overwrite each other. The name is kept as-is
    # because later cells read this exact path.
    path = f'../data/raw/prices_{start_date}-{end_date}.parquet'
    # Create the target directory on a fresh checkout instead of failing on write
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_parquet(path, index=True)

# Download the raw data for Apple (AAPL) stock
dowload_one_file_of_raw_data(
    symbol='AAPL',
    start_date='2022-08-02',
    end_date='2024-08-02',
)



/Users/pravachanpatra/Documents/PYTHON/AI_ML_DL/Stock_Price_Predictor/venv/bin/python
dict_keys(['ticker', 'queryCount', 'resultsCount', 'adjusted', 'results', 'status', 'request_id', 'count'])
          volume  vw_avr_price  open_price  close_price  high_price  \
0     56696985.0      164.7432      163.21       165.35     165.850   
1     60362338.0      165.8939      166.37       164.87     167.810   
2     63075503.0      164.8395      164.02       164.92     165.820   
3     70170540.0      168.3496      167.68       169.24     169.340   
4     57142109.0      169.3737      170.06       168.49     170.990   
..           ...           ...         ...          ...         ...   
496   35153729.0      218.1319      216.96       218.24     219.300   
497   40681625.0      218.4059      219.19       218.80     220.325   
498   48422974.0      222.3441      221.44       222.08     223.820   
499   61125243.0      219.4773      224.37       218.36     224.480   
500  102635321.0      221

In [148]:
# VALIDATION STEP: load the raw prices and print summary statistics per column
file_path = "../data/raw/prices_2022-08-02-2024-08-02.parquet"
prices = pd.read_parquet(file_path)

# (header, column) pairs, reported in this order
stat_sections = [
    ("\nVolume Statistics:", "volume"),
    ("\nVW Statistics:", "vw_avr_price"),
    ("\nOpen Statistics:", "open_price"),
    ("\nClose Statistics:", "close_price"),
    ("\nHigh Statistics:", "high_price"),
    ("\nLow Statistics:", "low_price"),
]

for header, column in stat_sections:
    print(header)
    print(prices[column].describe())  # scroll output


Volume Statistics:
count    5.010000e+02
mean     6.509021e+07
std      2.394486e+07
min      2.401840e+07
25%      4.894314e+07
50%      5.895305e+07
75%      7.413980e+07
max      2.040182e+08
Name: volume, dtype: float64

VW Statistics:
count    501.000000
mean     173.147452
std       22.094729
min      125.725000
25%      155.012000
50%      174.136600
75%      188.148900
max      234.920200
Name: vw_avr_price, dtype: float64

Open Statistics:
count    501.000000
mean     173.060182
std       22.131691
min      126.010000
25%      154.785000
50%      173.620000
75%      187.930000
max      236.480000
Name: open_price, dtype: float64

Close Statistics:
count    501.000000
mean     173.170120
std       22.068295
min      125.020000
25%      155.000000
50%      173.750000
75%      188.040000
max      234.820000
Name: close_price, dtype: float64

High Statistics:
count    501.000000
mean     174.777922
std       22.071640
min      127.770000
25%      157.090000
50%      175.240000
75

In [149]:
# VALIDATION STEP: exclude data that is outside the specified date range.
# Both start_date and end_date are treated as inclusive calendar days.
start_date = pd.to_datetime("2022-08-02")  # YYYY-MM-DD
end_date = pd.to_datetime("2024-08-02")

# The bars carry a time of day (04:00 in the data shown), so comparing against
# end_date at midnight would drop the bar for end_date itself. Use the start of
# the following day as the exclusive upper bound instead.
in_range = (prices.datetime >= start_date) & (
    prices.datetime < end_date + pd.Timedelta(days=1)
)
prices = prices.loc[in_range]
prices

Unnamed: 0,volume,vw_avr_price,open_price,close_price,high_price,low_price,num_transactions,datetime
0,56696985.0,164.7432,163.21,165.35,165.850,163.00,491310,2022-08-05 04:00:00
1,60362338.0,165.8939,166.37,164.87,167.810,164.20,540017,2022-08-08 04:00:00
2,63075503.0,164.8395,164.02,164.92,165.820,163.25,480552,2022-08-09 04:00:00
3,70170540.0,168.3496,167.68,169.24,169.340,166.90,559789,2022-08-10 04:00:00
4,57142109.0,169.3737,170.06,168.49,170.990,168.19,507914,2022-08-11 04:00:00
...,...,...,...,...,...,...,...,...
495,39827645.0,217.8992,218.70,217.96,219.490,216.01,608504,2024-07-26 04:00:00
496,35153729.0,218.1319,216.96,218.24,219.300,215.75,604680,2024-07-29 04:00:00
497,40681625.0,218.4059,219.19,218.80,220.325,216.12,584305,2024-07-30 04:00:00
498,48422974.0,222.3441,221.44,222.08,223.820,220.63,668833,2024-07-31 04:00:00


In [151]:
# Write the current `prices` frame to the transformed-data directory
prices.to_parquet(
    path="../data/transformed/validated_prices_2022-08-02-2024-08-02.parquet"
)
