In [101]:
# Standard library
import os
import sys

# Third-party
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv

# Verify which interpreter (kernel) this notebook is running on.
print(sys.executable)

# Load environment variables (e.g. POLYGON_API_KEY, read later via os.getenv)
# from the local .env file. One call is enough; it was previously duplicated.
load_dotenv()

"""
Downloads daily price data for every day in the date range for the given stock symbol and saves it as a Parquet file in data/raw.
"""
def dowload_one_file_of_raw_data(symbol, start_date, end_date):
    """Download daily aggregate bars for one ticker from Polygon and save them as Parquet.

    Calls ``/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}``,
    converts the ``results`` list of the JSON response into a DataFrame indexed
    by ``Date`` (derived from the millisecond timestamp ``t``), prints it, and
    writes it to ``../data/raw/prices_{start_date}-{end_date}.parquet``.

    Args:
        symbol: Ticker symbol, e.g. ``'AAPL'``.
        start_date: Start of the range, interpolated verbatim into the URL
            (the example call uses ``'YYYY-MM-DD'`` strings).
        end_date: End of the range, same format as ``start_date``.

    Returns:
        None. If the response has no ``results``, a message is printed and
        nothing is written.

    Raises:
        RuntimeError: ``POLYGON_API_KEY`` is not set in the environment.
        requests.HTTPError: the API answered with a 4xx/5xx status.
        requests.RequestException: the request timed out or could not connect.
    """
    api_key = os.getenv('POLYGON_API_KEY')
    if not api_key:
        # Fail loudly instead of sending "apiKey=None" and reporting "No data".
        raise RuntimeError(
            "POLYGON_API_KEY is not set; define it in the environment or the .env file."
        )

    url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}"
    # Pass the key via `params` so it is URL-encoded and kept out of the URL
    # string; the timeout keeps a stalled connection from hanging the kernel.
    response = requests.get(url, params={'apiKey': api_key}, timeout=30)
    # Surface auth / rate-limit / server errors rather than treating them as empty data.
    response.raise_for_status()
    data = response.json()

    # Print keys to inspect the structure of the data
    print(dict(data).keys())

    # Extract the time series data; a missing or empty 'results' means nothing to save
    time_series = data.get('results', [])
    if not time_series:
        print("No data available.")
        return

    # Build the frame in one chain (no in-place mutation): derive the 'Date'
    # index from the millisecond timestamp 't', drop 't', and give the
    # single-letter columns readable names.
    df = (
        pd.DataFrame(time_series)
        .assign(Date=lambda frame: pd.to_datetime(frame['t'], unit='ms'))
        .set_index('Date')
        .drop(columns=['t'])
        .rename(columns={
            'o': 'open_price',
            'h': 'high_price',
            'l': 'low_price',
            'c': 'close_price',
            'v': 'volume',
            'vw': 'vw_avr_price'
        })
    )
    print(df)

    # NOTE: the file name does not include `symbol`, so two symbols downloaded
    # for the same date range overwrite each other. It is kept unchanged
    # because the validation cell reads this exact path.
    path = f'../data/raw/prices_{start_date}-{end_date}.parquet'
    # Make sure the target directory exists on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_parquet(path, index=True)

# Example usage: fetch AAPL daily bars for 2022-08-02 through 2024-08-02.
# The file this writes (../data/raw/prices_2022-08-02-2024-08-02.parquet)
# is the one the validation cell reads back.
dowload_one_file_of_raw_data(symbol='AAPL', start_date='2022-08-02', end_date='2024-08-02') 



/Users/pravachanpatra/Documents/PYTHON/AI_ML_DL/Stock_Price_Predictor/venv/bin/python
dict_keys(['ticker', 'queryCount', 'resultsCount', 'adjusted', 'results', 'status', 'request_id', 'count'])
                          volume  vw_avr_price  open_price  close_price  \
Date                                                                      
2022-08-04 04:00:00   55474144.0      165.5946     166.005       165.81   
2022-08-05 04:00:00   56696985.0      164.7432     163.210       165.35   
2022-08-08 04:00:00   60362338.0      165.8939     166.370       164.87   
2022-08-09 04:00:00   63075503.0      164.8395     164.020       164.92   
2022-08-10 04:00:00   70170540.0      168.3496     167.680       169.24   
...                          ...           ...         ...          ...   
2024-07-29 04:00:00   35153729.0      218.1319     216.960       218.24   
2024-07-30 04:00:00   40681625.0      218.4059     219.190       218.80   
2024-07-31 04:00:00   48422974.0      222.3441     221.4

In [104]:
# VALIDATION STEP: sanity-check the saved prices via per-column summary statistics
file_path = "../data/raw/prices_2022-08-02-2024-08-02.parquet"
prices = pd.read_parquet(file_path)

# (heading, column) pairs, listed in the order they are reported
stat_sections = [
    ("Volume", "volume"),
    ("VW", "vw_avr_price"),
    ("Open", "open_price"),
    ("Close", "close_price"),
    ("High", "high_price"),
    ("Low", "low_price"),
]

# One blank line + heading, then the describe() summary for each column
for heading, column in stat_sections:
    print(f"\n{heading} Statistics:")
    print(prices[column].describe())  # scroll output


Volume Statistics:
count    5.020000e+02
mean     6.507105e+07
std      2.392480e+07
min      2.401840e+07
25%      4.898946e+07
50%      5.883856e+07
75%      7.408942e+07
max      2.040182e+08
Name: volume, dtype: float64

VW Statistics:
count    502.000000
mean     173.132407
std       22.075242
min      125.725000
25%      155.074175
50%      174.013500
75%      188.095625
max      234.920200
Name: vw_avr_price, dtype: float64

Open Statistics:
count    502.000000
mean     173.046127
std       22.111834
min      126.010000
25%      154.795000
50%      173.495000
75%      187.925000
max      236.480000
Name: open_price, dtype: float64

Close Statistics:
count    502.000000
mean     173.155458
std       22.048707
min      125.020000
25%      155.077500
50%      173.750000
75%      188.032500
max      234.820000
Name: close_price, dtype: float64

High Statistics:
count    502.000000
mean     174.762807
std       22.052202
min      127.770000
25%      157.092500
50%      175.220000
75

  print("\Low Statistics:")
