In [46]:
from pathlib import Path
import os
from dotenv import load_dotenv
import yfinance as yf
import datetime as dt
import pandas as pd

load_dotenv()
RAW = Path(os.getenv('DATA_DIR_RAW', 'data/raw'))
PROC = Path(os.getenv('DATA_DIR_PROCESSED', 'data/processed'))
RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)

In [47]:
#reusable function
def time_stamp():
    return dt.datetime.now().strftime("%Y%m%d-%H%M%S")
def validate_loaded(original, reloaded):
    checks = {
        'shape_equal': original.shape == reloaded.shape,
        'date_is_datetime': pd.api.types.is_datetime64_any_dtype(reloaded['date']) if 'date' in reloaded.columns else False,
        'close_is_numeric': pd.api.types.is_numeric_dtype(reloaded['close']) if 'close' in reloaded.columns else False,
        'open_is_numeric': pd.api.types.is_numeric_dtype(reloaded['open']) if 'open' in reloaded.columns else False,
        'high_is_numeric': pd.api.types.is_numeric_dtype(reloaded['high']) if 'high' in reloaded.columns else False,
        'low_is_numeric': pd.api.types.is_numeric_dtype(reloaded['close']) if 'low' in reloaded.columns else False,
        'volume_is_numeric': pd.api.types.is_numeric_dtype(reloaded['volume']) if 'volume' in reloaded.columns else False,
        'adjclose_is_numeric': pd.api.types.is_numeric_dtype(reloaded['close']) if 'adj close' in reloaded.columns else False,
    }
    return checks

In [48]:
#loading dataframe
symbol = "NVDA"
df_api = yf.download(tickers=symbol,period="6mo",interval="1d",auto_adjust=False)
if isinstance(df_api.columns, pd.MultiIndex):
    df_api.columns = df_api.columns.droplevel(1)
df_api = df_api.reset_index()
df_api.rename(columns={"Date":"date","Open":"open","High":"high","Low":"low","Close":"close","Volume":"volume","Adj Close":"adj close"},inplace=True)
df_api

[*********************100%***********************]  1 of 1 completed


Price,date,adj close,close,high,low,open,volume
0,2025-02-18,139.377502,139.399994,143.440002,137.929993,141.270004,219176600
1,2025-02-19,139.207520,139.229996,141.360001,137.220001,139.509995,167536000
2,2025-02-20,140.087387,140.110001,140.660004,136.789993,140.029999,143903600
3,2025-02-21,134.408295,134.429993,141.460007,134.029999,140.039993,228217600
4,2025-02-24,130.258972,130.279999,138.589996,130.080002,136.559998,251381100
...,...,...,...,...,...,...,...
120,2025-08-11,182.059998,182.059998,183.839996,180.250000,182.050003,138323200
121,2025-08-12,183.160004,183.160004,184.479996,179.460007,182.960007,145485700
122,2025-08-13,181.589996,181.589996,183.970001,179.350006,182.619995,179871700
123,2025-08-14,182.020004,182.020004,183.020004,179.460007,179.750000,129554000


In [49]:
#save dataframe to data/raw as csv
csv_path = RAW/f"NVDA_Stock_{time_stamp()}.csv"
df_api.to_csv(csv_path,index=False)

In [50]:
#save dataframe to data/processed as parquet
par_path = PROC/f"NVDA_Stock_{time_stamp()}.parquet"
df_api.to_parquet(par_path)

In [54]:
#reload and validate csv file
df_csv = pd.read_csv(csv_path)
validate_loaded(df_api,df_csv)

{'shape_equal': True,
 'date_is_datetime': False,
 'close_is_numeric': True,
 'open_is_numeric': True,
 'high_is_numeric': True,
 'low_is_numeric': True,
 'volume_is_numeric': True,
 'adjclose_is_numeric': True}

In [55]:
#reload and validate parquet file
df_pq = pd.read_parquet(par_path)
validate_loaded(df_api,df_pq)

{'shape_equal': True,
 'date_is_datetime': True,
 'close_is_numeric': True,
 'open_is_numeric': True,
 'high_is_numeric': True,
 'low_is_numeric': True,
 'volume_is_numeric': True,
 'adjclose_is_numeric': True}