In [9]:
import os, pathlib
import pandas as pd
from dotenv import load_dotenv

import sys
sys.path.append("..")
from src.cleaning import preprocess_df



# The project root is taken to be the parent of the current working directory,
# i.e. the kernel is expected to start in a first-level subfolder of the project.
PROJECT_ROOT = pathlib.Path().resolve().parent
load_dotenv(PROJECT_ROOT / ".env")


def _dir_from_env(var_name):
    """Return PROJECT_ROOT joined with the path stored in env var ``var_name``.

    Args:
        var_name: Name of the environment variable holding a directory path
            (relative paths are resolved against PROJECT_ROOT).

    Returns:
        pathlib.Path: ``PROJECT_ROOT / <value of var_name>``.

    Raises:
        RuntimeError: If the variable is unset or empty. Without this check an
            unset variable fails later with an opaque ``TypeError`` from
            ``Path / None``, and an empty one silently resolves to PROJECT_ROOT.
    """
    value = os.getenv(var_name)
    if not value:
        raise RuntimeError(
            f"Environment variable {var_name!r} is not set; "
            f"define it in {PROJECT_ROOT / '.env'} or in the shell environment."
        )
    return PROJECT_ROOT / value


RAW_DIR = _dir_from_env("DATA_DIR_RAW")
PROC_DIR = _dir_from_env("DATA_DIR_PROCESSED")

# Create both directories up front so later cells can read/write without checks.
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

print("RAW_DIR:", RAW_DIR)
print("PROC_DIR:", PROC_DIR)

RAW_DIR: D:\文心远\研究生\5040-Bootcamp\project\data\raw
PROC_DIR: D:\文心远\研究生\5040-Bootcamp\project\data\processed


In [10]:
# Collect every raw MSFT snapshot in RAW_DIR. sorted() orders the paths
# lexicographically; the file names end in a date-time style stamp, so the last
# entry is presumably the most recent snapshot — TODO confirm naming scheme.
csv_pattern = "api_source-yfinance_symbol-MSFT_*.csv"
csv_files = sorted(RAW_DIR.glob(csv_pattern))
print("Stage4 CSV files found:", [f.name for f in csv_files])

# Fail with an actionable message instead of a bare IndexError on csv_files[-1].
if not csv_files:
    raise FileNotFoundError(
        f"No files matching {csv_pattern!r} in {RAW_DIR}; "
        "run the data-acquisition stage first."
    )

latest_csv = csv_files[-1]
# Parse the 'date' column into datetimes at load time.
df_raw = pd.read_csv(latest_csv, parse_dates=['date'])
print("Loaded Stage4 CSV:", latest_csv)
print(df_raw.head())

Stage4 CSV files found: ['api_source-yfinance_symbol-MSFT_20250821-215314.csv']
Loaded Stage4 CSV: D:\文心远\研究生\5040-Bootcamp\project\data\raw\api_source-yfinance_symbol-MSFT_20250821-215314.csv
        date        open        high         low       close   adj_close  \
0 2025-02-24  408.510010  409.369995  399.320007  404.000000  403.259674   
1 2025-02-25  401.100006  401.920013  396.700012  397.899994  397.170837   
2 2025-02-26  398.010010  403.600006  394.250000  399.730011  398.997498   
3 2025-02-27  401.269989  405.739990  392.170013  392.529999  391.810699   
4 2025-02-28  392.660004  397.630005  386.570007  396.989990  396.262512   

     volume  
0  26443700  
1  29387400  
2  19619000  
3  21127400  
4  32845700  


Cleaning

In [11]:
# Apply the project's cleaning routine (src.cleaning.preprocess_df) to the raw frame.
# NOTE(review): the recorded output shows `volume` as float64 and preview values
# between 0 and 1, so preprocess_df presumably rescales the numeric columns —
# confirm against src/cleaning.py.
df_cleaned = preprocess_df(df_raw)

# Sanity-check the result: dtypes and non-null counts first, then a preview of
# the first rows (the last expression is rendered as the cell's output).
df_cleaned.info()
df_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       125 non-null    datetime64[ns]
 1   open       125 non-null    float64       
 2   high       125 non-null    float64       
 3   low        125 non-null    float64       
 4   close      125 non-null    float64       
 5   adj_close  125 non-null    float64       
 6   volume     125 non-null    float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 7.0 KB


Unnamed: 0,date,open,high,low,close,adj_close,volume
0,2025-02-24,0.282016,0.235063,0.291433,0.273028,0.271554,0.309973
1,2025-02-25,0.245755,0.196052,0.27743,0.239342,0.238049,0.372483
2,2025-02-26,0.230634,0.204849,0.264336,0.249448,0.2481,0.165049
3,2025-02-27,0.246587,0.216055,0.25322,0.209686,0.208554,0.19708
4,2025-02-28,0.204453,0.173587,0.223291,0.234316,0.233051,0.445921


In [12]:
# Persist the cleaned frame to the processed-data directory; the DataFrame
# index is not written to the file.
clean_csv_path = PROC_DIR.joinpath("MSFT_preprocessed.csv")
df_cleaned.to_csv(path_or_buf=clean_csv_path, index=False)
print("Saved cleaned data:", clean_csv_path)

Saved cleaned data: D:\文心远\研究生\5040-Bootcamp\project\data\processed\MSFT_preprocessed.csv
