In [1]:
# 1. Imports
import pandas as pd
import os
import sys
from pathlib import Path

# Add project root (parent of notebooks/) to sys.path so that `src` is importable.
# NOTE: "../" is relative to the current working directory; this assumes the
# notebook is run from notebooks/.
# Resolve to an absolute path so the sys.path entry stays valid even if the
# working directory changes later in the session.
project_root = Path("../").resolve()  # one level up from notebooks/
# Re-running this cell must not pile up duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import cleaning functions
from src.cleaning import clean_stock_file

# 2. Point to raw data folder
raw_folder = Path("../data/raw")

# 3. Pick a sample file (for example, Microsoft/MSFT)
# glob() yields matches in no guaranteed order, so sort them to make the
# selected sample deterministic across runs and machines.
msft_files = sorted(raw_folder.glob("msft_historical_*.csv"))
if not msft_files:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No file matching 'msft_historical_*.csv' found in {raw_folder}"
    )
example_file = msft_files[0]
print("DEBUG: Using file ->", example_file)

# 4. Run the cleaning function
df_clean = clean_stock_file(example_file)

# 5. Inspect the cleaned data
print(df_clean.head())          # first 5 rows
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
df_clean.info()                 # column types
print(df_clean.describe())      # stats summary

# 6. Save cleaned version
processed_folder = Path("../data/processed")
processed_folder.mkdir(parents=True, exist_ok=True)
output_file = processed_folder / f"cleaned_{example_file.name}"
df_clean.to_csv(output_file, index=False)
print("SUCCESS: Clean file saved to:", output_file)

# 7. Batch clean all stock files (optional, for full dataset)
# Sort the matches so the row order of the combined file is reproducible;
# glob() itself gives no ordering guarantee.
all_files = sorted(raw_folder.glob("*_historical_*.csv"))
if not all_files:
    # pd.concat() raises an opaque ValueError on an empty list; fail early
    # with a message that says what is actually missing.
    raise FileNotFoundError(
        f"No file matching '*_historical_*.csv' found in {raw_folder}"
    )

# Clean each raw file, then stack the results into a single frame with a
# fresh 0..n-1 index.
dfs = [clean_stock_file(f) for f in all_files]
df_all = pd.concat(dfs, ignore_index=True)

df_all.to_csv(processed_folder / "all_stocks_cleaned.csv", index=False)
print("SUCCESS: All stocks cleaned and saved!")


DEBUG: Using file -> ../data/raw/msft_historical_2025-08-24_14-18-17.csv
        date       close        high         low        open    volume symbol
0 2020-08-25  207.556946  207.691181  204.325709  204.325709  23043700   MSFT
1 2020-08-26  212.044220  212.945518  208.410278  208.908872  39600800   MSFT
2 2020-08-27  217.250687  221.632512  210.366312  213.712618  57602200   MSFT
3 2020-08-28  219.484726  221.143490  217.250661  218.784773  26292900   MSFT
4 2020-08-31  216.243912  219.283387  215.074143  217.653386  28774200   MSFT
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255 entries, 0 to 1254
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1255 non-null   datetime64[ns]
 1   close   1255 non-null   float64       
 2   high    1255 non-null   float64       
 3   low     1255 non-null   float64       
 4   open    1255 non-null   float64       
 5   volume  1255 non-null   Int64         
 

  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
  df = pd.read_csv(p, parse_dates=parse_dates, infer_datetime_format=True)
