In [25]:
import os
import pathlib
import datetime as dt
import pandas as pd
from dotenv import load_dotenv
import numpy as np

# Load environment variables
load_dotenv()
DATA_DIR_RAW = pathlib.Path(os.getenv('DATA_DIR_RAW', 'data/raw'))
DATA_DIR_PROCESSED = pathlib.Path(os.getenv('DATA_DIR_PROCESSED', 'data/processed'))

import numpy as np
dates = pd.date_range('2025-08-01', periods=20, freq='D')
SYMBOL = 'IBM'
df = pd.DataFrame({'date': dates, 'ticker': [SYMBOL]*20, 'price': 150 + np.random.randn(20).cumsum()})
df.head()

# Ensure directories exist
DATA_DIR_RAW.mkdir(parents=True, exist_ok=True)
DATA_DIR_PROCESSED.mkdir(parents=True, exist_ok=True)

#shows dictionary final location
print('RAW ->', DATA_DIR_RAW.resolve())
print('PROC ->', DATA_DIR_PROCESSED.resolve())
#.resolve() converts it into an absolute path

# Save CSV - raw data, human-readable
def ts(): return dt.datetime.now().strftime('%Y%m%d-%H%M%S')
    #define ts() used for later
    #{} - replace variabled for title
    
csv_path = DATA_DIR_RAW / f"{SYMBOL}_prices_{ts()}.csv"
df.to_csv(csv_path, index=False)
print("Saved CSV →", csv_path)

# Save Parquet - processed data, efficient for analysis
parquet_path = DATA_DIR_PROCESSED / f"{SYMBOL}_{ts()}.parquet"
try:
    df.to_parquet(parquet_path)

except Exception as e:
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    parquet_path = None
parquet_path




RAW -> /Users/ruilinyang/bootcamp_Ruilin_Yang/homework/homework5/data/raw
PROC -> /Users/ruilinyang/bootcamp_Ruilin_Yang/homework/homework5/data/processed
Saved CSV → data/raw/IBM_prices_20250820-022315.csv


PosixPath('data/processed/IBM_20250820-022315.parquet')

In [12]:

import numpy as np
dates = pd.date_range('2025-08-01', periods=20, freq='D')
df = pd.DataFrame({'date': dates, 'ticker': ['IBM']*20, 'price': 150 + np.random.randn(20).cumsum()})
df.head()

Unnamed: 0,date,ticker,price
0,2025-08-01,IBM,148.484357
1,2025-08-02,IBM,148.728099
2,2025-08-03,IBM,146.764657
3,2025-08-04,IBM,145.804682
4,2025-08-05,IBM,144.991451


In [27]:
# Reload files
df_csv = pd.read_csv(csv_path)
df_parquet = pd.read_parquet(parquet_path)
print(df_csv)
print(df_parquet)

          date ticker       price
0   2025-08-01    IBM  150.165577
1   2025-08-02    IBM  150.991416
2   2025-08-03    IBM  151.148234
3   2025-08-04    IBM  150.933132
4   2025-08-05    IBM  149.069298
5   2025-08-06    IBM  148.839510
6   2025-08-07    IBM  147.459090
7   2025-08-08    IBM  146.459796
8   2025-08-09    IBM  145.925675
9   2025-08-10    IBM  146.161167
10  2025-08-11    IBM  145.982384
11  2025-08-12    IBM  145.015877
12  2025-08-13    IBM  145.353384
13  2025-08-14    IBM  144.620757
14  2025-08-15    IBM  143.653807
15  2025-08-16    IBM  142.246768
16  2025-08-17    IBM  143.318167
17  2025-08-18    IBM  143.104234
18  2025-08-19    IBM  144.275554
19  2025-08-20    IBM  143.452057
         date ticker       price
0  2025-08-01    IBM  150.165577
1  2025-08-02    IBM  150.991416
2  2025-08-03    IBM  151.148234
3  2025-08-04    IBM  150.933132
4  2025-08-05    IBM  149.069298
5  2025-08-06    IBM  148.839510
6  2025-08-07    IBM  147.459090
7  2025-08-08    IBM  

In [31]:
print(df_csv.dtypes)

date       object
ticker     object
price     float64
dtype: object


In [32]:
print(df_parquet.dtypes)

date      datetime64[ns]
ticker            object
price            float64
dtype: object


In [24]:
#Reloading and Validating

# Reload files
df_csv = pd.read_csv(csv_path)
df_parquet = pd.read_parquet(parquet_path)

# Validate 
def validate_df(df_csv, df_parquet, critical_columns=['date','ticker','price']):
    #csv = paraquet
    
    #Compares two objects using Python's default equality operator.
        # Equal = True

    shapes_match = df_csv.shape == df_parquet.shape
    
    dtypes_match = all(df_csv[col].dtype == df_parquet[col].dtype for col in critical_columns)
  
    return {"shapes_match": shapes_match, "dtypes_match": dtypes_match} #why aren't they matching?

validation_results = validate_df(df_csv, df_parquet)
print(validation_results)

#Validate 
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
print('CSV validation:', validate_df(df, df_csv))

if parquet_path.exists():
    try:
        df_parq = pd.read_parquet(parquet_path)
        print('Parquet validation:', validate_df(df, df_parq))
    except Exception as e:
        print('Parquet read failed:', e)
else:
    print('Parquet file does not exist.')


{'shapes_match': True, 'dtypes_match': False}
CSV validation: {'shapes_match': True, 'dtypes_match': True}
Parquet validation: {'shapes_match': True, 'dtypes_match': True}
