In [13]:
import pandas as pd
import numpy as np
import os

# Folder where raw CSVs are stored
RAW_PATH = "/Users/erion/Desktop/MSCF2/ADA/Capstone-Project/data/raw/"

datasets = {}

# Load all raw CSV files
for file in os.listdir(RAW_PATH):
    if file.endswith(".csv"):
        name = file.replace(".csv", "")
        # Skip the first 3 rows (header rows), set Date as index
        df = pd.read_csv(RAW_PATH + file, skiprows=3, index_col=0, parse_dates=True)
        # Set column names based on the structure (Close, High, Low, Open, Volume)
        df.columns = ['Close', 'High', 'Low', 'Open', 'Volume']
        df.index.name = 'Date'
        datasets[name] = df

# Show which datasets we loaded
print("Loaded datasets:", list(datasets.keys()))
print("\nDataset shapes:")
for name, df in datasets.items():
    print(f"{name}: {df.shape}")


Loaded datasets: ['gold', 'eurusd', 'treasury_10y', 'spy', 'vix', 'dxy', 'oil']

Dataset shapes:
gold: (5523, 5)
eurusd: (5471, 5)
treasury_10y: (5529, 5)
spy: (5535, 5)
vix: (5535, 5)
dxy: (5548, 5)
oil: (5527, 5)


In [14]:
# Standardize all datasets: keep only 'Close' price
for name, df in datasets.items():
    # All datasets have a 'Close' column after loading
    datasets[name] = df[['Close']].rename(columns={'Close': name})

# Show preview of the cleaned datasets
for name, df in datasets.items():
    print(f"{name}: {df.shape}")
    display(df.head())


gold: (5523, 1)


Unnamed: 0_level_0,gold
Date,Unnamed: 1_level_1
2003-01-03,351.200012
2003-01-06,351.700012
2003-01-07,347.299988
2003-01-08,353.899994
2003-01-09,353.299988


eurusd: (5471, 1)


Unnamed: 0_level_0,eurusd
Date,Unnamed: 1_level_1
2003-12-02,1.208897
2003-12-03,1.212298
2003-12-04,1.208094
2003-12-05,1.218695
2003-12-08,1.222001


treasury_10y: (5529, 1)


Unnamed: 0_level_0,treasury_10y
Date,Unnamed: 1_level_1
2003-01-03,4.038
2003-01-06,4.066
2003-01-07,4.025
2003-01-08,3.981
2003-01-09,4.148


spy: (5535, 1)


Unnamed: 0_level_0,spy
Date,Unnamed: 1_level_1
2003-01-03,60.004433
2003-01-06,61.062016
2003-01-07,60.910942
2003-01-08,60.030731
2003-01-09,60.963463


vix: (5535, 1)


Unnamed: 0_level_0,vix
Date,Unnamed: 1_level_1
2003-01-03,24.68
2003-01-06,24.91
2003-01-07,25.129999
2003-01-08,25.530001
2003-01-09,24.25


dxy: (5548, 1)


Unnamed: 0_level_0,dxy
Date,Unnamed: 1_level_1
2003-01-03,102.470001
2003-01-06,101.970001
2003-01-07,102.57
2003-01-08,101.870003
2003-01-09,101.940002


oil: (5527, 1)


Unnamed: 0_level_0,oil
Date,Unnamed: 1_level_1
2003-01-03,33.080002
2003-01-06,32.099998
2003-01-07,31.08
2003-01-08,30.559999
2003-01-09,31.99


In [15]:
# Start with gold as the base
merged = datasets['gold']

# Merge the other assets
for name, df in datasets.items():
    if name != 'gold':
        merged = merged.join(df, how='outer')

print("Merged dataset shape before cleaning:", merged.shape)
merged.head()

Merged dataset shape before cleaning: (5732, 7)
 (5732, 7)


Unnamed: 0_level_0,gold,eurusd,treasury_10y,spy,vix,dxy,oil
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-01-03,351.200012,,4.038,60.004433,24.68,102.470001,33.080002
2003-01-06,351.700012,,4.066,61.062016,24.91,101.970001,32.099998
2003-01-07,347.299988,,4.025,60.910942,25.129999,102.57,31.08
2003-01-08,353.899994,,3.981,60.030731,25.530001,101.870003,30.559999
2003-01-09,353.299988,,4.148,60.963463,24.25,101.940002,31.99


In [16]:
merged = merged.sort_index().ffill()
print("Merged dataset shape after forward-fill:", merged.shape)
merged.head()

Merged dataset shape after forward-fill: (5732, 7)


Unnamed: 0_level_0,gold,eurusd,treasury_10y,spy,vix,dxy,oil
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-01-03,351.200012,,4.038,60.004433,24.68,102.470001,33.080002
2003-01-06,351.700012,,4.066,61.062016,24.91,101.970001,32.099998
2003-01-07,347.299988,,4.025,60.910942,25.129999,102.57,31.08
2003-01-08,353.899994,,3.981,60.030731,25.530001,101.870003,30.559999
2003-01-09,353.299988,,4.148,60.963463,24.25,101.940002,31.99


In [17]:
merged_returns = merged.copy()

for col in merged_returns.columns:
    if col == 'treasury_10y':
        continue  # keep yield unchanged
    merged_returns[col] = merged_returns[col].pct_change()

merged_returns = merged_returns.dropna()
merged_returns.head()

Unnamed: 0_level_0,gold,eurusd,treasury_10y,spy,vix,dxy,oil
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-12-03,0.000495,0.002813,4.41,-0.001584,0.022127,-0.001562,0.010396
2003-12-04,-0.001486,-0.003467,4.369,0.004106,-0.019844,0.001117,0.005145
2003-12-05,0.007687,0.008775,4.215,-0.00697,0.048466,-0.005134,-0.016955
2003-12-08,0.000492,0.002713,4.278,0.006738,-0.032183,-0.004375,0.044582
2003-12-09,0.003935,0.00245,4.352,-0.007716,0.065901,-0.001577,-0.010592


In [18]:
processed_path = "../data/processed/merged_dataset.csv"
merged_returns.to_csv(processed_path)

print("Saved cleaned dataset to:", processed_path)

Saved cleaned dataset to: ../data/processed/merged_dataset.csv
