In [5]:
import pandas as pd
import numpy as np

# Load raw CSVs (skipping first row artifact)
btc = pd.read_csv("../data/raw/btc_prices.csv", header=1)
eth = pd.read_csv("../data/raw/eth_prices.csv", header=1)
fg  = pd.read_csv("../data/raw/fear_greed_index.csv", index_col=0)

# Manually assign standard column names to fix "BTC-USD.1" issues
# Screenshot showed order: Date, Close, High, Low, Open, Volume
cols = ["Date", "Close", "High", "Low", "Open", "Volume"]
btc.columns = cols
eth.columns = cols

print("Raw shapes:", btc.shape, eth.shape, fg.shape)

Raw shapes: (1067, 6) (1067, 6) (2861, 3)


In [6]:
# Convert to datetime (coercing errors to NaT)
btc["Date"] = pd.to_datetime(btc["Date"], errors="coerce")
eth["Date"] = pd.to_datetime(eth["Date"], errors="coerce")
fg.index = pd.to_datetime(fg.index)

# Drop rows with invalid dates
btc = btc.dropna(subset=["Date"])
eth = eth.dropna(subset=["Date"])

# Set Index & Sort
btc = btc.set_index("Date").sort_index()
eth = eth.set_index("Date").sort_index()
fg = fg.sort_index()

# Normalize index (remove time components)
btc.index = btc.index.normalize()
eth.index = eth.index.normalize()
fg.index = fg.index.normalize()

print("Dates aligned and normalized.")
btc.head()


Dates aligned and normalized.


  btc["Date"] = pd.to_datetime(btc["Date"], errors="coerce")
  eth["Date"] = pd.to_datetime(eth["Date"], errors="coerce")


Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-02,16688.470703,16759.34375,16572.228516,16625.509766,12097780000.0
2023-01-03,16679.857422,16760.447266,16622.371094,16688.847656,13903080000.0
2023-01-04,16863.238281,16964.585938,16667.763672,16680.205078,18421740000.0
2023-01-05,16836.736328,16884.021484,16790.283203,16863.472656,13692760000.0
2023-01-06,16951.96875,16991.994141,16716.421875,16836.472656,14413660000.0


In [7]:
# Drop duplicates
btc = btc[~btc.index.duplicated(keep="first")]
eth = eth[~eth.index.duplicated(keep="first")]
fg  = fg[~fg.index.duplicated(keep="first")]

# Forward fill missing values
btc = btc.ffill()
eth = eth.ffill()
fg  = fg.ffill()

# Force "Close" column to numeric
btc["Close"] = pd.to_numeric(btc["Close"], errors="coerce")
eth["Close"] = pd.to_numeric(eth["Close"], errors="coerce")

# Clean Sentiment value
fg["FG_Value"] = pd.to_numeric(fg["value"], errors="coerce")

print("Duplicates removed and types fixed.")

Duplicates removed and types fixed.


In [8]:
# Daily Return
btc["BTC_Return"] = btc["Close"].pct_change()

# Moving Averages
btc["BTC_MA7"]  = btc["Close"].rolling(7).mean()
btc["BTC_MA30"] = btc["Close"].rolling(30).mean()

# Rolling Volatility (30 days)
btc["BTC_Vol30"] = btc["BTC_Return"].rolling(30).std()

print("BTC features created.")

BTC features created.


In [9]:
# Daily Return
eth["ETH_Return"] = eth["Close"].pct_change()

# Moving Averages
eth["ETH_MA7"]  = eth["Close"].rolling(7).mean()
eth["ETH_MA30"] = eth["Close"].rolling(30).mean()

# Rolling Volatility (30 days)
eth["ETH_Vol30"] = eth["ETH_Return"].rolling(30).std()

print("ETH features created.")


ETH features created.


In [10]:
# Select only the columns we need
btc_subset = btc[["Close", "Volume", "BTC_Return", "BTC_MA7", "BTC_MA30", "BTC_Vol30"]]
eth_subset = eth[["Close", "Volume", "ETH_Return", "ETH_MA7", "ETH_MA30", "ETH_Vol30"]]
fg_subset  = fg[["FG_Value", "value_classification"]]

# Inner Join: BTC + ETH + Sentiment
merged = btc_subset.join(
    eth_subset, lsuffix="_BTC", rsuffix="_ETH", how="inner"
).join(
    fg_subset, how="inner"
)

print(f"Merged shape: {merged.shape}")


Merged shape: (1065, 14)


In [11]:
# Drop NaN rows created by the rolling windows (first 30 days)
merged_clean = merged.dropna()

# Save to processed folder
merged_clean.to_csv("../data/processed/merged_clean.csv")

print(f" Final cleaned data saved. Shape: {merged_clean.shape}")
merged_clean.head()

 Final cleaned data saved. Shape: (1035, 14)


Unnamed: 0,Close_BTC,Volume_BTC,BTC_Return,BTC_MA7,BTC_MA30,BTC_Vol30,Close_ETH,Volume_ETH,ETH_Return,ETH_MA7,ETH_MA30,ETH_Vol30,FG_Value,value_classification
2023-02-01,23723.769531,26683260000.0,0.025259,23231.479074,20606.082031,0.023814,1641.792725,8116969000.0,0.034829,1602.215402,1490.054069,0.028151,56,Greed
2023-02-02,23471.871094,32066940000.0,-0.010618,23294.206752,20832.482487,0.024069,1643.241577,10558080000.0,0.000882,1607.949062,1504.336161,0.028141,60,Greed
2023-02-03,23449.322266,27083070000.0,-0.000961,23347.148717,21052.01862,0.02418,1664.745605,8169520000.0,0.013086,1617.461792,1517.943461,0.027785,60,Greed
2023-02-04,23331.847656,15639300000.0,-0.00501,23390.114118,21268.522331,0.024251,1667.059204,5843303000.0,0.00139,1630.979527,1531.830815,0.027695,58,Greed
2023-02-05,22955.666016,19564260000.0,-0.016123,23273.128348,21468.645573,0.024751,1631.645874,6926697000.0,-0.021243,1628.906703,1543.906376,0.028253,58,Greed
