In [8]:
import yfinance as yf
import pandas as pd
from datetime import datetime

start_date = "2025-04-01"
end_date = "2025-04-30"

# Define each file with its ticker and required columns
datasets = [
    {
        'filename': 'data/bitcoin.csv',
        'ticker': 'BTC-USD',
        'columns': ['timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
    },
    {
        'filename': 'data/copper.csv',
        'ticker': 'HG=F',
        'columns': ['Price', 'copper_price']
    },
    {
        'filename': 'data/gold.csv',
        'ticker': 'GC=F',
        'columns': ['timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
    },
    {
        'filename': 'data/sp500.csv',
        'ticker': '^GSPC',
        'columns': ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    },
    {
        'filename': 'data/MOEX.csv',
        'ticker': 'IMOEX.ME',
        'columns': ['timestamp', 'Close', 'Open', 'High', 'Low']
    },
    {
        'filename': 'data/SSE.csv',
        'ticker': '000001.SS',
        'columns': ['timestamp', 'Close', 'Open', 'High', 'Low']
    },
    {
        'filename': 'data/oil.csv',
        'ticker': 'CL=F',
        'columns': ['timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
    },
    {
        'filename': 'data/google_trends.csv',
        'ticker': None,
        'columns': ['timestamp', 'sp500', 'SPX', 'index fund', 'ETF']
    }
]

for dataset in datasets:
    print(f"Processing {dataset['filename']}...")

    if dataset['ticker']:
        df = yf.download(dataset['ticker'], start=start_date, end=end_date)
        df.reset_index(inplace=True)

        # Rename columns depending on the dataset
        rename_dict = {'Date': 'timestamp'}
        if dataset['filename'].endswith('sp500.csv'):
            rename_dict.update({
                'Open': 'open',
                'High': 'high',
                'Low': 'low',
                'Close': 'close',
                'Volume': 'volume'
            })

        df = df.rename(columns=rename_dict)

        # Special handling for copper.csv
        if dataset['filename'].endswith('copper.csv'):
            df['Price'] = df['Close']
            df['copper_price'] = df['Close']
            df = df[dataset['columns']]

        # Handling for MOEX or SSE
        elif dataset['filename'].endswith('MOEX.csv') or dataset['filename'].endswith('SSE.csv'):
            df = df[dataset['columns']]

        # Default handling
        else:
            df = df[dataset['columns']]

        df.to_csv(dataset['filename'], index=False)
        print(f"✅ Saved {dataset['filename']}")

    else:
        print("⚠️ No ticker for google_trends.csv → creating empty file.")
        df = pd.DataFrame(columns=dataset['columns'])
        df.to_csv(dataset['filename'], index=False)

print("🎉 All datasets processed and saved.")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Processing data/bitcoin.csv...
✅ Saved data/bitcoin.csv
Processing data/copper.csv...
✅ Saved data/copper.csv
Processing data/gold.csv...
✅ Saved data/gold.csv
Processing data/sp500.csv...
✅ Saved data/sp500.csv
Processing data/MOEX.csv...



[*********************100%***********************]  1 of 1 completed

1 Failed download:
['IMOEX.ME']: YFPricesMissingError('possibly delisted; no price data found  (1d 2025-04-01 -> 2025-04-30)')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


✅ Saved data/MOEX.csv
Processing data/SSE.csv...
✅ Saved data/SSE.csv
Processing data/oil.csv...
✅ Saved data/oil.csv
Processing data/google_trends.csv...
⚠️ No ticker for google_trends.csv → creating empty file.
🎉 All datasets processed and saved.


In [14]:
import pandas as pd

files_to_fix = [
    {
        'filename': 'data/treasury_3m.csv',
        'rename_map': {'date': 'timestamp', 'close': 'treasury3m'},
        'columns': ['timestamp', 'treasury3m']
    },
    {
        'filename': 'data/treasury_10y.csv',
        'rename_map': {'date': 'timestamp', 'close': 'treasury10y'},
        'columns': ['timestamp', 'treasury10y']
    },
    {
        'filename': 'data/spgsci.csv',
        'rename_map': {'date': 'timestamp', 'close': 'Close'},
        'columns': ['timestamp', 'Close']
    }
]

for file in files_to_fix:
    print(f"Fixing {file['filename']}...")

    df = pd.read_csv(file['filename'], index_col=False)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    
    print("Original columns:", df.columns.tolist())
    
    # Clean headers
    df.columns = df.columns.str.strip().str.lower()  # lowercase everything for matching
    print("Cleaned columns:", df.columns.tolist())
    
    # Rename columns
    df = df.rename(columns=file['rename_map'])
    print("After renaming columns:", df.columns.tolist())
    
    # Check for missing columns
    missing_cols = [col for col in file['columns'] if col not in df.columns]
    if missing_cols:
        print(f"❌ Missing columns: {missing_cols}")
        continue
    
    # Keep only the desired columns
    df = df[file['columns']]
    
    # Save the cleaned CSV
    df.to_csv(file['filename'], index=False)
    print(f"✅ Saved {file['filename']} with corrected headers.")

print("🎉 Header corrections complete.")




Fixing data/treasury_3m.csv...
Original columns: ['timestamp', 'Close']
Cleaned columns: ['timestamp', 'close']
After renaming columns: ['timestamp', 'treasury3m']
✅ Saved data/treasury_3m.csv with corrected headers.
Fixing data/treasury_10y.csv...
Original columns: ['timestamp', 'Close']
Cleaned columns: ['timestamp', 'close']
After renaming columns: ['timestamp', 'treasury10y']
✅ Saved data/treasury_10y.csv with corrected headers.
Fixing data/spgsci.csv...
Original columns: ['date', 'open', 'high', 'low', 'close', 'volume']
Cleaned columns: ['date', 'open', 'high', 'low', 'close', 'volume']
After renaming columns: ['timestamp', 'open', 'high', 'low', 'Close', 'volume']
✅ Saved data/spgsci.csv with corrected headers.
🎉 Header corrections complete.
