# Equity Prices
stock prices of 5 major energy firms - BP, CVX, VLO, SHEL, and XOM from 2021-2025

In [54]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import yfinance as yf
import time

# ---------------------------------------------------------
# DIRECTORIES (UPDATED)
# ---------------------------------------------------------
BASE_DIR = Path(r"D:\MS_Data_Science_Thesis\Data_Extraction")
RAW_DATA_DIR = BASE_DIR / "Raw_Data_Folder"
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

print("Outputs will be saved to:", RAW_DATA_DIR)


Outputs will be saved to: D:\MS_Data_Science_Thesis\Data_Extraction\Raw_Data_Folder


In [56]:
TICKERS = [
    "CVX",   # Chevron
    "VLO",   # Valero
    "XOM",   # ExxonMobil
    "BP",    # BP
    "SHEL"   # Shell
]

START = "2021-01-01"
END = None  # None = up to today


In [58]:
def fetch_single_ticker(ticker, start, end=None, max_retries=3, pause=1.5):
    """
    Download daily OHLCV (auto-adjusted) for one ticker and return a tidy DataFrame.
    """
    for attempt in range(1, max_retries + 1):
        try:
            df = yf.download(
                ticker,
                start=start,
                end=end,
                progress=False,
                auto_adjust=True,
                threads=False
            )

            if not df.empty:
                df = df.reset_index().rename(columns={"Date": "date"})
                df.insert(0, "ticker", ticker)
                return df

        except Exception as e:
            print(f"[{ticker}] attempt {attempt} error: {e}")

        time.sleep(pause * attempt)  # simple backoff

    raise RuntimeError(f"Failed to fetch {ticker} after {max_retries} attempts")


def build_panel(frames, mode="long"):
    """
    mode='long' -> rows: ticker, date, prices...
    mode='wide' -> one row per date, columns are tickers (Close)
    """
    df = pd.concat(frames, ignore_index=True)
    df["date"] = pd.to_datetime(df["date"]).dt.date

    # yfinance with auto_adjust=True does NOT provide 'Adj Close'
    keep_cols = ["ticker", "date", "Open", "High", "Low", "Close", "Volume"]
    for col in keep_cols:
        if col not in df.columns:
            df[col] = pd.NA

    if mode == "long":
        return df[keep_cols].sort_values(["ticker", "date"]).reset_index(drop=True)

    wide = df.pivot(index="date", columns="ticker", values="Close").sort_index()
    wide = wide.rename_axis(None, axis=1).reset_index()
    return wide


In [60]:
frames = []

for t in tqdm(TICKERS, desc="Downloading"):
    try:
        frames.append(fetch_single_ticker(t, start=START, end=END))
    except Exception as e:
        print("Failed:", t, e)

if not frames:
    raise RuntimeError("No data downloaded. Check tickers and internet connection.")

panel_long = build_panel(frames, mode="long")
panel_wide = build_panel(frames, mode="wide")

panel_long.head()


Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.50it/s]


ValueError: Data must be 1-dimensional, got ndarray of shape (6360, 5) instead

In [62]:
long_path = RAW_DATA_DIR / "equity_prices_long.csv"
wide_path = RAW_DATA_DIR / "equity_prices_wide.csv"

panel_long.to_csv(long_path, index=False)
panel_wide.to_csv(wide_path, index=False)

print(f"✅ Saved long panel to: {long_path}  (rows={len(panel_long):,})")
print(f"✅ Saved wide panel to: {wide_path}  (rows={len(panel_wide):,}, cols={panel_wide.shape[1]})")


✅ Saved long panel to: D:\MS_Data_Science_Thesis\Data_Extraction\Raw_Data_Folder\equity_prices_long.csv  (rows=6,360)
✅ Saved wide panel to: D:\MS_Data_Science_Thesis\Data_Extraction\Raw_Data_Folder\equity_prices_wide.csv  (rows=1,272, cols=6)
