In [9]:
import yfinance as yf
import pandas as pd

def download_stock(ticker, period="2y", interval="1d"):
    """Download price history for one ticker and normalise the column names.

    NOTE: a later cell in this notebook defines another ``download_stock``;
    whichever cell ran last is the one that is actually called.

    Args:
        ticker: Symbol handed to ``yf.download``.
        period: Look-back window handed to ``yf.download``.
        interval: Bar size handed to ``yf.download``.

    Returns:
        DataFrame with the former index as a regular column. Every column
        name is stripped, has its spaces removed and is capitalised
        (first letter upper-case, the rest lower-case, e.g. "Adjclose").

    Raises:
        ValueError: If no rows were returned or no ``Volume`` column exists.
    """
    print(f"[{pd.Timestamp.now()}] Downloading {ticker} | period={period} interval={interval}")

    df = yf.download(ticker, period=period, interval=interval)

    if df.empty:
        raise ValueError(f"No data returned for {ticker}")

    # Reset index to get Date as a column
    df = df.reset_index()

    # Clean and normalize column names. Columns may arrive as two-level
    # (field, ticker) tuples; only the field name is kept so the string
    # clean-up below works for both layouts (single-ticker downloads).
    cleaned = []
    for col in df.columns:
        field = col[0] if isinstance(col, tuple) else col
        cleaned.append(str(field).strip().replace(" ", "").capitalize())
    df.columns = cleaned

    # Debugging print (optional)
    print(f"Columns after cleaning for {ticker}: {df.columns.tolist()}")

    # Critical check for Volume column
    if "Volume" not in df.columns:
        raise ValueError(
            f"\n❌ ERROR: 'Volume' column NOT FOUND for {ticker}!\n"
            f"Available columns: {df.columns.tolist()}\n"
            f"Yahoo Finance may have returned an unusual structure.\n"
        )

    return df


In [8]:
# utils/eda.py
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mplfinance as mpf

# Directory that the EDA plotting helpers in this cell save their PNG files into.
OUT_DIR = "outputs"
# Create it when the cell runs; an already existing directory is not an error.
os.makedirs(OUT_DIR, exist_ok=True)

def eda_summary(df, ticker="TICKER"):
    """Print a three-part text overview of ``df``.

    The parts are the first rows, the ``describe()`` table and the per-column
    count of missing values. ``ticker`` is accepted but not used. Nothing is
    returned.
    """
    sections = (
        ("\n====== EDA SUMMARY ======", df.head),
        ("\n--- Statistical Summary ---", df.describe),
        ("\n--- Missing Values ---", lambda: df.isna().sum()),
    )
    # Each table is computed only when its heading has been printed.
    for heading, build_table in sections:
        print(heading)
        print(build_table())


def plot_price_trend(df, ticker="TICKER"):
    """Draw the closing-price line chart and save it under ``OUT_DIR``.

    Reads the ``Date`` and ``Close`` columns of ``df`` and returns the path of
    the written ``<ticker>_eda_price.png`` file.
    """
    plt.figure(figsize=(10,5))
    dates, closes = df["Date"], df["Close"]
    plt.plot(dates, closes, label="Close Price")

    # Decorations are independent of each other.
    plt.grid(True)
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.title(f"{ticker} - Price Trend")
    plt.legend()

    png_path = os.path.join(OUT_DIR, f"{ticker}_eda_price.png")
    plt.savefig(png_path, dpi=150)
    plt.close()
    return png_path


def plot_moving_averages(df, ticker="TICKER"):
    """Plot ``Close`` together with the ``SMA_20`` and ``SMA_10`` columns.

    The chart is saved as ``<ticker>_eda_ma.png`` under ``OUT_DIR`` and the
    file path is returned. ``df`` must already contain both SMA columns.
    """
    plt.figure(figsize=(10,5))
    plt.plot(df["Date"], df["Close"], label="Close", alpha=0.7)
    # Both averages share the same dashed style; only column and label differ.
    for column, legend_name in (("SMA_20", "SMA 20"), ("SMA_10", "SMA 10")):
        plt.plot(df["Date"], df[column], label=legend_name, linestyle="--")

    plt.grid(True)
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.title(f"{ticker} - Moving Averages")
    plt.legend()

    png_path = os.path.join(OUT_DIR, f"{ticker}_eda_ma.png")
    plt.savefig(png_path, dpi=150)
    plt.close()
    return png_path


def plot_volume(df, ticker="TICKER"):
    """Save a bar chart of the ``Volume`` column against ``Date``.

    Args:
        df: Frame with ``Date`` and (optionally) ``Volume`` columns.
        ticker: Label used in the title and the file name.

    Returns:
        Path of the saved ``<ticker>_eda_volume.png`` under ``OUT_DIR``, or
        ``None`` when ``df`` has no ``Volume`` column and the plot is skipped.
    """
    # Volume is treated as optional: report and skip instead of failing.
    if "Volume" not in df.columns:
        print(f"[INFO] Volume column missing for {ticker}. Skipping volume plot.")
        return None

    plt.figure(figsize=(10,4))
    plt.bar(df["Date"], df["Volume"], color="gray")
    plt.title(f"{ticker} - Volume Trend")
    plt.xlabel("Date")
    plt.ylabel("Volume")
    plt.tight_layout()
    # Same directory constant, dpi and return convention as the other EDA plots.
    out = os.path.join(OUT_DIR, f"{ticker}_eda_volume.png")
    plt.savefig(out, dpi=150)
    plt.close()
    return out




def plot_candlestick(df, ticker="TICKER"):
    """Save a candlestick chart built from the OHLC columns of ``df``.

    Args:
        df: Frame with ``Date``, ``Open``, ``High``, ``Low`` and ``Close``
            columns; a ``Volume`` column is used when present.
        ticker: Label used in the title and the file name.

    Returns:
        Path of the saved ``<ticker>_eda_candlestick.png`` under ``OUT_DIR``.
    """
    df_candle = df.copy()
    # The chart is drawn from a frame indexed by the parsed dates.
    df_candle.index = pd.to_datetime(df_candle["Date"])
    cols = ["Open", "High", "Low", "Close"]

    # Only ask for the volume panel when its data is actually handed over;
    # requesting it with an OHLC-only frame gives the panel nothing to draw.
    has_volume = "Volume" in df_candle.columns
    if has_volume:
        cols.append("Volume")

    out = os.path.join(OUT_DIR, f"{ticker}_eda_candlestick.png")

    mpf.plot(
        df_candle[cols],
        type="candle",
        style="charles",
        title=f"{ticker} - Candlestick Chart",
        volume=has_volume,
        savefig=out
    )
    return out


def plot_correlation(df, ticker="TICKER"):
    """Save a heatmap of the correlations between the numeric columns of ``df``.

    The image is written to ``<ticker>_eda_corr.png`` under ``OUT_DIR`` and
    its path is returned.
    """
    plt.figure(figsize=(10,8))
    # Non-numeric columns are dropped before the correlation matrix is built.
    corr_matrix = df.select_dtypes(include=np.number).corr()
    sns.heatmap(corr_matrix, cmap="coolwarm", annot=False)
    plt.title(f"{ticker} - Correlation Heatmap")

    png_path = os.path.join(OUT_DIR, f"{ticker}_eda_corr.png")
    plt.savefig(png_path, dpi=150)
    plt.close()
    return png_path


def run_full_eda(df, ticker="TICKER"):
    """Print the text summary, produce every EDA plot, then list the results.

    The plots are produced in a fixed order (price, moving averages, volume,
    candlestick, correlation) and whatever each plotting helper returned is
    printed on its own line afterwards.
    """
    print(f"\n==== Running EDA for {ticker} ====")

    eda_summary(df, ticker)

    plotters = (
        plot_price_trend,
        plot_moving_averages,
        plot_volume,
        plot_candlestick,
        plot_correlation,
    )
    # All plots are produced before anything is listed.
    saved = [make_plot(df, ticker) for make_plot in plotters]

    print("\nSaved EDA plots:")
    for entry in saved:
        print(entry)


In [10]:
!pip install mplfinance



In [11]:
# utils/forecaster.py
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt

# Optional forecasting backends: each import is attempted when this cell runs
# and the outcome is recorded in a module-level flag that make_forecast and
# forecast_with_sarimax consult.

# Try Prophet first (new package name "prophet")
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except Exception:
    # Any failure importing "prophet" falls through to the "fbprophet" name.
    try:
        from fbprophet import Prophet
        PROPHET_AVAILABLE = True
    except Exception:
        # Neither import worked; the name `Prophet` is left undefined.
        PROPHET_AVAILABLE = False

# statsmodels fallback
try:
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    STATSMODELS_AVAILABLE = True
except Exception:
    # The name `SARIMAX` is left undefined when this import fails.
    STATSMODELS_AVAILABLE = False

def _ensure_datecol(df, date_col="Date"):
    df = df.copy()
    if date_col in df.columns:
        df[date_col] = pd.to_datetime(df[date_col]).dt.tz_localize(None)

    else:
        df = df.reset_index()
        df.rename(columns={df.columns[0]: 'Date'}, inplace=True)
        df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values("Date")
    return df

def forecast_with_prophet(df, periods=30, freq='D', model_dir="models", ticker='TICKER'):
    """Fit a Prophet model on ``Close`` and forecast ``periods`` steps ahead.

    Args:
        df: Frame with ``Date`` and ``Close`` (normalised via ``_ensure_datecol``).
        periods: Number of future steps passed to ``make_future_dataframe``.
        freq: Frequency string passed to ``make_future_dataframe``.
        model_dir: Directory the pickled model is written to (created if needed).
        ticker: Used in the model file name.

    Returns:
        ``(forecast, model_path)`` where ``forecast`` holds the ``ds``,
        ``yhat``, ``yhat_lower`` and ``yhat_upper`` columns (plus ``y`` when
        the prediction frame contains it) and ``model_path`` is the pickle path.

    Raises:
        RuntimeError: If no Prophet package could be imported.
    """
    # Mirror the SARIMAX helper: fail with a clear error instead of a
    # NameError on the undefined `Prophet` name.
    if not PROPHET_AVAILABLE:
        raise RuntimeError("prophet not available.")
    os.makedirs(model_dir, exist_ok=True)
    df = _ensure_datecol(df)
    # Prophet expects the columns to be called "ds" and "y".
    df_prop = df[['Date', 'Close']].rename(columns={'Date': 'ds', 'Close': 'y'})
    m = Prophet()
    m.fit(df_prop)
    future = m.make_future_dataframe(periods=periods, freq=freq)
    forecast = m.predict(future)
    model_path = os.path.join(model_dir, f"{ticker}_prophet.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(m, f)
    out = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].copy()
    if 'y' in forecast.columns:
        out['y'] = forecast['y']
    return out, model_path

def forecast_with_sarimax(df, periods=30, freq='D', order=(1,1,1), seasonal_order=(0,0,0,0), model_dir="models", ticker='TICKER'):
    """Fit a SARIMAX model on ``Close`` and forecast ``periods`` steps ahead.

    Args:
        df: Frame with ``Date`` and ``Close`` (normalised via ``_ensure_datecol``).
        periods: Number of forecast steps.
        freq: Frequency of the ``ds`` dates attached to the forecast.
        order: ``order`` argument passed to ``SARIMAX``.
        seasonal_order: ``seasonal_order`` argument passed to ``SARIMAX``.
        model_dir: Directory the pickled fit result is written to.
        ticker: Used in the model file name.

    Returns:
        ``(forecast, model_path)`` where ``forecast`` has the columns ``ds``,
        ``yhat``, ``yhat_lower`` and ``yhat_upper``.

    Raises:
        RuntimeError: If statsmodels could not be imported.
    """
    if not STATSMODELS_AVAILABLE:
        raise RuntimeError("statsmodels not available.")
    os.makedirs(model_dir, exist_ok=True)
    df = _ensure_datecol(df)
    # Put the series on a daily grid and carry the last known close over the
    # gaps. `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    ts = df.set_index('Date')['Close'].asfreq('D').ffill()
    model = SARIMAX(ts, order=order, seasonal_order=seasonal_order, enforce_stationarity=False, enforce_invertibility=False)
    res = model.fit(disp=False)
    # Forecast dates start the day after the last observation.
    start = ts.index[-1] + pd.Timedelta(1, unit='D')
    future_idx = pd.date_range(start=start, periods=periods, freq=freq)
    preds = res.get_forecast(steps=periods)
    mean = preds.predicted_mean
    conf = preds.conf_int()
    out = pd.DataFrame({
        'ds': future_idx,
        'yhat': mean.values,
        'yhat_lower': conf.iloc[:, 0].values,
        'yhat_upper': conf.iloc[:, 1].values
    })
    model_path = os.path.join(model_dir, f"{ticker}_sarimax.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(res, f)
    return out, model_path

def save_forecast_plot(df_history, df_forecast, ticker, out_dir="outputs", horizon_days=30):
    """Plot the historical close, the forecast line and its interval band.

    ``df_history`` supplies ``Date``/``Close``; ``df_forecast`` supplies
    ``ds``, ``yhat``, ``yhat_lower`` and ``yhat_upper``. The figure is saved
    as ``<ticker>_forecast.png`` in ``out_dir`` (created if missing) and the
    file path is returned. ``horizon_days`` only appears in the title.
    """
    os.makedirs(out_dir, exist_ok=True)
    plt.figure(figsize=(10, 5))

    # Parse the dates without touching the caller's frame.
    history_dates = pd.to_datetime(df_history['Date'])
    plt.plot(history_dates, df_history['Close'], label='History', linewidth=1)

    plt.plot(df_forecast['ds'], df_forecast['yhat'], label='Forecast', linestyle='--', linewidth=1.5)
    band_x = df_forecast['ds'].astype('datetime64[ns]')
    plt.fill_between(
        band_x,
        df_forecast['yhat_lower'],
        df_forecast['yhat_upper'],
        color='gray',
        alpha=0.2,
        label='Confidence',
    )

    plt.title(f"{ticker} — Forecast (next {horizon_days} days)")
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.grid(True)
    plt.legend()

    png_path = os.path.join(out_dir, f"{ticker}_forecast.png")
    plt.tight_layout()
    plt.savefig(png_path, dpi=150)
    plt.close()
    return png_path

def make_forecast(df, ticker, periods=30, freq='D', model_dir="models", out_dir="outputs"):
    """Forecast ``Close`` with Prophet when available, otherwise with SARIMAX.

    If the Prophet attempt raises, the error is printed and SARIMAX is tried
    when statsmodels is importable; otherwise the Prophet error is re-raised.
    The forecast is written to ``<ticker>_forecast.csv`` in ``out_dir`` and
    plotted via ``save_forecast_plot``.

    Returns:
        ``(forecast_df, model_path, plot_path, method)`` where ``method`` is
        ``'prophet'`` or ``'sarimax'``.

    Raises:
        RuntimeError: If neither backend is available.
    """
    prepared = _ensure_datecol(df)
    shared_args = dict(periods=periods, freq=freq, model_dir=model_dir, ticker=ticker)

    if not (PROPHET_AVAILABLE or STATSMODELS_AVAILABLE):
        raise RuntimeError("Neither Prophet nor SARIMAX available.")

    if not PROPHET_AVAILABLE:
        forecast_df, model_path = forecast_with_sarimax(prepared, **shared_args)
        method = 'sarimax'
    else:
        try:
            forecast_df, model_path = forecast_with_prophet(prepared, **shared_args)
            method = 'prophet'
        except Exception as e:
            print(f"Prophet failed ({e}) — falling back to SARIMAX if available.")
            if not STATSMODELS_AVAILABLE:
                raise
            forecast_df, model_path = forecast_with_sarimax(prepared, **shared_args)
            method = 'sarimax'

    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f"{ticker}_forecast.csv")
    forecast_df.to_csv(csv_path, index=False)
    plot_path = save_forecast_plot(prepared, forecast_df, ticker, out_dir=out_dir, horizon_days=periods)
    return forecast_df, model_path, plot_path, method


In [12]:
# utils/preprocess.py
import pandas as pd
import numpy as np

def ensure_dateindex(df, date_col='Date'):
    """Return a copy of ``df`` indexed by its parsed, ascending ``date_col``.

    The input frame is left untouched; the date column becomes the index of
    the returned frame.
    """
    frame = df.copy()
    frame[date_col] = pd.to_datetime(frame[date_col])
    ordered = frame.sort_values(date_col).reset_index(drop=True)
    return ordered.set_index(date_col)

def add_sma(df, window=20, column='Close', col_name=None):
    """Add a simple moving average of ``column`` to ``df`` in place.

    The new column is named ``col_name`` or, when that is ``None``,
    ``SMA_<window>``. Early rows average over however many values exist so
    far. The same ``df`` object is returned.
    """
    target = f"SMA_{window}" if col_name is None else col_name
    rolling_mean = df[column].rolling(window=window, min_periods=1).mean()
    df[target] = rolling_mean
    return df

def add_ema(df, span=20, column='Close', col_name=None):
    """Add an exponential moving average of ``column`` to ``df`` in place.

    The new column is named ``col_name`` or, when that is ``None``,
    ``EMA_<span>``. The average is computed with ``adjust=False``. The same
    ``df`` object is returned.
    """
    target = f"EMA_{span}" if col_name is None else col_name
    smoothed = df[column].ewm(span=span, adjust=False).mean()
    df[target] = smoothed
    return df

def add_rsi(df, period=14, column='Close', col_name='RSI_14'):
    """Add a relative strength index of ``column`` to ``df`` in place.

    Gains and losses are averaged with a plain rolling mean over ``period``
    rows (fewer rows at the start of the series).

    Args:
        df: Frame to extend; it is modified and returned.
        period: Rolling window length.
        column: Source column.
        col_name: Name of the new column.

    Returns:
        The same ``df`` with ``col_name`` added. Rows where the value is
        undefined (first row, or no movement at all in the window) hold 0.
    """
    delta = df[column].diff()
    gains = delta.clip(lower=0)
    losses = -1 * delta.clip(upper=0)
    avg_gain = gains.rolling(window=period, min_periods=1).mean()
    avg_loss = losses.rolling(window=period, min_periods=1).mean()
    # A zero average loss would divide by zero, so it is masked out here ...
    rs = avg_gain / (avg_loss.replace(0, np.nan))
    rsi = 100 - (100 / (1 + rs))
    # ... and handled explicitly: gains with no losses is the maximum
    # reading (100), not the 0 that a blanket fill would produce.
    only_gains = (avg_loss == 0) & (avg_gain > 0)
    rsi.loc[only_gains] = 100.0
    df[col_name] = rsi.fillna(0)
    return df

def add_macd(df, fast=12, slow=26, signal=9, column='Close'):
    """Add ``MACD``, ``MACD_Signal`` and ``MACD_Hist`` columns to ``df`` in place.

    ``MACD`` is the fast EMA minus the slow EMA of ``column``, the signal is
    an EMA of that line, and the histogram is their difference. All EMAs use
    ``adjust=False``. The same ``df`` object is returned.
    """
    def _ema(series, span):
        return series.ewm(span=span, adjust=False).mean()

    prices = df[column]
    macd_line = _ema(prices, fast) - _ema(prices, slow)
    signal_line = _ema(macd_line, signal)

    df['MACD'] = macd_line
    df['MACD_Signal'] = signal_line
    df['MACD_Hist'] = macd_line - signal_line
    return df

def add_bollinger(df, window=20, column='Close', n_std=2):
    """Add Bollinger band columns for ``column`` to ``df`` in place.

    ``Bollinger_Mid`` is the rolling mean; ``Bollinger_Upper`` and
    ``Bollinger_Lower`` sit ``n_std`` rolling standard deviations above and
    below it. A missing standard deviation counts as 0. The same ``df``
    object is returned.
    """
    windowed = df[column].rolling(window=window, min_periods=1)
    middle = windowed.mean()
    spread = n_std * windowed.std().fillna(0)

    df['Bollinger_Mid'] = middle
    df['Bollinger_Upper'] = middle + spread
    df['Bollinger_Lower'] = middle - spread
    return df

def add_volume_ma(df, window=20, column='Volume'):
    """Add a rolling mean of ``column`` as ``Volume_MA_<window>`` to ``df`` in place.

    Early rows average over however many values exist so far. The same ``df``
    object is returned.
    """
    target = f'Volume_MA_{window}'
    averaged = df[column].rolling(window=window, min_periods=1).mean()
    df[target] = averaged
    return df

def add_all_indicators(df):
    """
    Add the full indicator set to a copy of ``df``.

    Input: DataFrame with a Close column and, normally, a Date column
    (used as a temporary index). A Volume column is optional.

    Returns a new DataFrame with SMA 10/20, EMA 12/26, RSI 14, MACD and
    Bollinger columns added, plus the 20-row volume average when Volume is
    present. The index is turned back into a regular column at the end.
    """
    working = df.copy()
    if 'Date' in working.columns:
        working = ensure_dateindex(working, 'Date')

    working = add_sma(working, 10)
    working = add_sma(working, 20)
    working = add_ema(working, 12)
    working = add_ema(working, 26)
    working = add_rsi(working, period=14)
    working = add_macd(working, fast=12, slow=26, signal=9)
    working = add_bollinger(working, window=20, n_std=2)
    # Volume is optional input: skip its indicator instead of raising KeyError.
    if 'Volume' in working.columns:
        working = add_volume_ma(working, window=20)
    # reset index so downstream code that expects 'Date' column still works
    working = working.reset_index().rename_axis(None, axis=1)
    return working


In [13]:
# main.py
import os
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
# These functions are defined in other cells and become globally available once those cells are executed.
# No need for 'from utils.xxx import yyy' as 'utils' is not a Python package here.
# from utils.eda import run_full_eda
# from utils.data_loader import download_stock
# from utils.preprocess import add_all_indicators
# from utils.forecaster import make_forecast
import yfinance as yf

# Folder for CSV exports (used by save_dataframe).
DATA_DIR = "data"
# Folder for generated plots and forecast output (used by plot_close_price and main).
OUT_DIR = "outputs"
# Make sure both folders exist when the cell runs; existing folders are left alone.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)


def download_stock(ticker: str, period: str = "2y", interval: str = "1d") -> pd.DataFrame:
    """
    Fetch historical data for `ticker` through `yf.Ticker(...).history`.

    period example: '1y', '2y', '5y', 'max'
    interval example: '1d', '1wk', '1mo'

    Returns the history with its index moved into a regular column.
    Raises RuntimeError when the returned frame is empty.
    """
    print(f"[{datetime.now()}] Downloading {ticker} | period={period} interval={interval}")
    history = yf.Ticker(ticker).history(period=period, interval=interval)
    if history.empty:
        raise RuntimeError(f"No data returned for {ticker}")
    return history.reset_index()


def save_dataframe(df: pd.DataFrame, filename: str) -> str:
    """Write `df` as CSV (without the index) into DATA_DIR and return the file path."""
    destination = os.path.join(DATA_DIR, filename)
    df.to_csv(destination, index=False)
    print(f"Saved CSV → {destination}")
    return destination


def plot_close_price(df: pd.DataFrame, ticker: str) -> str:
    """Save a line chart of `Close` over the parsed `Date` column.

    The image is written to `<ticker>_close.png` in OUT_DIR; the path is
    printed and returned.
    """
    plt.figure(figsize=(10, 5))
    dates = pd.to_datetime(df['Date'])
    plt.plot(dates, df['Close'], label=f"{ticker} Close")

    plt.grid(True)
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.title(f"{ticker} — Close Price")
    plt.legend()

    png_path = os.path.join(OUT_DIR, f"{ticker}_close.png")
    plt.tight_layout()
    plt.savefig(png_path, dpi=150)
    plt.close()
    print(f"Saved plot → {png_path}")
    return png_path


def quick_stats(df: pd.DataFrame) -> None:
    """
    Print quick, read-only statistics for `df`.
    - Prints the date range when a 'Date' column can be parsed
    - Prints describe() of the numeric columns
    - Shows the last 5 rows

    `df` itself is never modified.
    """
    print("\n=== Quick stats ===")
    if 'Date' in df.columns:
        try:
            # Parse into a local series so the caller's column keeps its dtype.
            dates = pd.to_datetime(df['Date'])
            print("Date range:", dates.min(), "to", dates.max())
        except Exception as exc:
            # Still best-effort, but say why the range is missing.
            print(f"Date range unavailable: {exc}")

    # Numeric summary: select numeric columns only
    numeric_df = df.select_dtypes(include='number')
    if not numeric_df.empty:
        print("\nNumeric summary:")
        print(numeric_df.describe().round(4))
    else:
        print("\nNo numeric columns to summarize.")

    print("\nLast 5 rows:")
    print(df.tail())


def main(tickers=None):
    """Run download, indicators, export, EDA and forecast for each ticker.

    Args:
        tickers: Iterable of symbols, or a single symbol string. Defaults to
            ``["AAPL", "TSLA"]`` when omitted.

    Any exception raised while processing one ticker is printed and the loop
    moves on to the next ticker.
    """
    if tickers is None:
        tickers = ["AAPL", "TSLA"]
    elif isinstance(tickers, str):
        # A bare string would otherwise be iterated character by character.
        tickers = [tickers]

    for t in tickers:
        try:
            # 1) Download
            df = download_stock(t, period="2y", interval="1d")

            # 2) Add indicators
            df_ind = add_all_indicators(df)

            # 3) Save processed CSV
            csv_name = f"{t}_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv"
            save_dataframe(df_ind, csv_name)

            # 4) Plot close price
            plot_close_price(df_ind, t)

            # 5) Quick stats
            quick_stats(df_ind)
            run_full_eda(df_ind, ticker=t)

            # 6) Forecast (Prophet primary, SARIMAX fallback)
            forecast_df, model_path, forecast_plot, method = make_forecast(
                df_ind,
                ticker=t,
                periods=30
            )

            print(f"\nForecast completed for {t}:")
            print(" - Method:", method)
            print(" - Model saved at:", model_path)
            print(" - Forecast CSV:", os.path.join(OUT_DIR, f"{t}_forecast.csv"))
            print(" - Forecast plot:", forecast_plot)
            print("--------------------------------------\n")

        except Exception as e:
            print(f"Error for {t}: {e}")


# Run the pipeline only when this code executes as the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[2026-01-02 04:33:12.612619] Downloading AAPL | period=2y interval=1d
Saved CSV → data/AAPL_20260102_043312.csv
Saved plot → outputs/AAPL_close.png

=== Quick stats ===
Date range: 2024-01-02 00:00:00-05:00 to 2025-12-31 00:00:00-05:00

Numeric summary:
           Open      High       Low     Close        Volume  Dividends  \
count  502.0000  502.0000  502.0000  502.0000  5.020000e+02   502.0000   
mean   218.5344  220.8468  216.4661  218.7577  5.564712e+07     0.0040   
std     29.1062   29.2367   28.9350   29.1409  2.732569e+07     0.0317   
min    164.0121  165.0536  162.7523  163.6649  1.791060e+07     0.0000   
25%    197.1683  199.8740  194.7083  197.8385  4.132715e+07     0.0000   
50%    220.3779  223.1972  218.5103  220.9553  4.890840e+07     0.0000   
75%    234.8280  237.9405  232.7431  235.6269  6.041002e+07     0.0000   
max    286.2000  288.6200  283.3000  286.1900  3.186799e+08     0.2600   

       Stock Splits    SMA_10    SMA_20    EMA_12    EMA_26    RSI_14  \
count 