In [31]:
from pathlib import Path
import os

# Force Jupyter to work inside the repo so that relative paths such as
# data/raw resolve the same way no matter where the kernel was started.
# The location can be overridden with the STOCK_PROJECT_ROOT environment
# variable; when it is unset, the original hard-coded checkout path is used.
PROJECT_ROOT = Path(
    os.environ.get("STOCK_PROJECT_ROOT", "/Users/ravish/Documents/Stock-Project")
).expanduser()
os.chdir(PROJECT_ROOT)  # raises FileNotFoundError if the folder does not exist

print("Now working in:", Path.cwd())

Now working in: /Users/ravish/Documents/Stock-Project


In [32]:
import sys, pathlib

# Create the raw and processed data folders (a no-op when they already exist)
for folder in ("data/raw", "data/processed"):
    pathlib.Path(folder).mkdir(parents=True, exist_ok=True)

print("Setup OK ✅")
print("Python version:", sys.version.split()[0])

Setup OK ✅
Python version: 3.11.7


In [33]:
# Install yfinance if it's not available already.
# The import is attempted first; only on ImportError is the package installed
# with the %pip magic (which targets the running kernel's environment) and the
# import retried.
# NOTE(review): the install is not version-pinned — consider pinning for
# reproducibility.
try:
    import yfinance as yf
except ImportError:
    %pip install yfinance
    import yfinance as yf

print("yfinance ready ✅")

yfinance ready ✅


In [34]:
import pandas as pd

TICKER = "AAPL"
PERIOD = "1y"   # you can also use "2y", "5y", etc.

# Pull the OHLCV history for TICKER and label the row index "Date" in one
# chained step
df = yf.download(TICKER, period=PERIOD, auto_adjust=False).rename_axis("Date")

print(df.head())


[*********************100%***********************]  1 of 1 completed

Price        Adj Close       Close        High         Low        Open  \
Ticker            AAPL        AAPL        AAPL        AAPL        AAPL   
Date                                                                     
2024-09-25  225.321335  226.369995  227.289993  224.020004  224.929993   
2024-09-26  226.466034  227.520004  228.500000  225.410004  227.300003   
2024-09-27  226.734772  227.789993  229.520004  227.300003  228.460007   
2024-09-30  231.920639  233.000000  233.000000  229.649994  230.039993   
2024-10-01  225.162109  226.210007  229.649994  223.740005  229.520004   

Price         Volume  
Ticker          AAPL  
Date                  
2024-09-25  42308700  
2024-09-26  36636700  
2024-09-27  34026000  
2024-09-30  54541900  
2024-10-01  63285000  





In [35]:
# Refresh the frame that the following cell writes to CSV. Stop here if the
# download came back empty so an empty file is never saved, and label the
# index "Date" exactly as the first download cell does.
df = yf.download(TICKER, period=PERIOD, auto_adjust=False)
if df is None or df.empty:
    raise RuntimeError(f"No data returned for {TICKER} (period={PERIOD})")
df.index.name = "Date"

[*********************100%***********************]  1 of 1 completed


In [36]:
# Save the data to CSV inside data/raw/
from pathlib import Path

# Destination is data/raw/<TICKER>_<PERIOD>.csv, relative to the working directory
out_path = Path("data/raw").joinpath(f"{TICKER}_{PERIOD}.csv")
df.to_csv(out_path)
print(f"Saved {len(df)} rows to {out_path}")


Saved 250 rows to data/raw/AAPL_1y.csv


In [37]:
import datetime as dt
from pathlib import Path

# ✔️ Edit tickers if you want different ones
TICKERS = ["AAPL", "MSFT", "AMZN", "GOOG", "TSLA"]

# Look-back window, kept as named constants so it can be tuned in one place.
LOOKBACK_YEARS = 20
BUFFER_DAYS = 7   # extra days on top of 365-day years (leap days use up part of this)

# START is LOOKBACK_YEARS * 365 + BUFFER_DAYS calendar days before today
END = dt.date.today()
START = END - dt.timedelta(days=365 * LOOKBACK_YEARS + BUFFER_DAYS)

# Folder that receives the downloaded CSV files
DATA_RAW = Path("data/raw")
DATA_RAW.mkdir(parents=True, exist_ok=True)

print(f"Fetching {len(TICKERS)} tickers from {START} to {END}")

Fetching 5 tickers from 2005-09-23 to 2025-09-25


In [38]:
import time
import pandas as pd
import yfinance as yf

def fetch(ticker: str, start, end, tries=3, sleep=2) -> pd.DataFrame:
    """Download price history for one ticker, retrying when nothing comes back.

    Calls ``yf.download`` up to ``tries`` times. The first non-empty DataFrame
    is returned with its index name set to ``"Date"``.

    Args:
        ticker: Symbol to download.
        start: Start of the range, passed straight through to ``yf.download``.
        end: End of the range, passed straight through to ``yf.download``.
            NOTE(review): yfinance appears to treat ``end`` as exclusive —
            confirm against its documentation.
        tries: Maximum number of download attempts.
        sleep: Base back-off in seconds; the wait after failed attempt ``k``
            (1-based) is ``sleep * k``.

    Returns:
        The downloaded, non-empty DataFrame.

    Raises:
        RuntimeError: If every attempt returned an empty (or non-DataFrame)
            result.
    """
    for attempt in range(1, tries + 1):
        df = yf.download(
            ticker, start=start, end=end,
            auto_adjust=False, progress=False, threads=False
        )
        if isinstance(df, pd.DataFrame) and not df.empty:
            df.index.name = "Date"
            return df
        # Back off only when another attempt will follow; sleeping after the
        # last failure would just delay the error below.
        if attempt < tries:
            time.sleep(sleep * attempt)
    raise RuntimeError(f"Failed to download data for {ticker}")

def _describe_download(ticker, frame, csv_path):
    """Build the summary record (row count, date span, file path) for one saved ticker."""
    has_rows = len(frame) > 0
    return {
        "Ticker": ticker,
        "Rows": len(frame),
        "From": frame.index.min().date() if has_rows else None,
        "To":   frame.index.max().date() if has_rows else None,
        "Saved": csv_path.as_posix()
    }


# Download each ticker, write it to DATA_RAW, and keep one summary record per file
results = []
for t in TICKERS:
    df = fetch(t, START, END)
    out = DATA_RAW / f"{t}_{START}_{END}.csv"
    df.to_csv(out)
    results.append(_describe_download(t, df, out))
    print(f"{t}: {len(df):,} rows → {out}")

AAPL: 5,032 rows → data/raw/AAPL_2005-09-23_2025-09-25.csv
MSFT: 5,032 rows → data/raw/MSFT_2005-09-23_2025-09-25.csv
AMZN: 5,032 rows → data/raw/AMZN_2005-09-23_2025-09-25.csv
GOOG: 5,032 rows → data/raw/GOOG_2005-09-23_2025-09-25.csv
TSLA: 3,834 rows → data/raw/TSLA_2005-09-23_2025-09-25.csv


In [39]:
# Turn the per-ticker records into a table keyed by ticker, in alphabetical order
summary = pd.DataFrame(results)
summary = summary.set_index("Ticker").sort_index()
summary

Unnamed: 0_level_0,Rows,From,To,Saved
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,5032,2005-09-23,2025-09-24,data/raw/AAPL_2005-09-23_2025-09-25.csv
AMZN,5032,2005-09-23,2025-09-24,data/raw/AMZN_2005-09-23_2025-09-25.csv
GOOG,5032,2005-09-23,2025-09-24,data/raw/GOOG_2005-09-23_2025-09-25.csv
MSFT,5032,2005-09-23,2025-09-24,data/raw/MSFT_2005-09-23_2025-09-25.csv
TSLA,3834,2010-06-29,2025-09-24,data/raw/TSLA_2005-09-23_2025-09-25.csv


In [42]:
# Setup for Notebook 02: Feature Engineering

import pandas as pd
import numpy as np
from pathlib import Path

# Data locations, given relative to the repo root (the expected working directory)
DATA_RAW, DATA_PROC = map(Path, ("data/raw", "data/processed"))
DATA_PROC.mkdir(parents=True, exist_ok=True)   # only the output folder is created here

# Tickers with raw data available
TICKERS = ["AAPL", "MSFT", "AMZN", "GOOG", "TSLA"]

print("Setup complete ✅ | Raw:", DATA_RAW, "| Processed:", DATA_PROC)

Setup complete ✅ | Raw: data/raw | Processed: data/processed
