In [2]:
pip install yfinance pandas

Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pytz>=2022.5 (from yfinance)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py312-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.3.tar.gz (3.0 MB)
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     --- ------------------------------------ 0.3/3.0 MB ? eta -:--:--
     ------ --------------------------

  DEPRECATION: Building 'multitasking' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'multitasking'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [3]:
# ============================
# 📊 PHASE 1: DATA COLLECTION
# ============================
# Downloads daily price/volume history for the configured tickers with
# yfinance, flattens the two-level (field, ticker) column index into
# single-level names, writes the raw table to CSV and shows a quick preview.

from datetime import date
from pathlib import Path

import pandas as pd
import yfinance as yf

# ----------------------------
# SETTINGS
# ----------------------------
tickers = ['RELIANCE.NS', 'TCS.NS', 'HDFCBANK.NS', '^NSEI']  # ^NSEI = Nifty50 index
start_date = '2019-01-01'
end_date = '2025-01-01'  # NOTE(review): yfinance documents `end` as exclusive — confirm
output_path = '../data/raw/stocks_raw.csv'

# ----------------------------
# DOWNLOAD DATA
# ----------------------------
print(f"Downloading data from {start_date} to {end_date}...")

# `auto_adjust` is passed explicitly so the result does not depend on the
# library's default (relying on the default makes yfinance emit a warning).
# True matches the data this notebook already produced: adjusted
# Open/High/Low/Close/Volume columns and no separate 'Adj Close'.
data = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    interval='1d',
    auto_adjust=True,
    progress=True,
)

# Fail loudly instead of silently writing an empty CSV when the download
# returned nothing (e.g. network problem or invalid symbols).
if data.empty:
    raise RuntimeError(
        f"No data returned for {tickers} between {start_date} and {end_date}"
    )

# Flatten multi-level columns (e.g. ('Open', 'RELIANCE.NS') -> 'Open_RELIANCE.NS').
# Guarded so that a flat column index is left alone rather than having the
# characters of each column name joined with underscores.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = ['_'.join(col).strip() for col in data.columns.values]

# Reset index to move 'Date' into a column
data = data.reset_index()

# ----------------------------
# SAVE RAW DATA
# ----------------------------
# Create the target folder first: to_csv does not create missing directories.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)
print(f"✅ Data downloaded and saved as {output_path}")

# ----------------------------
# QUICK PREVIEW
# ----------------------------
# Raw dump only: rows where a ticker has no value for a date are kept as NaN.
print("\nPreview of dataset:")
display(data.head())
print("\nShape of data:", data.shape)
print("\nColumns:")
print(data.columns.tolist())

Downloading data from 2019-01-01 to 2025-01-01...


  data = yf.download(tickers, start=start_date, end=end_date, interval='1d', progress=True)
[*********************100%***********************]  4 of 4 completed

✅ Data downloaded and saved as ../data/raw/stocks_raw.csv

Preview of dataset:





Unnamed: 0,Date,Close_HDFCBANK.NS,Close_RELIANCE.NS,Close_TCS.NS,Close_^NSEI,High_HDFCBANK.NS,High_RELIANCE.NS,High_TCS.NS,High_^NSEI,Low_HDFCBANK.NS,...,Low_TCS.NS,Low_^NSEI,Open_HDFCBANK.NS,Open_RELIANCE.NS,Open_TCS.NS,Open_^NSEI,Volume_HDFCBANK.NS,Volume_RELIANCE.NS,Volume_TCS.NS,Volume_^NSEI
0,2019-01-01,504.392517,498.49057,1610.928345,,505.073439,501.292117,1617.023891,,494.413149,...,1595.858657,,499.590634,500.380512,1605.17136,,6373440,9746670,1094883,
1,2019-01-02,499.790222,491.99823,1628.283691,10792.5,504.251587,501.158723,1637.723302,10895.349609,497.723895,...,1608.557654,10735.049805,503.147997,495.600156,1612.7907,10868.849609,8134232,15628818,2100463,309700.0
2,2019-01-03,495.868988,485.928284,1608.515015,10672.25,499.731579,495.644616,1646.612426,10814.049805,493.80266,...,1602.715763,10661.25,498.780593,492.487362,1624.64296,10796.799805,12771664,16288287,2611668,286200.0
3,2019-01-04,497.195618,488.551941,1588.958862,10727.349609,499.790265,491.131123,1609.573781,10741.049805,495.52851,...,1558.607935,10628.650391,496.67905,487.996102,1608.557891,10699.700195,7287120,18516544,4280862,296600.0
4,2019-01-07,497.94696,491.264465,1606.779663,10771.799805,501.398694,497.356638,1616.007724,10835.950195,497.32474,...,1592.471947,10750.150391,499.602373,492.265005,1601.615368,10804.849609,5387012,12060290,1856423,269400.0



Shape of data: (1481, 21)

Columns:
['Date', 'Close_HDFCBANK.NS', 'Close_RELIANCE.NS', 'Close_TCS.NS', 'Close_^NSEI', 'High_HDFCBANK.NS', 'High_RELIANCE.NS', 'High_TCS.NS', 'High_^NSEI', 'Low_HDFCBANK.NS', 'Low_RELIANCE.NS', 'Low_TCS.NS', 'Low_^NSEI', 'Open_HDFCBANK.NS', 'Open_RELIANCE.NS', 'Open_TCS.NS', 'Open_^NSEI', 'Volume_HDFCBANK.NS', 'Volume_RELIANCE.NS', 'Volume_TCS.NS', 'Volume_^NSEI']
