# Data Collection
Fetch raw stock data using yfinance for Indian tickers (3-year range). Outputs `raw_stock_data.csv` for next notebook.

In [2]:
# Imports (global for this notebook)
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
import os
from pathlib import Path

# Install requirements if needed (run once)
!pip install pandas numpy yfinance matplotlib seaborn scikit-learn sqlalchemy

print("All libraries imported successfully")

All libraries imported successfully



[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Indian stock tickers
indian_tickers = [
    "RELIANCE.NS", "TCS.NS", "HDFCBANK.NS", "INFY.NS", "ICICIBANK.NS",
    "HINDUNILVR.NS", "SBIN.NS", "ITC.NS", "BHARTIARTL.NS", "LT.NS",
    "TATAMOTORS.NS", "AXISBANK.NS", "BAJFINANCE.NS", "M&M.NS", "ADANIPORTS.NS"
]

# Define 3-year date range
end_date = dt.datetime.today()
start_date = end_date - dt.timedelta(days=3*365)  # approx 3 years

print(f"Date range: {start_date.date()} to {end_date.date()}")

Date range: 2022-09-24 to 2025-09-23


In [4]:
# Dictionary to store each ticker's historical data
stock_data = {}

for ticker in indian_tickers:
    try:
        data = yf.download(
            ticker,
            start=start_date,
            end=end_date,
            auto_adjust=True  # Adjusted prices account for splits/dividends
        )
        if not data.empty:
            stock_data[ticker] = data
            print(f"✓ Successfully fetched {len(data)} days for {ticker}")
        else:
            print(f"✗ No data available for {ticker}")
    except Exception as e:
        print(f"✗ Error fetching {ticker}: {e}")

[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for RELIANCE.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for TCS.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for HDFCBANK.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for INFY.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for ICICIBANK.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for HINDUNILVR.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for SBIN.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for ITC.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for BHARTIARTL.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for LT.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for TATAMOTORS.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for AXISBANK.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for BAJFINANCE.NS


[*********************100%***********************]  1 of 1 completed


✓ Successfully fetched 741 days for M&M.NS


[*********************100%***********************]  1 of 1 completed

✓ Successfully fetched 741 days for ADANIPORTS.NS





In [5]:
# Update with latest data (current trading day if available)
for ticker, data in stock_data.items():
    try:
        ticker_obj = yf.Ticker(ticker)
        latest_data = ticker_obj.history(period="1d")
        
        if latest_data.empty:
            print(f"✗ No latest data for {ticker}")
            continue
        
        latest_date = latest_data.index[0]
        
        # Check if this date is already in the historical data
        if latest_date in data.index:
            print(f"✓ {ticker} already up to date ({latest_date.date()})")
            continue
        
        # Align and append the latest data
        latest_row = latest_data.iloc[0:1]  # Get as DataFrame
        
        # Ensure consistent column names
        for col in data.columns:
            if col not in latest_row.columns:
                latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])

        # Concatenate and sort
        stock_data[ticker] = pd.concat([data, latest_row])
        stock_data[ticker].sort_index(inplace=True)
        
        print(f"✓ Updated {ticker} with data for {latest_date.date()}")
        
    except Exception as e:
        print(f"✗ Error updating {ticker}: {e}")

print("Live price update completed")

  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])
  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])


✗ Error updating RELIANCE.NS: Cannot compare tz-naive and tz-aware timestamps
✗ Error updating TCS.NS: Cannot compare tz-naive and tz-aware timestamps


  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])
  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])


✗ Error updating HDFCBANK.NS: Cannot compare tz-naive and tz-aware timestamps
✗ Error updating INFY.NS: Cannot compare tz-naive and tz-aware timestamps


  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])
  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])


✗ Error updating ICICIBANK.NS: Cannot compare tz-naive and tz-aware timestamps
✗ Error updating HINDUNILVR.NS: Cannot compare tz-naive and tz-aware timestamps


  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])
  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])


✗ Error updating SBIN.NS: Cannot compare tz-naive and tz-aware timestamps
✗ Error updating ITC.NS: Cannot compare tz-naive and tz-aware timestamps


  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])
  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])


✗ Error updating BHARTIARTL.NS: Cannot compare tz-naive and tz-aware timestamps
✗ Error updating LT.NS: Cannot compare tz-naive and tz-aware timestamps


  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])
  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])


✗ Error updating TATAMOTORS.NS: Cannot compare tz-naive and tz-aware timestamps
✗ Error updating AXISBANK.NS: Cannot compare tz-naive and tz-aware timestamps


  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])
  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])


✗ Error updating BAJFINANCE.NS: Cannot compare tz-naive and tz-aware timestamps
✗ Error updating M&M.NS: Cannot compare tz-naive and tz-aware timestamps
✗ Error updating ADANIPORTS.NS: Cannot compare tz-naive and tz-aware timestamps
Live price update completed


  latest_row.loc[:, col] = 0 if col == 'Volume' else float(latest_row['Close'].iloc[0])


In [6]:
# Preview TATAMOTORS.NS
print("TATAMOTORS.NS data preview:")
if "TATAMOTORS.NS" in stock_data:
    print(stock_data["TATAMOTORS.NS"].tail())
    print(f"\nData shape: {stock_data['TATAMOTORS.NS'].shape}")
else:
    print("TATAMOTORS.NS not fetched.")

TATAMOTORS.NS data preview:
                           (Close, TATAMOTORS.NS)  (High, TATAMOTORS.NS)  \
Date                                                                       
2025-09-18 00:00:00                    711.200012             725.250000   
2025-09-19 00:00:00                    707.450012             713.500000   
2025-09-22 00:00:00                    696.250000             712.000000   
2025-09-23 00:00:00                    701.349976             707.799988   
2025-09-23 00:00:00+05:30                     NaN                    NaN   

                           (Low, TATAMOTORS.NS)  (Open, TATAMOTORS.NS)  \
Date                                                                     
2025-09-18 00:00:00                  707.599976             722.000000   
2025-09-19 00:00:00                  704.950012             713.500000   
2025-09-22 00:00:00                  694.849976             709.000000   
2025-09-23 00:00:00                  697.000000             701.79998

In [7]:
# Create a list to hold the processed dataframes
processed_dfs = []

for ticker, data in stock_data.items():
    try:
        # Reset the index to make Date a column
        df = data.reset_index()
        df.rename(columns={'index': 'Date'}, inplace=True)
        
        # Add a 'Ticker' column to identify the stock
        df['Ticker'] = ticker
        
        # Ensure 'Adj Close' column exists
        if 'Adj Close' not in df.columns:
            df['Adj Close'] = df['Close']
        
        # Select and reorder columns
        df = df[['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]
        
        # Append the processed dataframe to the list
        processed_dfs.append(df)
        
        print(f"✓ Processed {ticker}: {len(df)} rows")
        
    except Exception as e:
        print(f"✗ Error processing {ticker}: {e}")

# Concatenate all tickers into a single DataFrame
combined_df = pd.concat(processed_dfs, ignore_index=True)

# Basic fill for OHLCV (full cleaning in next notebook)
combined_df.ffill(inplace=True)

print(f"\nCombined DataFrame shape: {combined_df.shape}")
print("Data has been successfully processed and combined")

# Save raw CSV for next notebook
raw_csv = 'raw_stock_data.csv'
combined_df.to_csv(raw_csv, index=False)
print(f"✓ Raw data saved to '{raw_csv}' ({len(combined_df)} rows). Run data_preprocessing_feature_engineering.ipynb next.")

✓ Processed RELIANCE.NS: 742 rows
✓ Processed TCS.NS: 742 rows
✓ Processed HDFCBANK.NS: 742 rows
✓ Processed INFY.NS: 742 rows
✓ Processed ICICIBANK.NS: 742 rows
✓ Processed HINDUNILVR.NS: 742 rows
✓ Processed SBIN.NS: 742 rows
✓ Processed ITC.NS: 742 rows
✓ Processed BHARTIARTL.NS: 742 rows
✓ Processed LT.NS: 742 rows
✓ Processed TATAMOTORS.NS: 742 rows
✓ Processed AXISBANK.NS: 742 rows
✓ Processed BAJFINANCE.NS: 742 rows
✓ Processed M&M.NS: 742 rows
✓ Processed ADANIPORTS.NS: 742 rows

Combined DataFrame shape: (11130, 8)
Data has been successfully processed and combined
✓ Raw data saved to 'raw_stock_data.csv' (11130 rows). Run data_preprocessing_feature_engineering.ipynb next.
