In [5]:
import pandas as pd
import os
# Load the raw stock data from Parquet
df = pd.read_parquet('test_stock_data.parquet')

# Show the DataFrame summary. DataFrame.info() prints the report itself and
# returns None, so wrapping it in print() only adds a stray "None" line.
df.info()

# Reset the index so that it becomes a regular column
# (the column listing printed below is how we confirm 'date' is now a column)
df = df.reset_index()

# 'date' should now be listed alongside the other columns
print(df.columns)

# Make sure 'date' has a datetime dtype so the .dt accessor below works
df['date'] = pd.to_datetime(df['date'])

# Split the frame into one DataFrame per calendar year: {year: rows of that year}
dfs_by_year = {year: group for year, group in df.groupby(df['date'].dt.year)}


<class 'pandas.core.frame.DataFrame'>
Index: 7500000 entries, 2016-03-03 to 2025-10-01
Data columns (total 6 columns):
 #   Column  Dtype  
---  ------  -----  
 0   symbol  object 
 1   open    float64
 2   high    float64
 3   low     float64
 4   close   float64
 5   volume  float64
dtypes: float64(5), object(1)
memory usage: 400.5+ MB
None
Index(['date', 'symbol', 'open', 'high', 'low', 'close', 'volume'], dtype='object')


In [6]:
# Summarise the yearly split: how many distinct years there are,
# followed by the number of rows held for each year in ascending order.
records_per_year = {yr: len(frame) for yr, frame in dfs_by_year.items()}
print(f"Total years: {len(records_per_year)}")
for year in sorted(records_per_year):
    print(f"Year: {year}, Records: {records_per_year[year]}")

Total years: 10
Year: 2016, Records: 651000
Year: 2017, Records: 780000
Year: 2018, Records: 783000
Year: 2019, Records: 783000
Year: 2020, Records: 786000
Year: 2021, Records: 783000
Year: 2022, Records: 780000
Year: 2023, Records: 780000
Year: 2024, Records: 786000
Year: 2025, Records: 588000


In [7]:
# SQL load: write the stock data into PostgreSQL

In [12]:
from sqlalchemy import create_engine, text

# Connect to PostgreSQL.
# The connection URL can be supplied through the NSE_DATABASE_URL environment
# variable so that real credentials never have to be written into the notebook.
# The fallback is the local development database this notebook was written against.
DEFAULT_DB_URL = 'postgresql+psycopg2://postgres:root@localhost/nse_data'
engine = create_engine(os.environ.get('NSE_DATABASE_URL', DEFAULT_DB_URL))

# Number of rows handed to the driver per batch when writing daily_data;
# keeps a single year's insert from being materialised all at once.
INSERT_CHUNK_SIZE = 50_000

# Schema: `symbols` is keyed by symbol; `daily_data` references it through a
# foreign key and is indexed on symbol and on date. Note that daily_data has
# no primary key of its own, so the database will not reject duplicate rows.

create_table_sql = '''
CREATE TABLE IF NOT EXISTS symbols (
    symbol VARCHAR(20) PRIMARY KEY
);

CREATE TABLE IF NOT EXISTS daily_data (
    date DATE NOT NULL,
    symbol VARCHAR(20) NOT NULL,
    open FLOAT,
    high FLOAT,
    low FLOAT,
    close FLOAT,
    volume BIGINT,
    CONSTRAINT fk_symbol FOREIGN KEY(symbol) REFERENCES symbols(symbol)
);

CREATE INDEX IF NOT EXISTS idx_symbol ON daily_data(symbol);
CREATE INDEX IF NOT EXISTS idx_date ON daily_data(date);
'''

# Create the schema. engine.begin() commits when the block succeeds and rolls
# back if it raises, so no explicit commit() is needed.
with engine.begin() as conn:
    conn.execute(text(create_table_sql))

# Look up what is already stored so that re-running this cell neither violates
# the symbols primary key nor appends the same daily rows a second time.
with engine.connect() as conn:
    existing_symbols = {row[0] for row in conn.execute(text('SELECT symbol FROM symbols'))}
    loaded_years = {
        int(row[0])
        for row in conn.execute(text('SELECT DISTINCT EXTRACT(YEAR FROM date) FROM daily_data'))
    }

# Insert unique symbols into the symbols table first (daily_data has a foreign
# key on it), restricted to the symbols that are not already present.
symbols = df['symbol'].unique()
symbols_df = pd.DataFrame(symbols, columns=['symbol'])
new_symbols_df = symbols_df[~symbols_df['symbol'].isin(existing_symbols)]
if not new_symbols_df.empty:
    new_symbols_df.to_sql('symbols', engine, if_exists='append', index=False)

# Insert year-wise data into the daily_data table
for year, df_year in dfs_by_year.items():
    if int(year) in loaded_years:
        # Rows for this year are already in the table; appending would duplicate them.
        print(f"Skipped year {year} (already present in daily_data)")
        continue
    # volume is stored as BIGINT; astype() returns a converted copy, leaving
    # the frames held in dfs_by_year unmodified.
    df_year.astype({'volume': 'int64'}).to_sql(
        'daily_data',
        engine,
        if_exists='append',
        index=False,
        chunksize=INSERT_CHUNK_SIZE,
    )
    print(f"Inserted data for year {year}")


Inserted data for year 2016
Inserted data for year 2017
Inserted data for year 2018
Inserted data for year 2019
Inserted data for year 2020
Inserted data for year 2021
Inserted data for year 2022
Inserted data for year 2023
Inserted data for year 2024
Inserted data for year 2025
