# Load VN30 Data for All Constituents

This notebook loads comprehensive data for every VN30 constituent from the vnstock API and saves it to a separate directory per symbol.

## Requirements

- vnstock library installed
- StocketAI data acquisition module
- VN30 constituents CSV file at `data/symbols/vn30_constituents.csv` (must contain a `symbol` column)

## Data Types Loaded

- Historical price data (10 years)
- Financial ratios (yearly/quarterly)
- Balance sheet (yearly/quarterly)
- Income statement (yearly/quarterly)
- Cash flow statement (yearly/quarterly)
- Company profile and information
- Company shareholders
- Company officers
- Company events
- Company news
- Dividend history
- Intraday data
- Price depth

## Sources Used

- VCI (primary for historical and financial data)
- TCBS (company information, events, news)
- MSN (additional data sources)
- FMARKET (fund-related data)

In [None]:
# Enable autoreload for development
# These are IPython magics, so this cell only runs inside an IPython/Jupyter kernel.
# Load the autoreload extension into the current kernel.
%load_ext autoreload
# Mode 2: per the IPython docs, re-import all modules before each cell executes,
# so edits to the project modules are picked up without restarting the kernel.
%autoreload 2
# NOTE(review): reloading right after load_ext looks redundant — confirm whether
# it is only here to make re-running this cell safe.
%reload_ext autoreload

In [None]:
# Import required libraries
import sys
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
import logging
import time

# Add src to path for importing StocketAI modules
# The project root is taken to be two directory levels above the current
# working directory, so this cell depends on where the kernel was started.
# NOTE(review): presumably the notebook lives two levels below the repo root — confirm.
project_root = Path.cwd().parent.parent
src_path = project_root / 'src'
# Guard against inserting the same entry again when the cell is re-run.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Import StocketAI modules
# These imports must stay below the sys.path update above: the
# data_acquisition package is resolved through the src directory added there.
from data_acquisition.vnstock_client import VNStockClient
from data_acquisition.vn30_data_loader import VN30DataLoader

print("Libraries imported successfully!")

In [None]:
# Configure logging
# INFO-and-above records go both to <project_root>/logs/vn30_data_loading.log
# and to the notebook output stream.
log_dir = project_root / 'logs'
# FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, so create it first.
log_dir.mkdir(parents=True, exist_ok=True)

# NOTE: basicConfig does nothing if the root logger already has handlers
# (for example when this cell is re-run in the same kernel).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so non-ASCII text in log messages is written the same
        # way regardless of the platform's default encoding.
        logging.FileHandler(log_dir / 'vn30_data_loading.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

print("Logging configured successfully!")

In [None]:
# Initialize VNStock client and data loader
# The client is created with its default arguments and handed to the loader,
# which is the object the batch-loading cell calls later in this notebook.
client = VNStockClient()
data_loader = VN30DataLoader(client=client)
print("VNStock client and data loader initialized successfully!")

In [None]:
# Load VN30 constituents
# Reads the constituents CSV and produces:
#   symbols     - ordered list of unique, non-empty ticker strings
#   start_date / end_date - 'YYYY-MM-DD' strings spanning the last 10 calendar years
# Raises FileNotFoundError if the CSV is missing, and ValueError if it has no
# 'symbol' column or yields no usable symbols.
constituents_path = project_root / 'data' / 'symbols' / 'vn30_constituents.csv'

if not constituents_path.exists():
    raise FileNotFoundError(f"VN30 constituents file not found: {constituents_path}")

vn30_df = pd.read_csv(constituents_path)

# Fail with an actionable message instead of a bare KeyError on a malformed file.
if 'symbol' not in vn30_df.columns:
    raise ValueError(
        f"Expected a 'symbol' column in {constituents_path}, "
        f"found columns: {list(vn30_df.columns)}"
    )

# Blank cells are read back as NaN and stray whitespace survives read_csv;
# neither is a valid ticker, so drop/trim them before they reach the API.
cleaned_symbols = vn30_df['symbol'].dropna().astype(str).str.strip()
# dict.fromkeys removes duplicates while keeping the file's original order.
symbols = list(dict.fromkeys(ticker for ticker in cleaned_symbols if ticker))

if not symbols:
    raise ValueError(f"No symbols found in {constituents_path}")

print(f"Loaded {len(symbols)} VN30 constituents: {symbols[:5]}...")

# Define date range for historical data (10 years)
# Capture the clock once so both ends of the range derive from the same instant.
range_end = datetime.now()
end_date = range_end.strftime('%Y-%m-%d')
# DateOffset(years=10) steps back whole calendar years; 10*365 days would fall
# two or three days short because it ignores leap days.
start_date = (pd.Timestamp(range_end) - pd.DateOffset(years=10)).strftime('%Y-%m-%d')

# NOTE(review): the load_multiple_symbols call later in this notebook does not
# receive start_date/end_date — confirm the loader applies its own range.
print(f"Historical data range: {start_date} to {end_date}")

In [None]:
# Batch-load every VN30 symbol through the data loader and print a summary
for notice in (
    f"Starting data loading for {len(symbols)} VN30 constituents...",
    "This may take several minutes depending on API response times.",
    "Note: Existing data files will be skipped to avoid re-downloading.",
):
    print(notice)

# Keyword arguments for the batch call, gathered in one place.
loader_options = dict(
    symbols=symbols,
    base_path=project_root,
    force_reload=False,  # Skip existing files
    delay_between_symbols=1.0,
)
results = data_loader.load_multiple_symbols(**loader_options)

# Summary banner followed by one line per counter in the results mapping.
rule = "=" * 50
print("\n" + rule)
print("DATA LOADING SUMMARY")
print(rule)

summary_rows = (
    ("Total symbols processed", 'total_symbols'),
    ("Successful new loads", 'successful_loads'),
    ("Skipped (already exist)", 'skipped_loads'),
    ("Failed loads", 'failed_loads'),
)
for label, key in summary_rows:
    print(f"{label}: {results[key]}")

# The success rate is the only entry printed with a fixed precision.
print(f"Success rate: {results['success_rate']:.1f}%")
print("\n✓ VN30 data loading completed!")
print("Check the logs/vn30_data_loading.log file for detailed information.")

In [None]:
# Verification: spot-check the CSV files written for the first few symbols
print("Verifying loaded data...")

# Only the first three constituents are inspected.
sample_symbols = symbols[:3]

for symbol in sample_symbols:
    raw_dir = project_root / 'data' / 'symbols' / symbol / 'raw'

    # Nothing to list for this symbol; report it and move on.
    if not raw_dir.exists():
        print(f"\n{symbol}: Directory not found")
        continue

    csv_files = sorted(raw_dir.glob('*.csv'))
    print(f"\n{symbol}: {len(csv_files)} data files")

    for csv_file in csv_files:
        # A file that cannot be parsed is reported rather than aborting the check.
        try:
            row_count = len(pd.read_csv(csv_file))
            print(f"  - {csv_file.name}: {row_count} rows")
        except Exception as err:
            print(f"  - {csv_file.name}: Error reading ({err})")

print("\n✓ Data verification completed!")