In [1]:
# %%
# Test DataPreprocessor Import and Initialization
import sys
sys.path.append('../')

from src.stock_predictor.preprocessing import DataPreprocessor
import pandas as pd
import numpy as np
from pathlib import Path

print("✅ DataPreprocessor imported successfully!")

# Initialize the preprocessor from the project config file.
#
# Every later cell in this notebook calls methods on `preprocessor`, so a
# failure here is fatal for the rest of the run. The error is reported in the
# notebook's usual style and then re-raised: swallowing it would leave
# `preprocessor` undefined (or stale from an earlier run) and make every
# downstream cell fail with a misleading secondary error.
try:
    preprocessor = DataPreprocessor("../config/config.yaml")
    print("✅ DataPreprocessor initialized successfully!")
    print(f"📁 Raw data path: {preprocessor.raw_data_path}")
    print(f"📁 Processed data path: {preprocessor.processed_data_path}")
except Exception as e:
    print(f"❌ Initialization failed: {e}")
    raise

✅ DataPreprocessor imported successfully!
✅ DataPreprocessor initialized successfully!
📁 Raw data path: ../data/raw
📁 Processed data path: ../data/processed


In [2]:
# %%
# Check what raw data files are available for processing
raw_yahoo_path = Path("../data/raw/yahoo_finance")

# Start from an empty frame so that re-running this cell never leaves a stale
# `test_df` from a previous execution behind. The later cells guard on
# `not test_df.empty`, so an empty frame makes them skip cleanly.
test_df = pd.DataFrame()

if raw_yahoo_path.exists():
    # glob() yields paths in arbitrary, filesystem-dependent order; sorting
    # makes both the listing and the choice of test file reproducible.
    raw_files = sorted(raw_yahoo_path.glob("*.parquet"))
    print(f"🔍 Found {len(raw_files)} raw data files:")
    for file in raw_files:
        print(f"   📄 {file.name}")

    if raw_files:
        # Test loading a single file (the first one in sorted order)
        test_file = raw_files[0]
        try:
            test_df = preprocessor.load_raw_data(test_file)
            print(f"\n✅ Test file loaded: {len(test_df)} rows, {len(test_df.columns)} columns")
            print(f"📊 Columns: {list(test_df.columns)}")
            print(f"📅 Date range: {test_df['Datetime'].min()} to {test_df['Datetime'].max()}")
        except Exception as e:
            print(f"❌ Error loading test file: {e}")
    else:
        print("❌ No raw data files found! Run your data fetching notebook first.")
else:
    print("❌ Raw data directory doesn't exist! Run your data fetching notebook first.")

🔍 Found 1 raw data files:
   📄 RELIANCE_20250814_213917.parquet

✅ Test file loaded: 67 rows, 12 columns
📊 Columns: ['Datetime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Symbol', 'Source', 'Exchange', 'Currency']
📅 Date range: 2025-05-14 00:00:00+05:30 to 2025-08-14 00:00:00+05:30


In [3]:
# %%
# Smoke-test the technical-indicator calculation on the single stock that the
# raw-data cell loaded into `test_df`.
if 'test_df' not in locals() or test_df.empty:
    # Nothing was loaded earlier, so there is nothing to compute on.
    print("⚠️ No test data available for technical indicators test")
else:
    try:
        print("🔧 Testing technical indicators calculation...")

        # Result is kept in its own variable so the column counts of the raw
        # and enriched frames can be compared below.
        df_with_indicators = preprocessor.calculate_technical_indicators(test_df)

        print(f"✅ Technical indicators added successfully!")
        print(f"📊 Original columns: {len(test_df.columns)}")
        print(f"📊 After indicators: {len(df_with_indicators.columns)}")

        # Report which headline indicators are present, in the order they
        # appear in the resulting frame.
        indicator_columns = [
            name
            for name in df_with_indicators.columns
            if name in ('SMA_20', 'RSI', 'MACD', 'BB_Upper', 'BB_Lower', 'Volatility')
        ]
        print(f"🎯 Key indicators added: {indicator_columns}")

        # Preview the most recent rows, restricted to the preview columns
        # that actually exist in the frame.
        print("\n📈 Sample Technical Indicators (last 5 rows):")
        sample_cols = ['Datetime', 'Close', 'SMA_20', 'RSI', 'MACD', 'Volatility']
        available_cols = []
        for name in sample_cols:
            if name in df_with_indicators.columns:
                available_cols.append(name)
        print(df_with_indicators[available_cols].tail())

    except Exception as exc:
        print(f"❌ Technical indicators calculation failed: {exc}")

🔧 Testing technical indicators calculation...
✅ Technical indicators added successfully!
📊 Original columns: 12
📊 After indicators: 34
🎯 Key indicators added: ['SMA_20', 'MACD', 'RSI', 'BB_Upper', 'BB_Lower', 'Volatility']

📈 Sample Technical Indicators (last 5 rows):
                    Datetime    Close     SMA_20        RSI       MACD  \
62 2025-08-08 00:00:00+05:30  1362.36  1415.3275  33.847221 -23.424485   
63 2025-08-11 00:00:00+05:30  1380.69  1410.4720  43.030781 -22.870507   
64 2025-08-12 00:00:00+05:30  1374.91  1405.2430  38.043242 -22.633825   
65 2025-08-13 00:00:00+05:30  1377.10  1400.1135  43.860075 -22.016539   
66 2025-08-14 00:00:00+05:30  1373.80  1395.2770  46.058673 -21.543667   

    Volatility  
62    0.011546  
63    0.012178  
64    0.012133  
65    0.012161  
66    0.012147  


In [4]:
# %%
# Run the complete ML preprocessing pipeline on the single test stock and
# report the shape, calendar features and missing-value count of the result.
if 'test_df' not in locals() or test_df.empty:
    # No frame was loaded by the raw-data cell, so the pipeline cannot run.
    print("⚠️ No test data available for ML preprocessing test")
else:
    try:
        print("🚀 Testing complete ML preprocessing pipeline...")

        # Take the symbol from the first row when the column exists,
        # otherwise fall back to a placeholder label.
        if 'Symbol' in test_df.columns:
            symbol = test_df['Symbol'].iloc[0]
        else:
            symbol = "TEST_STOCK"

        # Full preprocessing run for this one symbol
        processed_df = preprocessor.preprocess_for_ml(test_df, symbol)

        print(f"✅ ML preprocessing completed for {symbol}!")
        print(f"📊 Final dataset shape: {processed_df.shape}")

        # Calendar-derived features, listed in the order they appear in the
        # processed frame.
        indian_features = [
            name
            for name in processed_df.columns
            if name in ('Day_of_Week', 'Month', 'Quarter', 'Is_Month_End', 'Is_Quarter_End')
        ]
        print(f"🇮🇳 Indian market features: {indian_features}")

        # Total number of null cells across every column
        missing_data = processed_df.isnull().sum().sum()
        print(f"🔍 Missing values after preprocessing: {missing_data}")

        # Preview the last rows using whichever preview columns exist
        print("\n📋 Final ML-ready dataset (last 3 rows):")
        display_cols = ['Datetime', 'Close', 'SMA_20', 'RSI', 'MACD', 'Day_of_Week', 'Is_Month_End']
        available_display = []
        for name in display_cols:
            if name in processed_df.columns:
                available_display.append(name)
        print(processed_df[available_display].tail(3))

    except Exception as exc:
        print(f"❌ ML preprocessing failed: {exc}")

🚀 Testing complete ML preprocessing pipeline...
✅ ML preprocessing completed for RELIANCE.NS!
📊 Final dataset shape: (67, 39)
🇮🇳 Indian market features: ['Day_of_Week', 'Month', 'Quarter', 'Is_Month_End', 'Is_Quarter_End']
🔍 Missing values after preprocessing: 0

📋 Final ML-ready dataset (last 3 rows):
                    Datetime    Close     SMA_20        RSI       MACD  \
64 2025-08-12 00:00:00+05:30  1374.91  1405.2430  38.043242 -22.633825   
65 2025-08-13 00:00:00+05:30  1377.10  1400.1135  43.860075 -22.016539   
66 2025-08-14 00:00:00+05:30  1373.80  1395.2770  46.058673 -21.543667   

    Day_of_Week  Is_Month_End  
64            1             0  
65            2             0  
66            3             0  


In [5]:
# %%
# Build LSTM input sequences from the processed frame and report the
# resulting array shapes.
if 'processed_df' not in locals() or processed_df.empty:
    # The ML preprocessing cell did not produce a usable frame.
    print("⚠️ No processed data available for LSTM sequence test")
else:
    try:
        print("🧠 Testing LSTM sequence creation...")

        # A short window is used here purely to keep the test small.
        X, y, scaler = preprocessor.create_sequences_for_lstm(
            processed_df,
            sequence_length=30,
            prediction_horizon=1,
        )

        print(f"✅ LSTM sequences created successfully!")
        print(f"📐 Input sequences shape: {X.shape}")
        print(f"📐 Target values shape: {y.shape}")
        print(f"🔧 Scaler type: {type(scaler).__name__}")

        # Count how many of the candidate feature columns exist in the
        # processed frame, for comparison with the last axis of X.
        # NOTE(review): this list presumably mirrors the feature set used
        # inside create_sequences_for_lstm - confirm against the module.
        expected_features = sum(
            1
            for name in (
                'Open', 'High', 'Low', 'Close', 'Volume',
                'SMA_5', 'SMA_10', 'SMA_20', 'RSI', 'MACD',
                'Price_Change', 'Volatility', 'Volume_Ratio', 'HL_Spread',
            )
            if name in processed_df.columns
        )

        print(f"🎯 Expected features: {expected_features}, Actual features: {X.shape[2]}")
        print(f"🎯 Sequence length: {X.shape[1]} time steps")
        print(f"🎯 Number of sequences: {X.shape[0]}")

    except Exception as exc:
        print(f"❌ LSTM sequence creation failed: {exc}")

🧠 Testing LSTM sequence creation...
✅ LSTM sequences created successfully!
📐 Input sequences shape: (37, 30, 14)
📐 Target values shape: (37,)
🔧 Scaler type: MinMaxScaler
🎯 Expected features: 14, Actual features: 14
🎯 Sequence length: 30 time steps
🎯 Number of sequences: 37


In [6]:
# %%
# Run the batch processor over all raw data files, then list the files it
# reports and verify each one on disk.
print("🏭 Testing batch processing of all raw data...")

try:
    # One entry per processed output file; each entry is used below as a
    # path object (.name, .exists(), .stat()).
    processed_files = preprocessor.process_all_raw_data()

    print(f"✅ Batch processing completed!")
    print(f"📁 Successfully processed {len(processed_files)} files")

    # Numbered listing of the outputs, starting at 1
    for position, out_path in enumerate(processed_files, start=1):
        print(f"   {position}. {out_path.name}")

    # Confirm each output exists and report its dimensions and size on disk
    print(f"\n📊 Processed files details:")
    for out_path in processed_files:
        if not out_path.exists():
            print(f"   ❌ {out_path.name}: File not found!")
            continue

        size_kb = out_path.stat().st_size / 1024  # bytes -> KB

        # Load the file only to read its shape
        n_rows, n_cols = pd.read_parquet(out_path).shape
        print(f"   📄 {out_path.name}: {n_rows} rows × {n_cols} cols ({size_kb:.1f} KB)")

except Exception as exc:
    print(f"❌ Batch processing failed: {exc}")


🏭 Testing batch processing of all raw data...
✅ Batch processing completed!
📁 Successfully processed 1 files
   1. RELIANCE_ml_ready_20250815_001203.parquet

📊 Processed files details:
   📄 RELIANCE_ml_ready_20250815_001203.parquet: 67 rows × 39 cols (35.1 KB)


In [7]:
# %%
# Final validation and summary
#
# Inspects the processed-data directory, validates the most recently written
# ML-ready file, and only announces readiness when every check passes.
print("🎯 PREPROCESSING PIPELINE VALIDATION SUMMARY")
print("=" * 50)

# Stays False unless a sample file is found AND passes every check below, so
# the closing "next step" message cannot be printed after a failed validation.
pipeline_ready = False

# Check processed data directory
processed_path = Path("../data/processed")
if processed_path.exists():
    # Sorted so the listing is reproducible (glob order is arbitrary).
    all_processed = sorted(processed_path.glob("*.parquet"))
    print(f"✅ Processed data directory: {len(all_processed)} files")

    # Check file naming convention
    ml_ready_files = [f for f in all_processed if "ml_ready" in f.name]
    print(f"✅ ML-ready files: {len(ml_ready_files)}")

    if ml_ready_files:
        # Validate the newest output rather than an arbitrary one: each batch
        # run writes a new file, so older files may be out of date.
        sample_file = max(ml_ready_files, key=lambda f: f.stat().st_mtime)
        sample_df = pd.read_parquet(sample_file)

        print(f"\n📋 Sample ML-ready dataset validation ({sample_file.name}):")
        print(f"   📊 Shape: {sample_df.shape}")
        print(f"   📅 Date range: {sample_df['Datetime'].min()} to {sample_df['Datetime'].max()}")

        # Check for key technical indicators
        key_indicators = ['SMA_20', 'RSI', 'MACD', 'BB_Upper', 'Volatility']
        available_indicators = [ind for ind in key_indicators if ind in sample_df.columns]
        print(f"   🎯 Technical indicators: {len(available_indicators)}/{len(key_indicators)} present")

        # Check for Indian market features
        indian_features = ['Day_of_Week', 'Month', 'Is_Month_End', 'Is_Quarter_End']
        available_indian = [feat for feat in indian_features if feat in sample_df.columns]
        print(f"   🇮🇳 Indian market features: {len(available_indian)}/{len(indian_features)} present")

        # Data quality check: total null cells across all columns
        missing_count = sample_df.isnull().sum().sum()
        print(f"   🔍 Data quality: {missing_count} missing values")

        # Ready only when every expected column exists and nothing is null.
        pipeline_ready = (
            len(available_indicators) == len(key_indicators)
            and len(available_indian) == len(indian_features)
            and missing_count == 0
        )

        if pipeline_ready:
            print(f"\n✅ PREPROCESSING PIPELINE: READY FOR LSTM MODEL TRAINING!")
        else:
            print("\n❌ PREPROCESSING PIPELINE: NOT READY - fix the failed checks above before training")
    else:
        print("❌ No ML-ready files found! Run the batch processing cell first.")

else:
    print("❌ No processed data directory found!")

if pipeline_ready:
    print("\n🚀 Next Step: Ready for Step 4 - LSTM Model Development")


🎯 PREPROCESSING PIPELINE VALIDATION SUMMARY
✅ Processed data directory: 1 files
✅ ML-ready files: 1

📋 Sample ML-ready dataset validation (RELIANCE_ml_ready_20250815_001203.parquet):
   📊 Shape: (67, 39)
   📅 Date range: 2025-05-14 00:00:00+05:30 to 2025-08-14 00:00:00+05:30
   🎯 Technical indicators: 5/5 present
   🇮🇳 Indian market features: 4/4 present
   🔍 Data quality: 0 missing values

✅ PREPROCESSING PIPELINE: READY FOR LSTM MODEL TRAINING!

🚀 Next Step: Ready for Step 4 - LSTM Model Development
