# In [None]:
import pandas as pd
import os
from datetime import datetime, timedelta

def get_resolution_seconds(resolution):
    """Translate a resolution code into its candle length in seconds.

    Codes ending in "S" are second-based ("5S" -> 5), purely numeric codes
    are minutes ("15" -> 900) and "D" maps to 22500. Any code that is not in
    the table yields 0, which callers can treat as "unknown resolution".
    """
    second_codes = (5, 10, 15, 30, 45)
    minute_codes = (1, 2, 3, 5, 10, 15, 20, 30, 60, 120, 240)

    seconds_by_code = {f"{secs}S": secs for secs in second_codes}
    for minutes in minute_codes:
        seconds_by_code[str(minutes)] = minutes * 60
    # 22500 s = 6 h 15 min; presumably one 09:15-15:30 session -- TODO confirm.
    seconds_by_code["D"] = 22500

    try:
        return seconds_by_code[resolution]
    except KeyError:
        return 0


def _expected_candle_index(day, step):
    """Return the timestamps at which a candle is expected on ``day``.

    ``day`` is a midnight timestamp and ``step`` the candle length as a
    ``pd.Timedelta``. The grid starts at 09:15:00 and never goes past
    15:29:55, the session bounds hard-coded by this script.
    """
    session_open = day + pd.Timedelta(hours=9, minutes=15)
    session_close = day + pd.Timedelta(hours=15, minutes=29, seconds=55)
    # A Timedelta step avoids the "S" offset alias deprecated in pandas 2.2.
    return pd.date_range(start=session_open, end=session_close, freq=step)


def _build_fill_rows(day_rows, expected):
    """Build synthetic candles for the grid slots missing from one day.

    A slot later than the day's last recorded row copies the nearest earlier
    on-grid candle; every other empty slot copies the nearest later on-grid
    candle. Slots with no such donor are left unfilled. Synthetic candles
    take the donor's Open/High/Low/Close and a Volume of 0.

    Returns a DataFrame (Datetime, Open, High, Low, Close, Volume) ordered by
    timestamp, or None when there is nothing to add.
    """
    stamps = day_rows['Datetime']

    # Only rows sitting exactly on the grid can act as donors. Duplicated
    # timestamps are collapsed so that each slot maps to exactly one row.
    on_grid = (
        day_rows.loc[stamps.isin(expected)]
        .drop_duplicates(subset='Datetime', keep='first')
        .set_index('Datetime')
    )

    # For each grid slot: the timestamp of the real candle on it, NaT for a gap.
    slot_source = pd.Series(on_grid.index, index=on_grid.index).reindex(expected)
    gap_mask = slot_source.isna().to_numpy()
    if not gap_mask.any():
        return None

    last_seen = stamps.max()
    # bfill -> next real candle, ffill -> previous real candle. The previous
    # candle is used only for slots after the day's last recorded row.
    donor_stamp = slot_source.bfill().where(expected <= last_seen, slot_source.ffill())
    donors = donor_stamp[gap_mask].dropna()
    if donors.empty:
        return None

    donor_rows = on_grid.reindex(pd.DatetimeIndex(donors))
    return pd.DataFrame({
        'Datetime': donors.index,
        'Open': donor_rows['Open'].to_numpy(),
        'High': donor_rows['High'].to_numpy(),
        'Low': donor_rows['Low'].to_numpy(),
        'Close': donor_rows['Close'].to_numpy(),
        'Volume': 0,
    })


def fill_missing_candles(resolution, data_folder, date_range_start, date_range_end):
    """
    Fill missing intraday candles in every CSV file of a resolution folder.

    For each ``*.csv`` in ``data_folder`` and each calendar day present in
    its ``Datetime`` column, the expected candle grid (09:15:00 up to
    15:29:55, one slot per resolution step) is compared with the recorded
    rows. Empty slots receive a zero-volume candle copied from a neighbouring
    candle of the same day (see ``_build_fill_rows``). Files that gain rows
    are sorted by ``Datetime`` and OVERWRITTEN in place.

    NOTE(review): gaps before the day's last row copy the *next* candle,
    i.e. prices from the future -- confirm this is acceptable for backtests.

    Args:
        resolution: Resolution code understood by ``get_resolution_seconds``
            (e.g. "5S"). Only resolutions shorter than 6 minutes are handled.
        data_folder: Directory containing the per-stock CSV files. Each file
            needs Datetime, Open, High, Low, Close and Volume columns.
        date_range_start: Printed in the banner only; not used for filtering.
        date_range_end: Printed in the banner only; not used for filtering.

    Returns:
        A list with one dict per modified file (keys: 'stock', 'filled',
        'before', 'after', 'timestamps' holding up to five sample filled
        timestamps), or None when the resolution is unknown or too coarse.
    """
    resolution_seconds = get_resolution_seconds(resolution)
    if resolution_seconds == 0:
        print(f"❌ Unknown resolution: {resolution}")
        return

    if resolution_seconds >= 360:
        print(f"❌ Resolution {resolution} >= 6 minutes. Only works for < 6 min resolutions")
        return

    print(f"\n{'='*100}")
    print(f"{'FILLING MISSING CANDLES':^100}")
    print(f"{'='*100}")
    print(f"Resolution: {resolution} ({resolution_seconds}s)")
    print(f"Data Folder: {data_folder}")
    print(f"Date Range: {date_range_start} to {date_range_end}")
    print(f"{'='*100}\n")

    # Sorted so that the processing order (and the log) is reproducible.
    csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))
    total_stocks = len(csv_files)

    print(f"Found {total_stocks} CSV files\n")

    step = pd.Timedelta(seconds=resolution_seconds)

    # Track changes
    stocks_processed = 0
    total_candles_filled = 0
    fill_summary = []

    for idx, csv_filename in enumerate(csv_files, 1):
        stock_name = csv_filename.replace(f"_Fyers_{resolution}.csv", "")
        csv_path = os.path.join(data_folder, csv_filename)

        print(f"[{idx}/{total_stocks}] Processing {stock_name}...", end=" ", flush=True)

        df = pd.read_csv(csv_path)
        df['Datetime'] = pd.to_datetime(df['Datetime'])
        df = df.sort_values('Datetime').reset_index(drop=True)
        original_count = len(df)

        # Group by calendar day without adding a helper column to the frame,
        # so a pre-existing "Date" column in the CSV is left untouched.
        day_fills = []
        for day, day_rows in df.groupby(df['Datetime'].dt.normalize()):
            fills = _build_fill_rows(day_rows, _expected_candle_index(day, step))
            if fills is not None:
                day_fills.append(fills)

        if not day_fills:
            print("✓ No missing candles")
            continue

        filled_rows = pd.concat(day_fills, ignore_index=True)
        df = pd.concat([df, filled_rows], ignore_index=True)
        df = df.sort_values('Datetime').reset_index(drop=True)
        df.to_csv(csv_path, index=False)

        new_count = len(df)
        candles_filled = new_count - original_count
        stocks_processed += 1
        total_candles_filled += candles_filled

        sample_timestamps = (
            filled_rows['Datetime'].head(5).dt.strftime('%Y-%m-%d %H:%M:%S').tolist()
        )
        fill_summary.append({
            'stock': stock_name,
            'filled': candles_filled,
            'before': original_count,
            'after': new_count,
            'timestamps': sample_timestamps  # First 5
        })

        print(f"✓ Filled {candles_filled} candles ({original_count:,} → {new_count:,})")

    # Print final summary
    print(f"\n{'='*100}")
    print(f"{'FILL SUMMARY':^100}")
    print(f"{'='*100}")
    print(f"Total stocks: {total_stocks}")
    print(f"Stocks with filled candles: {stocks_processed}")
    print(f"Total candles filled: {total_candles_filled:,}")

    if fill_summary:
        print(f"\n{'Stock':<20} {'Filled':<10} {'Before':<12} {'After':<12} {'Sample Timestamps'}")
        print(f"{'-'*100}")

        for item in sorted(fill_summary, key=lambda x: x['filled'], reverse=True):
            timestamps_str = ', '.join(item['timestamps'][:3])
            more = f" ... +{len(item['timestamps'])-3}" if len(item['timestamps']) > 3 else ""
            print(f"{item['stock']:<20} {item['filled']:<10} {item['before']:<12,} {item['after']:<12,} {timestamps_str}{more}")

    print(f"{'='*100}\n")

    return fill_summary


# ==================== EXECUTE ====================

# Resolution code of the candles to repair, and the folder holding the
# per-stock CSV files for that resolution.
resolution = "5S"
data_folder = r"D:\Programming\Download_Backtest_Deploy_data\1__Download\1__Download_data_Fyers_via_API\storage_Fyers_5S"

# The date range is informational only: it is echoed in the banner and plays
# no part in deciding which candles are missing.
summary = fill_missing_candles(
    resolution,
    data_folder,
    date_range_start="2025-01-01",
    date_range_end="2025-09-30",
)