In [1]:
# import os
# import sys
# import pandas as pd

# sys.path.append(os.path.abspath('./'))

# from utils import add_technical_indicators, calculate_return

# def create_long_term_windows(input_folder, output_folder, window_days=252, prediction_days=252):
#     os.makedirs(output_folder, exist_ok=True)
#     all_files = [f for f in os.listdir(input_folder) if f.endswith(".csv")]

#     for file in all_files:
#         symbol = file.replace("_data.csv", "")
#         df = pd.read_csv(os.path.join(input_folder, file), parse_dates=["Date"])
#         df = df.sort_values("Date").reset_index(drop=True)

#         windows = []  # list to hold all windows

#         for start in range(len(df) - window_days - prediction_days):
#             x_window = df.iloc[start:start + window_days].copy()
#             future_window = df.iloc[start + window_days:start + window_days + prediction_days]

#             if len(future_window) < prediction_days:
#                 continue

#             start_price = future_window.iloc[0]["Close"]
#             end_price = future_window.iloc[-1]["Close"]
#             label = calculate_return(start_price, end_price)

#             x_window_ta = add_technical_indicators(x_window)
#             x_window_ta["TargetReturn"] = label
#             x_window_ta["Symbol"] = symbol
#             x_window_ta["WindowID"] = start

        #     # Add a marker for each row so we can split back if needed
        #     x_window_ta["RowInWindow"] = list(range(len(x_window_ta)))

        #     windows.append(x_window_ta)

        # if windows:
        #     all_data = pd.concat(windows, ignore_index=True)
        #     output_file = os.path.join(output_folder, f"{symbol}_long_term_windows.csv")
        #     all_data.to_csv(output_file, index=False)
        #     print(f"✅ Processed {symbol}: {len(windows)} windows saved in one file.")
        # else:
        #     print(f"⚠️ Skipped {symbol}: No valid windows.")


import os
import pandas as pd
from utils import add_technical_indicators

def load_processed_symbols(log_path):
    """Return the set of symbols recorded in the log file at ``log_path``.

    The log is read as plain text with one symbol per line. Surrounding
    whitespace is stripped from every line, and lines that are empty after
    stripping are ignored so that a stray blank line never shows up as an
    empty-string "symbol".

    Parameters
    ----------
    log_path : str
        Path of the log file.

    Returns
    -------
    set of str
        The distinct symbols found in the file; an empty set when the file
        does not exist.
    """
    if not os.path.exists(log_path):
        return set()

    with open(log_path, "r") as f:
        # Iterate the handle directly: no need to materialise every line first.
        stripped_lines = (line.strip() for line in f)
        return {entry for entry in stripped_lines if entry}

def log_processed_symbol(log_path, symbol):
    """Append ``symbol`` on its own line to the log file at ``log_path``.

    The file is opened in append mode, so it is created when missing and
    existing entries are kept.
    """
    with open(log_path, mode="a") as log_handle:
        entry = symbol + "\n"
        log_handle.write(entry)

def create_long_term_windows(input_folder, output_folder, window_days=252, prediction_days=252):
    """Write rolling feature windows with a forward-return label for each symbol CSV.

    Every ``*.csv`` file in ``input_folder`` is read, sorted by its ``Date``
    column and passed once through ``add_technical_indicators``. The result
    is then sliced into overlapping windows of ``window_days`` rows. Each
    window is labelled with the relative change in ``Close`` between the first
    and the last row of the ``prediction_days`` rows that immediately follow it.
    All windows of one symbol are concatenated and saved as
    ``<symbol>_long_term_windows.csv`` in ``output_folder``.

    Symbols whose output was written are appended to
    ``processed_files.txt`` in ``output_folder`` and skipped on later runs.
    Symbols that produce no window are not logged, so they are tried again
    on the next run.

    Parameters
    ----------
    input_folder : str
        Folder holding the per-symbol CSV files. Each file needs at least
        the ``Date`` and ``Close`` columns.
    output_folder : str
        Folder for the window files and the progress log; created if missing.
    window_days : int, default 252
        Number of rows in each feature window.
    prediction_days : int, default 252
        Number of rows after the window used to compute the label.

    Returns
    -------
    None
        Results are written to disk and progress is printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Progress log: one already-finished symbol per line.
    log_file_path = os.path.join(output_folder, "processed_files.txt")
    processed_symbols = load_processed_symbols(log_file_path)

    # Sorted so the processing order does not depend on the file system.
    all_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".csv"))

    for file in all_files:
        symbol = file.replace("_data.csv", "")
        if symbol in processed_symbols:
            print(f"⏩ Skipping {symbol} (already processed)")
            continue

        df = pd.read_csv(os.path.join(input_folder, file), parse_dates=["Date"])
        df = df.sort_values("Date").reset_index(drop=True)
        # Indicators are computed once per symbol rather than once per window.
        # NOTE(review): the row count of the result is taken from df_ta, so this
        # works whether or not the helper drops rows — confirm against utils.
        df_ta = add_technical_indicators(df.copy())

        # The last usable start index is len - window_days - prediction_days
        # (inclusive): its future slice ends exactly on the final row. The
        # "+ 1" makes range() include it; a non-positive count yields no loop.
        window_count = len(df_ta) - window_days - prediction_days + 1

        windows = []
        for start in range(window_count):
            split = start + window_days
            future_window = df_ta.iloc[split:split + prediction_days]

            start_price = future_window.iloc[0]["Close"]
            end_price = future_window.iloc[-1]["Close"]
            label = (end_price - start_price) / start_price

            # Copy so the per-window columns never touch df_ta itself.
            x_window = df_ta.iloc[start:split].copy()
            x_window["TargetReturn"] = label
            x_window["Symbol"] = symbol
            x_window["WindowID"] = start
            # Position of each row inside its window, to allow splitting the
            # concatenated file back into windows.
            x_window["RowInWindow"] = list(range(len(x_window)))

            windows.append(x_window)

        if windows:
            all_data = pd.concat(windows, ignore_index=True)
            output_file = os.path.join(output_folder, f"{symbol}_long_term_windows.csv")
            all_data.to_csv(output_file, index=False)
            # Log only after the CSV is written so an interrupted run is retried.
            log_processed_symbol(log_file_path, symbol)
            print(f"✅ Processed {symbol}: {len(windows)} windows.")
        else:
            print(f"⚠️ Skipped {symbol}: No valid windows.")




In [3]:

# Build the long-term window files for every CSV in the stock-data folder,
# relying on the function defaults for window_days and prediction_days (252 each).
create_long_term_windows(
    input_folder="../data/stock_data",
    output_folder="../data/long_term_features"
)


⏩ Skipping 360ONE.NS (already processed)
⚠️ Skipped AADHARHFC.NS: No valid windows.
⏩ Skipping AAVAS.NS (already processed)
⏩ Skipping ABBOTINDIA.NS (already processed)
⏩ Skipping ABCAPITAL.NS (already processed)
⏩ Skipping ABSLAMC.NS (already processed)
⏩ Skipping ACC.NS (already processed)
⚠️ Skipped ACMESOLAR.NS: No valid windows.
⚠️ Skipped ADANIENSOL.NS: No valid windows.
⏩ Skipping ADANIENT.NS (already processed)
⏩ Skipping ADANIGREEN.NS (already processed)
⏩ Skipping ADANIPOWER.NS (already processed)
⚠️ Skipped AEGISLOG.NS: No valid windows.
⏩ Skipping AFFLE.NS (already processed)
⚠️ Skipped AIIL.NS: No valid windows.
⏩ Skipping AJANTPHARM.NS (already processed)
⚠️ Skipped AKUMS.NS: No valid windows.
⚠️ Skipped ALIVUS.NS: No valid windows.
⏩ Skipping ALKEM.NS (already processed)
⏩ Skipping ALOKINDS.NS (already processed)
⏩ Skipping AMBER.NS (already processed)
⏩ Skipping AMBUJACEM.NS (already processed)
⏩ Skipping ANANDRATHI.NS (already processed)
⏩ Skipping ANGELONE.NS (already