In [1]:
import os
import pandas as pd
from tqdm import tqdm

def _summarize_windows(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse one stock's rows into a single summary row per window.

    Rows are ordered by ``WindowID`` then ``RowInWindow``. For every window
    holding at least two rows, the summary takes all values from the window's
    first row, except ``Close_After_Year`` which is the ``Close`` of the
    window's last row. Returns an empty frame when no window qualifies.
    """
    df = df.sort_values(["WindowID", "RowInWindow"]).reset_index(drop=True)

    # Keep only rows whose WindowID is present and shared with another row:
    # that drops single-row windows and rows without a window at all.
    usable = df["WindowID"].notna() & df.duplicated("WindowID", keep=False)
    df = df[usable]
    if df.empty:
        return pd.DataFrame()

    # After the sort, the first/last occurrence of each WindowID is the
    # first/last row of that window; both frames list windows in the same order.
    first = df.drop_duplicates("WindowID", keep="first").reset_index(drop=True)
    last = df.drop_duplicates("WindowID", keep="last").reset_index(drop=True)

    columns = {
        "Date": first["Date"],
        "Symbol": first["Symbol"],
        "Open": first["Open"],
        "Close_Today": first["Close"],
        "Close_After_Year": last["Close"],
        "Target_Return": first["TargetReturn"],
    }
    consumed = {
        "Date", "Open", "Close", "TargetReturn",
        "Symbol", "WindowID", "RowInWindow",
    }
    # Every remaining column is carried over from the window's first row.
    # A name clash with a key above replaces that value in place.
    for name in first.columns:
        if name not in consumed:
            columns[name] = first[name]
    return pd.DataFrame(columns)


def create_window_based_dataset(input_folder: str, output_csv: str) -> None:
    """Build a one-row-per-window dataset from a folder of per-stock CSVs.

    Each ``*.csv`` file in ``input_folder`` is read (parsing ``Date`` as a
    date), reduced to one summary row per ``WindowID``, and the summaries of
    all files are written together to ``output_csv``.

    Args:
        input_folder: Directory containing the input CSV files. Each file
            needs the columns ``Date``, ``Symbol``, ``Open``, ``Close``,
            ``TargetReturn``, ``WindowID`` and ``RowInWindow``; any other
            column is copied through from the first row of each window.
        output_csv: Path of the CSV file to write (without the index).
    """
    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the output file is reproducible.
    all_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".csv"))
    frames = []

    for file in tqdm(all_files, desc="Processing stocks"):
        df = pd.read_csv(os.path.join(input_folder, file), parse_dates=["Date"])
        summary = _summarize_windows(df)
        if not summary.empty:
            frames.append(summary)

    # pd.concat rejects an empty list, so fall back to an empty frame.
    df_final = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df_final.to_csv(output_csv, index=False)
    print(f"✅ Final dataset saved to: {output_csv}")

# Example usage: read the per-stock CSVs from the features folder and write
# the combined one-row-per-window dataset next to this notebook.
create_window_based_dataset(
    input_folder="../data/long_term_features/",
    output_csv="long_term_dataset.csv",
)



Processing stocks: 100%|███████████████████████████████████████████████████████████| 172/172 [1:28:05<00:00, 30.73s/it]


✅ Final dataset saved to: long_term_dataset.csv
