# Set up pip installs and imports

In [None]:
!pip install stable_baselines3[extra]
!pip install --upgrade yfinance
!pip install --upgrade ipywidgets

In [None]:
import logging
import os
import random
import shutil
import time
from datetime import datetime, timedelta
from glob import glob
from urllib.parse import urlencode
from urllib.parse import urlparse

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import yfinance as yf
from bs4 import BeautifulSoup
from gymnasium import spaces
from stable_baselines3 import DDPG, PPO, SAC, TD3
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from transformers import pipeline

In [None]:
def delete_all_except(directory: str):
    """Remove every entry found directly inside ``directory``.

    Despite the name, nothing is excluded: regular files and symlinks are
    unlinked, sub-directories are removed recursively. The directory itself
    is left in place. A failure on one entry is logged and the sweep moves on
    to the next entry.

    Parameters:
    directory (str): Path of the directory whose contents are deleted

    Returns:
    None: Logs a warning and returns early when ``directory`` does not exist
    """
    if not os.path.exists(directory):
        logging.warning(f"Directory '{directory}' does not exist.")
        return

    for entry in os.listdir(directory):
        target = os.path.join(directory, entry)

        try:
            # Symlinks are unlinked rather than followed
            removable_as_file = os.path.isfile(target) or os.path.islink(target)
            if removable_as_file:
                os.remove(target)
                logging.info(f"Deleted file: {target}")
                continue

            if os.path.isdir(target):
                shutil.rmtree(target)
                logging.info(f"Deleted directory and its contents: {target}")
        except Exception as e:
            # Best effort: report the failure and keep going
            logging.error(f"Failed to delete {target}. Reason: {e}")

# WARNING: destructive. This empties the current working directory (every
# file, symlink and sub-directory in it) by passing it to delete_all_except.
current_directory = os.getcwd()
# Bare expression: it is neither assigned nor passed to anything
current_directory
delete_all_except(current_directory)


# Part 1: Data collection

Defining ticker list

In [None]:
# Symbols downloaded for the study: equity indices followed by commodity futures
tickers = [
    "^GSPC",      # S&P 500
    "^IXIC",      # NASDAQ Composite
    "^DJI",       # Dow Jones Industrial Average
    "^FCHI",      # CAC 40 (France)
    "^FTSE",      # FTSE 100 (UK)
    "^STOXX50E",  # EuroStoxx 50
    "^HSI",       # Hang Seng Index (Hong Kong)
    "000001.SS",  # Shanghai Composite (China)
    "^BSESN",     # BSE Sensex (India)
    "^NSEI",      # Nifty 50 (India)
    "^KS11",      # KOSPI (South Korea)
    "GC=F",       # Gold
    "SI=F",       # Silver
    "CL=F",       # WTI Crude Oil Futures
]

Data fetching

In [None]:
start_date = "2003-01-01"  # Start of the download window
end_date = "2024-12-31"    # End of the download window


def fetch_data(tickers, start, end, max_retries=5, delay=60):
    """
    Download closing prices with bounded retry on rate-limit errors.

    Parameters:
    tickers (list[str] | str): Symbols passed to yf.download
    start (str): Start date passed to yf.download
    end (str): End date passed to yf.download
    max_retries (int): How many times a "429" error is retried before giving up
    delay (float): Seconds to sleep between rate-limit retries

    Returns:
    pandas.DataFrame: The 'Close' selection of the download, or an empty
    DataFrame when the download fails or the retries are exhausted
    """
    # A bare string would make len() count characters instead of tickers
    if isinstance(tickers, str):
        tickers = [tickers]

    retries_left = max_retries
    while True:
        try:
            # Batch download, actions=False to avoid extra data
            data = yf.download(tickers, start=start, end=end, actions=False)
            if len(tickers) > 1:
                # Multi-ticker: take 'Close' from the multi-level columns
                return data['Close']
            return data[['Close']]  # Single ticker case
        except Exception as e:
            if "429" not in str(e):
                print(f"Error fetching data: {e}")
                return pd.DataFrame()  # Empty DataFrame on failure

            # Rate limited: retry a bounded number of times so a persistent
            # 429 cannot hang the run forever
            if retries_left <= 0:
                print(f"Rate limit hit. Giving up after {max_retries} retries.")
                return pd.DataFrame()
            retries_left -= 1
            print(f"Rate limit hit. Retrying after {delay} seconds...")
            time.sleep(delay)

# Pull the closing prices for the configured tickers and date window
data = fetch_data(tickers, start_date, end_date)

# Persist the result only when something was actually downloaded
if data.empty:
    print("No data to save.")
else:
    data.to_csv("stock_closing_prices.csv")
    print("Closing prices saved to stock_closing_prices.csv")

# Short pause to avoid rate limits on subsequent requests
time.sleep(2)

Data Cleaning

In [None]:

def clean_stock_data(input_csv, output_csv):
    """
    Clean stock closing prices with linear interpolation and edge filling.

    Interior gaps are filled by linear interpolation between the surrounding
    prices; missing values before the first or after the last observation are
    filled with the nearest available price.

    Parameters:
    input_csv (str): Path to the input CSV file (e.g., 'stock_closing_prices.csv')
    output_csv (str): Path to the output CSV file (e.g., 'clean_data.csv')

    Returns:
    None: Saves the cleaned data to output_csv. Errors are printed, not raised.
    """
    try:
        # Read the CSV, assuming Date is the index
        df = pd.read_csv(input_csv, index_col='Date', parse_dates=True)

        # Interpolate FIRST. Back-filling first would already replace every
        # interior gap with the next price and leave nothing to interpolate.
        # limit_direction='both' also extends the first/last valid value
        # outward over leading/trailing gaps.
        df = df.interpolate(method='linear', limit_direction='both')

        # Safety net for anything interpolation left at the edges
        df = df.bfill().ffill()

        # Save the cleaned data to a new CSV
        df.to_csv(output_csv)
        print(f"Cleaned data saved to {output_csv}")

        # Report any remaining NaN values (e.g., a column with no data at all)
        if df.isna().any().any():
            print("Warning: Some NaN values remain after cleaning.")
            print(df.isna().sum())

    except FileNotFoundError:
        print(f"Error: Input file {input_csv} not found.")
    except Exception as e:
        print(f"Error processing data: {e}")

# Example usage
if __name__ == "__main__":
    clean_stock_data("stock_closing_prices.csv", "clean_data.csv")

Normalize and log normalize

In [None]:
def process_stock_data(input_csv, normalized_output_csv, log_output_csv):
    """
    Derive normalized and log-evolution series from cleaned prices.

    Each ticker is rebased on its first non-null price, so its normalized
    series starts at 1 and its log-evolution series, ln(price / first price),
    starts at 0. A ticker with no prices, or whose first price is not
    positive, is written as all NaN in both outputs.

    Parameters:
    input_csv (str): Path to the input CSV file (e.g., 'clean_data.csv')
    normalized_output_csv (str): Path to the output CSV for normalized data
    log_output_csv (str): Path to the output CSV for log evolution data

    Returns:
    None: Saves the processed data to the specified output CSVs
    """
    try:
        # Date column becomes the index
        prices = pd.read_csv(input_csv, index_col='Date', parse_dates=True)

        if prices.empty:
            print("Error: Input CSV is empty.")
            return

        # Both outputs start as copies so index and column layout carry over
        normalized_df = prices.copy()
        log_evolution_df = prices.copy()

        for ticker in prices.columns:
            observed = prices[ticker].dropna()
            base_price = observed.iloc[0] if observed.size > 0 else None

            # Guard: nothing to rebase on, or a non-positive base price
            if base_price is None or not (base_price > 0):
                print(f"Warning: No valid first value for {ticker} or first value is zero.")
                normalized_df[ticker] = np.nan
                log_evolution_df[ticker] = np.nan
                continue

            rebased = prices[ticker] / base_price
            normalized_df[ticker] = rebased
            log_evolution_df[ticker] = np.log(rebased)

        # Write the normalized series
        normalized_df.to_csv(normalized_output_csv)
        print(f"Normalized data saved to {normalized_output_csv}")

        # Write the log-evolution series
        log_evolution_df.to_csv(log_output_csv)
        print(f"Log evolution data saved to {log_output_csv}")

        # Flag tickers that ended up entirely NaN
        blank_normalized = normalized_df.isna().all()
        if blank_normalized.any():
            print("Warning: Some tickers have all NaN in normalized data:",
                  normalized_df.columns[blank_normalized].tolist())
        blank_log = log_evolution_df.isna().all()
        if blank_log.any():
            print("Warning: Some tickers have all NaN in log evolution data:",
                  log_evolution_df.columns[blank_log].tolist())

    except FileNotFoundError:
        print(f"Error: Input file {input_csv} not found.")
    except Exception as e:
        print(f"Error processing data: {e}")

# Example usage
if __name__ == "__main__":
    process_stock_data("clean_data.csv", "normalized_data.csv", "log_evolution_data.csv")

Simulate EQWP (equal-weights portfolio)

In [None]:

def simulate_equal_weights_portfolio(input_csv, output_csv, initial_value=1):
    """
    Simulate a buy-and-hold equal weights portfolio and save its evolution.

    The initial value is split evenly across all tickers that have at least
    one price. Each share count is fixed from the ticker's first valid price
    and held for the whole period.

    Parameters:
    input_csv (str): Path to the input CSV file (e.g., 'clean_data.csv')
    output_csv (str): Path to the output CSV file (e.g., 'portfolio_evolution.csv')
    initial_value (float): Initial portfolio value (default: 1)

    Returns:
    None: Saves 'Portfolio_Value' and 'Log_Evolution' columns to output_csv
    """
    try:
        # Read the CSV, assuming Date is the index
        df = pd.read_csv(input_csv, index_col='Date', parse_dates=True)

        # Check if DataFrame is empty
        if df.empty:
            print("Error: Input CSV is empty.")
            return

        # Drop columns with all NaN values (e.g., failed tickers)
        df = df.dropna(axis=1, how='all')
        if df.empty:
            print("Error: No valid ticker data after dropping NaN columns.")
            return

        # Equal weight for each remaining ticker
        n_tickers = len(df.columns)
        weight = 1.0 / n_tickers
        initial_allocation = initial_value * weight

        # Every remaining column holds at least one price, so the first row
        # of the back-filled frame is each ticker's first valid price
        first_valid_prices = df.bfill().iloc[0]

        # Shares bought at the first valid price and held throughout
        shares = initial_allocation / first_valid_prices

        # Portfolio value: sum of (shares * price) for each ticker
        portfolio_values = (df * shares).sum(axis=1)

        # Log evolution: ln(portfolio_value / initial_value)
        log_evolution = np.log(portfolio_values / initial_value)

        # Create output DataFrame
        output_df = pd.DataFrame({
            'Portfolio_Value': portfolio_values,
            'Log_Evolution': log_evolution
        })

        # Save to CSV
        output_df.to_csv(output_csv)
        print(f"Portfolio evolution saved to {output_csv}")

        # Report any NaN values in the output
        if output_df.isna().any().any():
            print("Warning: Some NaN values in portfolio evolution.")
            print(output_df.isna().sum())

    except FileNotFoundError:
        print(f"Error: Input file {input_csv} not found.")
    except Exception as e:
        print(f"Error processing data: {e}")

# Example usage
if __name__ == "__main__":
    simulate_equal_weights_portfolio("clean_data.csv", "portfolio_evolution.csv")

Monthly Split

In [None]:
import pandas as pd

def split_normalized_by_month(input_csv, output_dir="."):
    """
    Split normalized data CSV into monthly CSVs named {yyyy}_{mm}_closing.csv.

    Parameters:
    input_csv (str): Path to the input CSV file (e.g., 'normalized_data.csv')
    output_dir (str): Directory to save output CSVs (default: current directory).
        Created if it does not exist yet.

    Returns:
    None: Saves monthly CSVs to the specified directory
    """
    try:
        # Read the CSV, assuming Date is the index
        df = pd.read_csv(input_csv, index_col='Date', parse_dates=True)

        # Check if DataFrame is empty
        if df.empty:
            print("Error: Input CSV is empty.")
            return

        # Without this, every save below fails when output_dir is missing
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Group data by year and month
        grouped = df.groupby([df.index.year, df.index.month])

        # Iterate through each year-month group
        for (year, month), group_data in grouped:
            # Format the output filename (e.g., 2003_01_closing.csv)
            output_file = f"{output_dir}/{year}_{month:02d}_closing.csv"

            # Save the group data to a CSV
            group_data.to_csv(output_file)
            print(f"Saved {output_file} with {len(group_data)} rows")

            # Warn if the group has NaN values
            if group_data.isna().any().any():
                print(f"Warning: {output_file} contains NaN values.")
                nan_columns = group_data.columns[group_data.isna().any()].tolist()
                print(f"Columns with NaN: {nan_columns}")

        print("All monthly CSVs generated successfully.")

    except FileNotFoundError:
        print(f"Error: Input file {input_csv} not found.")
    except Exception as e:
        print(f"Error processing data: {e}")

# Example usage
if __name__ == "__main__":
    split_normalized_by_month("normalized_data.csv")

Monthly metrics computation

In [None]:

def _max_drawdown(series):
    """Return the largest peak-to-trough decline of ``series`` as a fraction of the peak."""
    cumulative = series / series.iloc[0]  # Normalize to start at 1
    running_peak = cumulative.cummax()
    drawdown = (running_peak - cumulative) / running_peak
    return drawdown.max() if drawdown.size > 0 else np.nan


def _build_monthly_metrics(df, log_returns):
    """Build the per-ticker metrics table for one month of prices and daily log returns."""
    metrics = pd.DataFrame(index=df.columns)

    # First / last non-null price of the month. Callers pass frames whose
    # all-NaN columns were dropped, so every column has at least one value.
    metrics['Month_First_Close'] = df.bfill().iloc[0]
    metrics['Month_Last_Close'] = df.ffill().iloc[-1]

    annualized_mean = log_returns.mean() * 252

    # Volatility: annualized standard deviation of daily log returns
    metrics['Volatility'] = log_returns.std() * np.sqrt(252)

    # Sharpe Ratio: annualized mean log return / volatility (risk-free rate = 0)
    metrics['Sharpe_Ratio'] = annualized_mean / metrics['Volatility']

    # Sortino Ratio: the denominator is the annualized standard deviation of
    # the returns with non-negative days replaced by 0
    downside_returns = log_returns.where(log_returns < 0, 0)
    downside_vol = downside_returns.std() * np.sqrt(252)
    metrics['Sortino_Ratio'] = annualized_mean / downside_vol

    # Maximum Drawdown (MDD): max peak-to-trough decline
    metrics['MDD'] = df.apply(_max_drawdown)

    # Calmar Ratio: month return compounded over 12 months / MDD
    monthly_return = (metrics['Month_Last_Close'] / metrics['Month_First_Close']) - 1
    annualized_return = (1 + monthly_return) ** 12.0 - 1
    metrics['Calmar_Ratio'] = annualized_return / metrics['MDD']

    # Replace infinities with NaN (e.g., zero volatility or MDD)
    return metrics.replace([np.inf, -np.inf], np.nan)


def calculate_monthly_metrics(input_dir=".", output_dir="."):
    """
    Generate monthly observation and correlation CSVs from normalized closing data.

    For each {yyyy}_{mm}_closing.csv, creates:
    - {yyyy}_{mm}_obs.csv: First close, last close, volatility, Sharpe, Sortino, MDD, Calmar
    - {yyyy}_{mm}_corr.csv: Correlation matrix of daily log returns

    Files matching *_closing.csv whose prefix is not {yyyy}_{mm} are skipped
    with a warning. Files are processed in sorted filename order.

    Parameters:
    input_dir (str): Directory containing input CSVs (default: current directory)
    output_dir (str): Directory to save output CSVs (default: current directory).
        Created if it does not exist yet.

    Returns:
    None: Saves observation and correlation CSVs to output_dir
    """
    try:
        # Find all monthly closing CSVs; sorted so runs are reproducible
        closing_files = sorted(glob(f"{input_dir}/*_closing.csv"))
        if not closing_files:
            print(f"Error: No *_closing.csv files found in {input_dir}.")
            return

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        for file in closing_files:
            # Extract year and month from filename (e.g., 2003_01_closing.csv)
            filename = os.path.basename(file)
            year_month = filename.replace("_closing.csv", "")

            # Skip stray files per file instead of aborting the whole run
            name_parts = year_month.split("_")
            if len(name_parts) != 2 or not all(part.isdigit() for part in name_parts):
                print(f"Warning: {filename} is not named {{yyyy}}_{{mm}}_closing.csv. Skipping.")
                continue

            # Read the monthly CSV
            df = pd.read_csv(file, index_col='Date', parse_dates=True)

            if df.empty:
                print(f"Warning: {filename} is empty. Skipping.")
                continue

            # Drop columns with all NaN values
            df = df.dropna(axis=1, how='all')
            if df.empty:
                print(f"Warning: {filename} has no valid data after dropping NaN columns. Skipping.")
                continue

            # Daily log returns: ln(price_t / price_{t-1}); rows with any NaN are dropped
            log_returns = np.log(df / df.shift(1)).dropna()

            metrics = _build_monthly_metrics(df, log_returns)

            # Save observation CSV
            obs_file = f"{output_dir}/{year_month}_obs.csv"
            metrics.to_csv(obs_file)
            print(f"Saved {obs_file} with metrics for {len(metrics)} tickers")

            # Warn about NaN values in metrics
            if metrics.isna().any().any():
                print(f"Warning: {obs_file} contains NaN values.")
                nan_columns = metrics.columns[metrics.isna().any()].tolist()
                print(f"Metrics with NaN: {nan_columns}")

            # Correlation matrix of daily log returns
            if log_returns.empty:
                print(f"Warning: No valid log returns for {year_month}. Skipping correlation matrix.")
                continue

            corr_matrix = log_returns.corr()
            corr_file = f"{output_dir}/{year_month}_corr.csv"
            corr_matrix.to_csv(corr_file)
            print(f"Saved {corr_file} with correlation matrix")

            # Warn about NaN values in correlation matrix
            if corr_matrix.isna().any().any():
                print(f"Warning: {corr_file} contains NaN values.")
                nan_columns = corr_matrix.columns[corr_matrix.isna().any()].tolist()
                print(f"Columns with NaN: {nan_columns}")

        print("All observation and correlation CSVs generated successfully.")

    except Exception as e:
        print(f"Error processing files: {e}")

# Example usage
if __name__ == "__main__":
    calculate_monthly_metrics()

Addressing the issue of NaN

In [None]:
import pandas as pd
import os
from glob import glob

def replace_nan_with_zeros(input_dir="."):
    """
    Overwrite the pipeline CSVs in ``input_dir`` with NaN values set to 0.

    Files touched (when present):
    - normalized_data.csv
    - log_evolution_data.csv
    - portfolio_evolution.csv
    - {yyyy}_{mm}_closing.csv
    - {yyyy}_{mm}_obs.csv
    - {yyyy}_{mm}_corr.csv

    Parameters:
    input_dir (str): Directory containing the CSV files (default: current directory)

    Returns:
    None: Each file is rewritten in place; per-file errors are printed
    """
    try:
        fixed_names = (
            "normalized_data.csv",
            "log_evolution_data.csv",
            "portfolio_evolution.csv",
        )
        monthly_globs = ("*_closing.csv", "*_obs.csv", "*_corr.csv")

        # Fixed-name files first (only those that exist), then the monthly files
        targets = [
            candidate
            for candidate in (os.path.join(input_dir, name) for name in fixed_names)
            if os.path.exists(candidate)
        ]
        for wildcard in monthly_globs:
            targets += glob(os.path.join(input_dir, wildcard))

        if not targets:
            print(f"Error: No matching CSV files found in {input_dir}.")
            return

        for file in targets:
            filename = os.path.basename(file)
            try:
                # _obs/_corr files are indexed by their first column (tickers);
                # everything else is indexed by Date
                if filename.endswith(("_obs.csv", "_corr.csv")):
                    read_options = {"index_col": 0}
                else:
                    read_options = {"index_col": 'Date', "parse_dates": True}
                df = pd.read_csv(file, **read_options).fillna(0)

                # Write back over the source file
                df.to_csv(file)
                print(f"Processed {filename}: Replaced NaN with 0s")

                # Sanity check on the in-memory result
                if df.isna().any().any():
                    print(f"Warning: {filename} still contains NaN values after processing.")

            except Exception as e:
                print(f"Error processing {filename}: {e}")

        print("All specified CSVs processed successfully.")

    except Exception as e:
        print(f"Error accessing files: {e}")

# Example usage
if __name__ == "__main__":
    replace_nan_with_zeros()

Combining metrics

In [None]:

def combine_monthly_metrics(input_dir=".", output_dir="."):
    """
    Flatten each month's metrics and correlations into one single-column CSV.

    For every {yyyy}_{mm}_obs.csv that has a matching {yyyy}_{mm}_corr.csv,
    writes {yyyy}_{mm}_combined.csv with one 'Value' column holding:
    - one row per (ticker, metric), labelled "{ticker}_{metric}"
    - one row per ticker pair from the upper triangle of the correlation
      matrix (diagonal excluded), labelled "{ticker1}_{ticker2}_Correlation"

    Parameters:
    input_dir (str): Directory containing input CSVs (default: current directory)
    output_dir (str): Directory to save output CSVs (default: current directory)

    Returns:
    None: Saves combined CSVs to output_dir
    """
    try:
        obs_files = glob(f"{input_dir}/*_obs.csv")
        if not obs_files:
            print(f"Error: No *_obs.csv files found in {input_dir}.")
            return

        for obs_file in obs_files:
            # e.g. 2003_01_obs.csv -> 2003_01
            filename = os.path.basename(obs_file)
            year_month = filename.replace("_obs.csv", "")

            # A month without its correlation file cannot be combined
            corr_file = f"{input_dir}/{year_month}_corr.csv"
            if not os.path.exists(corr_file):
                print(f"Warning: {corr_file} not found. Skipping {year_month}.")
                continue

            try:
                # Both files are indexed by ticker in their first column
                obs_df = pd.read_csv(obs_file, index_col=0)
                corr_df = pd.read_csv(corr_file, index_col=0)

                if obs_df.empty:
                    print(f"Warning: {filename} is empty. Skipping {year_month}.")
                    continue
                if corr_df.empty:
                    print(f"Warning: {corr_file} is empty. Skipping {year_month}.")
                    continue

                # Metrics, ticker by ticker
                metric_rows = [
                    (f"{ticker}_{metric}", obs_df.at[ticker, metric])
                    for ticker in obs_df.index
                    for metric in obs_df.columns
                ]

                # Upper triangle of the correlation matrix, diagonal excluded
                corr_labels = corr_df.index
                correlation_rows = [
                    (f"{first}_{second}_Correlation", corr_df.at[first, second])
                    for position, first in enumerate(corr_labels)
                    for second in corr_labels[position + 1:]
                ]

                combined_data = metric_rows + correlation_rows
                row_labels = [label for label, _ in combined_data]
                row_values = [value for _, value in combined_data]
                combined_df = pd.DataFrame(row_values, index=row_labels, columns=['Value'])

                output_file = f"{output_dir}/{year_month}_combined.csv"
                combined_df.to_csv(output_file)
                print(f"Saved {output_file} with {len(combined_df)} rows")

                # Zeros may be placeholders left by the earlier NaN replacement
                if combined_df['Value'].eq(0).any():
                    print(f"Note: {output_file} contains zero values (from prior NaN replacement).")

            except Exception as e:
                print(f"Error processing {year_month}: {e}")

        print("All combined CSVs generated successfully.")

    except Exception as e:
        print(f"Error accessing files: {e}")

# Example usage
if __name__ == "__main__":
    combine_monthly_metrics()

File Organization

In [None]:

def organize_csv_files(input_dir=".", output_dir="."):
    """
    Sort the pipeline's CSV files into combined, price and usage folders.

    Destination folders (created under output_dir):
    - combined: {yyyy}_{mm}_combined.csv
    - price: clean_data.csv, normalized_data.csv, log_evolution_data.csv,
      portfolio_evolution.csv, stock_closing_prices.csv
    - usage: {yyyy}_{mm}_closing.csv, {yyyy}_{mm}_obs.csv, {yyyy}_{mm}_corr.csv

    A file whose name already exists in its destination folder is left where
    it is.

    Parameters:
    input_dir (str): Directory containing the CSV files (default: current directory)
    output_dir (str): Directory to create folders and move files (default: current directory)

    Returns:
    None: Moves CSV files to appropriate folders
    """
    try:
        def in_input(name_or_pattern):
            return os.path.join(input_dir, name_or_pattern)

        price_names = (
            "clean_data.csv",
            "normalized_data.csv",
            "log_evolution_data.csv",
            "portfolio_evolution.csv",
            "stock_closing_prices.csv",
        )

        # Destination folder -> files currently sitting in input_dir
        folders = {
            "combined": glob(in_input("*_combined.csv")),
            "price": [path for path in map(in_input, price_names) if os.path.exists(path)],
            "usage": (
                glob(in_input("*_closing.csv"))
                + glob(in_input("*_obs.csv"))
                + glob(in_input("*_corr.csv"))
            ),
        }

        # Nothing matched at all: report and stop before creating folders
        if not any(folders.values()):
            print(f"Error: No matching CSV files found in {input_dir}.")
            return

        # Make sure every destination exists
        for folder in folders:
            folder_path = os.path.join(output_dir, folder)
            os.makedirs(folder_path, exist_ok=True)
            print(f"Created/Verified folder: {folder_path}")

        # Relocate the files
        for folder, files in folders.items():
            target_dir = os.path.join(output_dir, folder)
            for file in files:
                filename = os.path.basename(file)
                target_path = os.path.join(target_dir, filename)

                # Never overwrite a file that is already in place
                if os.path.exists(target_path):
                    print(f"Skipped {filename}: Already exists in {target_dir}")
                    continue

                try:
                    shutil.move(file, target_path)
                except Exception as e:
                    print(f"Error moving {filename}: {e}")
                else:
                    print(f"Moved {filename} to {target_dir}")

        print("All CSV files organized successfully.")

    except Exception as e:
        print(f"Error organizing files: {e}")

# Example usage
if __name__ == "__main__":
    organize_csv_files()

Generate Plots

In [None]:

def generate_plots(input_dir=".", output_dir="."):
    """
    Generate plots and correlation heatmaps from CSV files.

    Creates:
    - plots/normalized_evolution.png: Normalized prices for all tickers
    - plots/log_evolution.png: Log evolution for all tickers
    - plots/log_evolution_portfolio.png: Log evolution + portfolio log evolution
    - plots/normalized_portfolio.png: Normalized prices + normalized portfolio
    - plots/corr/{yyyy}_{mm}_corr_heatmap.png: Heatmaps for monthly correlations

    Parameters:
    input_dir (str): Directory containing price and usage folders (default: current)
    output_dir (str): Directory to save plots folder (default: current)

    Returns:
    None: Saves plots and heatmaps to plots and plots/corr folders
    """
    try:
        # Create plots and corr folders
        plots_dir = os.path.join(output_dir, "plots")
        corr_dir = os.path.join(plots_dir, "corr")
        os.makedirs(plots_dir, exist_ok=True)
        os.makedirs(corr_dir, exist_ok=True)
        print(f"Created/Verified folders: {plots_dir}, {corr_dir}")

        # Paths to input files
        price_dir = os.path.join(input_dir, "price")
        usage_dir = os.path.join(input_dir, "usage")

        # Load data for plots, handling missing files
        normalized_df = None
        log_evolution_df = None
        portfolio_df = None

        normalized_file = os.path.join(price_dir, "normalized_data.csv")
        if os.path.exists(normalized_file):
            try:
                normalized_df = pd.read_csv(normalized_file, index_col="Date", parse_dates=True)
            except Exception as e:
                print(f"Error reading {normalized_file}: {e}")
        else:
            print(f"Warning: {normalized_file} not found. Skipping normalized plots.")

        log_evolution_file = os.path.join(price_dir, "log_evolution_data.csv")
        if os.path.exists(log_evolution_file):
            try:
                log_evolution_df = pd.read_csv(log_evolution_file, index_col="Date", parse_dates=True)
            except Exception as e:
                print(f"Error reading {log_evolution_file}: {e}")
        else:
            print(f"Warning: {log_evolution_file} not found. Skipping log evolution plots.")

        portfolio_file = os.path.join(price_dir, "portfolio_evolution.csv")
        if os.path.exists(portfolio_file):
            try:
                portfolio_df = pd.read_csv(portfolio_file, index_col="Date", parse_dates=True)
            except Exception as e:
                print(f"Error reading {portfolio_file}: {e}")
        else:
            print(f"Warning: {portfolio_file} not found. Skipping portfolio-related plots.")

        # Plot 1: Normalized Evolution
        if normalized_df is not None and not normalized_df.empty:
            plt.figure(figsize=(12, 6))
            for column in normalized_df.columns:
                if normalized_df[column].eq(0).all():
                    print(f"Warning: {column} has all zero values in normalized_data.csv. Skipping.")
                    continue
                plt.plot(normalized_df.index, normalized_df[column], label=column)
            plt.title("Normalized Evolution of Tickers")
            plt.xlabel("Date")
            plt.ylabel("Normalized Price (First Point = 1)")
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, "normalized_evolution.png"))
            plt.close()
            print("Saved plots/normalized_evolution.png")
        else:
            print("Skipping normalized_evolution.png: No valid normalized data.")

        # Plot 2: Log Evolution
        if log_evolution_df is not None and not log_evolution_df.empty:
            plt.figure(figsize=(12, 6))
            for column in log_evolution_df.columns:
                if log_evolution_df[column].eq(0).all():
                    print(f"Warning: {column} has all zero values in log_evolution_data.csv. Skipping.")
                    continue
                plt.plot(log_evolution_df.index, log_evolution_df[column], label=column)
            plt.title("Log Evolution of Tickers")
            plt.xlabel("Date")
            plt.ylabel("Log Evolution (ln(price / price_0))")
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, "log_evolution.png"))
            plt.close()
            print("Saved plots/log_evolution.png")
        else:
            print("Skipping log_evolution.png: No valid log evolution data.")

        # Plot 3: Log Evolution + Portfolio
        if log_evolution_df is not None and portfolio_df is not None and not log_evolution_df.empty and not portfolio_df.empty:
            plt.figure(figsize=(12, 6))
            for column in log_evolution_df.columns:
                if log_evolution_df[column].eq(0).all():
                    print(f"Warning: {column} has all zero values in log_evolution_data.csv. Skipping.")
                    continue
                plt.plot(log_evolution_df.index, log_evolution_df[column], label=column)
            plt.plot(portfolio_df.index, portfolio_df["Log_Evolution"],
                    label="Equal Weights Portfolio", linewidth=2, linestyle="--")
            plt.title("Log Evolution of Tickers and Equal Weights Portfolio")
            plt.xlabel("Date")
            plt.ylabel("Log Evolution")
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, "log_evolution_portfolio.png"))
            plt.close()
            print("Saved plots/log_evolution_portfolio.png")
        else:
            print("Skipping log_evolution_portfolio.png: Missing log evolution or portfolio data.")

        # Plot 4: Normalized + Portfolio
        if normalized_df is not None and portfolio_df is not None and not normalized_df.empty and not portfolio_df.empty:
            # Normalize portfolio value to start at 1
            portfolio_normalized = portfolio_df["Portfolio_Value"] / portfolio_df["Portfolio_Value"].iloc[0]
            plt.figure(figsize=(12, 6))
            for column in normalized_df.columns:
                if normalized_df[column].eq(0).all():
                    print(f"Warning: {column} has all zero values in normalized_data.csv. Skipping.")
                    continue
                plt.plot(normalized_df.index, normalized_df[column], label=column)
            plt.plot(portfolio_normalized.index, portfolio_normalized,
                    label="Equal Weights Portfolio", linewidth=2, linestyle="--")
            plt.title("Normalized Evolution of Tickers and Equal Weights Portfolio")
            plt.xlabel("Date")
            plt.ylabel("Normalized Value (First Point = 1)")
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, "normalized_portfolio.png"))
            plt.close()
            print("Saved plots/normalized_portfolio.png")
        else:
            print("Skipping normalized_portfolio.png: Missing normalized or portfolio data.")

        # Generate correlation matrix heatmaps
        corr_files = glob(os.path.join(usage_dir, "*_corr.csv"))
        if not corr_files:
            print(f"Warning: No *_corr.csv files found in {usage_dir}.")
        else:
            for corr_file in corr_files:
                try:
                    # Extract year and month from filename
                    filename = os.path.basename(corr_file)
                    year_month = filename.replace("_corr.csv", "")

                    # Read correlation matrix
                    corr_df = pd.read_csv(corr_file, index_col=0)

                    # Skip empty or invalid matrices
                    if corr_df.empty or corr_df.eq(0).all().all():
                        print(f"Warning: {filename} is empty or all zeros. Skipping.")
                        continue

                    # Plot heatmap
                    plt.figure(figsize=(10, 8))
                    sns.heatmap(corr_df, annot=True, cmap="coolwarm", vmin=-1, vmax=1,
                               center=0, fmt=".2f")
                    plt.title(f"Correlation Matrix - {year_month}")
                    output_file = os.path.join(corr_dir, f"{year_month}_corr_heatmap.png")
                    plt.savefig(output_file, bbox_inches="tight")
                    plt.close()
                    print(f"Saved {output_file}")

                except Exception as e:
                    print(f"Error processing {filename}: {e}")

        print("All available plots and heatmaps generated successfully.")

    except Exception as e:
        print(f"Error generating plots: {e}")

# Example usage: only runs when this code is executed directly (not imported).
if __name__ == "__main__":
    # Assumes the organized folder structure from the previous step exists under
    # ./organized - TODO confirm the meaning of both arguments against the
    # generate_plots signature.
    generate_plots(".", "./organized")

Extract combined metrics

In [None]:

def extract_combined_metrics(input_dir=".", output_dir="."):
    """
    Copy the Value column of every combined CSV into observation/metrics.

    Processes:
    - All {yyyy}_{mm}_combined.csv found in input_dir/combined/ or, when that
      folder holds none, in input_dir/organized/combined/
    - Saves each one (index column + Value column) to
      output_dir/observation/metrics/ under the same filename

    Files that are empty, or whose only data column is not "Value", are
    skipped with a warning.

    Parameters:
    input_dir (str): The organized folder itself, or its parent directory (default: current)
    output_dir (str): Directory in which observation/metrics is created (default: current)

    Returns:
    None: Saves extracted CSVs to observation/metrics
    """
    try:
        # Create observation/metrics folder
        metrics_dir = os.path.join(output_dir, "observation", "metrics")
        os.makedirs(metrics_dir, exist_ok=True)
        print(f"Created/Verified folder: {metrics_dir}")

        # input_dir may be the organized folder or its parent, so probe both
        # layouts and use the first one that actually contains combined CSVs.
        candidate_dirs = [
            os.path.join(input_dir, "combined"),
            os.path.join(input_dir, "organized", "combined"),
        ]
        combined_files = []
        for candidate in candidate_dirs:
            combined_files = sorted(glob(os.path.join(candidate, "*_combined.csv")))
            if combined_files:
                break
        if not combined_files:
            searched = " or ".join(candidate_dirs)
            print(f"Error: No *_combined.csv files found in {searched}.")
            return

        failed = 0
        for file in combined_files:
            # Bound outside the try so the error message below can always use it
            filename = os.path.basename(file)
            try:
                # First CSV column becomes the index, Value is the data column
                df = pd.read_csv(file, index_col=0)

                if df.empty:
                    print(f"Warning: {filename} is empty. Skipping.")
                    continue

                if list(df.columns) != ["Value"]:
                    print(f"Warning: {filename} does not have exactly one data column (Value). Skipping.")
                    continue

                # Keep the index next to the Value column
                output_df = df[["Value"]]

                # Save to observation/metrics with the same filename
                output_file = os.path.join(metrics_dir, filename)
                output_df.to_csv(output_file)
                print(f"Saved {output_file} with {len(output_df)} rows")

                # Note if all values are zero
                if output_df["Value"].eq(0).all():
                    print(f"Note: {filename} contains all zero values.")

            except Exception as e:
                failed += 1
                print(f"Error processing {filename}: {e}")

        if failed:
            print(f"Finished with errors: {failed} of {len(combined_files)} combined CSVs could not be processed.")
        else:
            print("All combined metrics CSVs processed successfully.")

    except Exception as e:
        print(f"Error accessing files: {e}")

# Example usage: only runs when this code is executed directly (not imported).
if __name__ == "__main__":
    # The current directory serves as both the input root and the output root,
    # so the extracted files land in ./observation/metrics.
    extract_combined_metrics(".", ".")

Remove index column

In [None]:

def remove_index_column(metrics_dir):
    """
    Rewrite each CSV in metrics_dir so that only the raw Value entries remain.

    Processes:
    - Every *.csv directly inside metrics_dir (e.g., 2003_01_combined.csv)
    - Each file is overwritten in place with its Value column only,
      written without index and without header row

    Files that are empty, or whose single data column is not named "Value",
    are left unchanged and reported with a warning.

    Parameters:
    metrics_dir (str): Path to observation/metrics directory

    Returns:
    None: Overwrites CSVs in metrics_dir
    """
    try:
        targets = glob(os.path.join(metrics_dir, "*.csv"))
        if len(targets) == 0:
            print(f"Error: No CSV files found in {metrics_dir}.")
            return

        for csv_path in targets:
            try:
                filename = os.path.basename(csv_path)

                # The first column is consumed as the index; Value should be all that is left
                frame = pd.read_csv(csv_path, index_col=0)

                if frame.empty:
                    print(f"Warning: {filename} is empty. Skipping.")
                    continue

                only_value_column = list(frame.columns) == ["Value"]
                if not only_value_column:
                    print(f"Warning: {filename} does not have exactly one data column (Value). Skipping.")
                    continue

                value_series = frame["Value"]

                # Overwrite the source file: one value per line, nothing else
                value_series.to_csv(csv_path, index=False, header=False)
                row_count = len(value_series)
                print(f"Processed {filename}: Removed index column, kept Value column ({row_count} rows)")

                if (value_series == 0).all():
                    print(f"Note: {filename} contains all zero values.")

            except Exception as e:
                print(f"Error processing {filename}: {e}")

        print("All CSVs in observation/metrics processed successfully.")

    except Exception as e:
        print(f"Error accessing files: {e}")

# Example usage: strip the index column from the metrics CSVs in place.
if __name__ == "__main__":
    # Relative path, so observation/metrics is resolved against the current
    # working directory; the CSVs found there are overwritten.
    metrics_dir = "observation/metrics"
    remove_index_column(metrics_dir)

# Part 2: NLP

In [1]:
import pandas as pd
from urllib.parse import urlencode
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
import time
import os
from urllib.parse import urlparse
import shutil
from transformers import pipeline

def generate_google_news_link(query, start_date, end_date):
    """
    Build a Google News search URL restricted to a custom date range.

    Parameters:
        query (str): The search query.
        start_date (str): First day of the range, in MM/DD/YYYY format.
        end_date (str): Last day of the range, in MM/DD/YYYY format.

    Returns:
        str: Google News search URL with a URL-encoded query string.
    """
    # Pairs are encoded in this order: search terms, date-range filter, "nws" mode
    encoded_params = urlencode(
        [
            ("q", query),
            ("tbs", f"cdr:1,cd_min:{start_date},cd_max:{end_date}"),
            ("tbm", "nws"),
        ]
    )
    return "https://www.google.com/search" + "?" + encoded_params

# List of tickers and their homologous terms
search_queries = [
    ["SP 500", "S&P 500", "Standard and Poor's 500", "^GSPC", "S&P 500 Index", "SPX", "S&P 500 ETF", "S&P500", "S&P Index", "Standard & Poor's 500 Index", "S&P 500 Stock Market", "US Stock Market", "American Stocks", "USA Economy", "U.S. Markets", "U.S. Economy", "Wall Street Index", "US Equity Market", "U.S. Stock Exchange", "S&P 500 Companies"],
    ["NASDAQ", "NASDAQ Composite", "NASDAQ Index", "^IXIC", "Nasdaq Composite Index", "NASDAQ-100", "Nasdaq 100", "NASDAQ-100 Index", "NASDAQ stocks", "NASDAQ Index ETF", "American Technology Stocks", "U.S. Tech Stocks", "Tech-heavy Index", "USA Stock Market", "Nasdaq Tech Index", "NASDAQ Growth", "Silicon Valley Stocks", "NASDAQ 100 Tech", "USA Technology", "Tech Stocks Index"],
    ["Dow Jones", "DJIA", "Dow Jones Industrial Average", "^DJI", "Dow Jones Index", "Dow Jones Average", "DJIA Index", "Dow Jones Industrial", "Dow Jones Industrial Stocks", "DJIA ETF", "US Blue-Chip Stocks", "USA Industrial Stocks", "U.S. Market Leaders", "Wall Street Benchmark", "Dow Jones Companies", "US Industrials", "American Economy", "US Industrial Market", "US Stock Index", "Wall Street Giants"],
    ["CAC 40", "Paris Stock Exchange", "Euronext Paris", "^FCHI", "French Stock Market", "Paris Index", "French Economy", "France Stock Exchange", "CAC 40 Companies", "Paris Bourse", "French Markets", "Paris Exchange", "Euronext Index", "French Blue Chip Stocks", "French Equity Market", "France Stock Index", "Paris Market", "Eurozone Stocks", "France Economy", "Eurozone Market"],
    ["FTSE 100", "London Stock Exchange", "UK 100 Index", "^FTSE", "FTSE 100 Index", "London Index", "UK Stock Market", "British Stock Exchange", "UK Economy", "FTSE 100 Companies", "London Market", "UK Economy Stocks", "FTSE Index ETF", "British Blue Chips", "London Exchange", "UK Markets", "British Economy", "UK Financial Markets", "London Stock Index", "UK Stock Exchange", "FTSE 100 Stocks"],
    ["^STOXX50E", "EuroStoxx 50", "EuroStoxx 50 Index", "European Stock Market", "Eurozone Stocks", "European Economy", "Stoxx Europe 50", "Eurozone 50 Index", "EuroStoxx Index", "Europe Market Leaders", "Eurozone Leaders", "Top European Stocks", "Eurozone Top Companies", "European Blue Chips", "European Equity Market", "Eurozone Financials", "Eurozone Benchmark", "European Blue Chip Stocks", "Eurozone Economic Index", "European Market Index"],
    ["^N225", "Nikkei 225", "Nikkei Index", "Japanese Stock Market", "Japan Economy", "Nikkei Average", "Tokyo Stock Exchange", "Japan Top 225 Stocks", "Japan Stock Index", "Japanese Equity Market", "Nikkei 225 Companies", "Japan's Leading Stocks", "Japanese Financial Market", "Tokyo Exchange", "Nikkei Market", "Japan Economic Index", "Japanese Economy", "Japan's Stock Exchange", "Top Japanese Companies", "Nikkei 225 ETF"],
    ["^HSI", "Hang Seng", "Hang Seng Index", "Hong Kong Stock Market", "Hong Kong Economy", "Hong Kong Exchange", "HSI Index", "Hang Seng Index ETF", "Hong Kong Financial Market", "Chinese Stock Market", "HSI Stocks", "Asia-Pacific Stocks", "Hong Kong Blue Chips", "Hong Kong's Leading Stocks", "HSI Index Stocks", "Asian Financial Market", "Hong Kong Leading Companies", "Asian Market Leaders", "Hang Seng Companies", "Hong Kong Stock Index"],
    ["000001.SS", "Shanghai Composite", "Shanghai Stock Exchange", "Chinese Stock Market", "China Economy", "Shanghai Index", "China Financial Market", "Shanghai Composite Index", "Shanghai Exchange", "Chinese Stock Index", "China's Leading Stocks", "Shanghai Exchange Companies", "China Blue Chip Stocks", "Chinese Equity Market", "Shanghai Financial Index", "China Benchmark", "Shanghai Composite ETF", "China's Leading Companies", "Chinese Market Leaders", "China's Economic Stocks"],
    ["^BSESN", "Bombay Sensex", "S&P BSE Sensex", "Indian Stock Market", "India Economy", "BSE Sensex", "Mumbai Stock Exchange", "Sensex Companies", "Indian Financial Market", "Sensex 30", "BSE 30 Index", "India's Leading Stocks", "Indian Market Leaders", "Indian Equity Market", "Bombay Exchange", "Indian Stock Index", "Sensex Index ETF", "BSE India", "Indian Economy Stocks", "Bombay Financial Index"],
    ["^NSEI", "Nifty 50", "National Stock Exchange of India", "Indian Stock Index", "India Stock Market", "Nifty Index", "India Economy", "Indian Market Leaders", "Nifty 50 Stocks", "NSE India", "Indian Blue Chips", "Indian Financial Market", "Indian Stock Market", "India 50 Index", "Nifty Index ETF", "India's Top 50", "India's Leading Stocks", "Nifty 50 Companies", "India Economic Stocks", "Indian Market Index"],
    ["^KS11", "KOSPI", "Korea Composite Stock Price Index", "Korean Stock Market", "Korea Economy", "KOSPI Index", "South Korean Stock Market", "Korean Market Leaders", "KOSPI 200", "South Korea Financial Market", "Korean Exchange", "KOSPI ETF", "Korean Leading Stocks", "South Korean Economy", "Korean Market Index", "South Korea Stock Index", "Korean Economy Stocks", "South Korea Exchange", "Korean Equity Market", "Korea Stock Index"],
    ["Gold", "XAU", "Gold Price", "Gold Market", "Precious Metals", "Gold Spot", "Gold Bullion", "Gold ETF", "Gold Investment", "Gold Mining", "Gold Futures", "Gold Stocks", "Gold Index", "Gold Commodity", "Gold Trading", "Gold Bullion ETF", "Gold Commodity Index", "Gold Market Trends", "Gold Investment Funds", "Gold Prices Today"],
    ["Silver", "XAG", "Silver Price", "Silver Market", "Precious Metals", "Silver Spot", "Silver Bullion", "Silver ETF", "Silver Investment", "Silver Mining", "Silver Futures", "Silver Stocks", "Silver Index", "Silver Commodity", "Silver Trading", "Silver Bullion ETF", "Silver Commodity Index", "Silver Market Trends", "Silver Investment Funds", "Silver Prices Today"],
    ["Oil", "Crude Oil", "WTI", "Brent Crude", "Oil Price", "Crude Oil Price", "OPEC", "Oil Futures", "Oil Market", "Oil Stocks", "Oil ETF", "Global Oil Supply", "Oil Trading", "Oil Production", "Brent Oil Futures", "Oil Price Index", "Oil Investment", "Oil Exploration", "Oil Trading Market", "Oil Price Trends"]
]


# Define the year range (both ends inclusive)
start_year = 2003
end_year = 2024

# Several groups share a term (e.g. "Eurozone Stocks", "Precious Metals").
# Keep every distinct term once, in first-seen order, so the same search is
# not generated - and later requested - more than once.
unique_queries = list(dict.fromkeys(
    query for query_group in search_queries for query in query_group
))

# Store all generated links as [query, start date, end date, URL] rows
links = []

for query in unique_queries:  # Loop through each distinct search term
    for year in range(start_year, end_year + 1):  # Loop through years
        for month in range(1, 13):  # Loop through months
            first_day = datetime(year, month, 1)
            # Last day of the month = first day of the following month minus one day
            next_month = month % 12 + 1
            next_year = year if month < 12 else year + 1
            last_day = datetime(next_year, next_month, 1) - timedelta(days=1)

            # Both bounds use the same zero-padded MM/DD/YYYY form
            start_date = first_day.strftime("%m/%d/%Y")
            end_date = last_day.strftime("%m/%d/%Y")

            # Generate the search link
            link = generate_google_news_link(query, start_date, end_date)
            links.append([query, start_date, end_date, link])

# Convert to DataFrame
df = pd.DataFrame(links, columns=["Query", "Start Date", "End Date", "Google News Link"])

# Create the NLP_data folder if it doesn't exist and write links.csv straight
# into it, so no stray copy is left in the working directory
os.makedirs("NLP_data", exist_ok=True)
csv_filename = "links.csv"
df.to_csv(os.path.join("NLP_data", csv_filename), index=False)


NameError: name 'datetime' is not defined

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
from urllib.parse import urlencode
from datetime import datetime, timedelta

def generate_google_news_link(query, start_date, end_date):
    """
    Return the Google News search URL for `query` between two dates.

    The three query-string parameters (q, tbs, tbm) are URL-encoded and
    appended to the search endpoint.
    """
    date_filter = f"cdr:1,cd_min:{start_date},cd_max:{end_date}"
    query_string = urlencode({"q": query, "tbs": date_filter, "tbm": "nws"})
    return "?".join(["https://www.google.com/search", query_string])

def get_news_links(search_url):
    """
    Fetch news article links from a Google News search results page.

    Parameters:
        search_url (str): URL of the search results page to download.

    Returns:
        set: Distinct absolute https:// article URLs found on the page.
        An empty set is returned when the request fails.
    """
    # Local import: the surrounding cell only imports urlencode from urllib.parse
    from urllib.parse import parse_qs

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # A timeout keeps one stalled connection from blocking the whole crawl
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {search_url}: {e}")
        return set()  # same type as the success path

    soup = BeautifulSoup(response.text, 'html.parser')

    links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]

        # Results can be wrapped as "/url?q=<target>&..."; keep the target,
        # because the wrapper itself is a relative path that cannot be fetched
        if href.startswith("/url?"):
            targets = parse_qs(href.split("?", 1)[1]).get("q")
            if not targets:
                continue
            href = targets[0]

        # Only absolute https links that do not point back at Google
        if href.startswith("https://") and "google.com" not in href:
            links.add(href)
    return links

# Read the previously generated search links from NLP_data/links.csv
csv_file_path = os.path.join("NLP_data", "links.csv")
df = pd.read_csv(csv_file_path)

# Root folder of the results; one sub-folder per query is created below
output_base_folder = "google_news_results"
os.makedirs(output_base_folder, exist_ok=True)

# One search request per CSV row
for index, row in df.iterrows():
    search_url = row["Google News Link"]
    # Spaces become underscores so the query can serve as a folder name
    query = row["Query"].replace(" ", "_")

    query_folder = os.path.join(output_base_folder, query)
    os.makedirs(query_folder, exist_ok=True)

    # Download the result page and collect its article links
    news_links = get_news_links(search_url)

    # The file is named after the date range, with "/" swapped for "_"
    start_date_sanitized = row['Start Date'].replace("/", "_")
    end_date_sanitized = row['End Date'].replace("/", "_")
    file_path = os.path.join(query_folder, f"{start_date_sanitized}_to_{end_date_sanitized}.txt")

    # One link per line
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in news_links)

    print(f"Saved {len(news_links)} links for {query} ({row['Start Date']} to {row['End Date']})")

    # Pause between requests to avoid hammering the server
    time.sleep(3)


In [None]:
import os
from datetime import datetime
import shutil

# Root folder holding the raw per-query results
output_base_folder = "google_news_results"

# Pass 1: drop empty result files, file the others under year/MonthName/query
for query in os.listdir(output_base_folder):
    query_path = os.path.join(output_base_folder, query)
    if not os.path.isdir(query_path):
        continue

    for file_name in os.listdir(query_path):
        file_path = os.path.join(query_path, file_name)
        if not os.path.isfile(file_path):
            continue

        # A zero-byte file means the search returned no links
        if os.path.getsize(file_path) == 0:
            os.remove(file_path)
            print(f"Deleted empty file: {file_path}")
            continue

        try:
            # File names look like MM_DD_YYYY_to_...; the start date decides the target folder
            date_part = file_name.split("_to_")[0]
            date_obj = datetime.strptime(date_part, "%m_%d_%Y")
            year = date_obj.year
            month = date_obj.month
            month_name = date_obj.strftime("%B")  # Full month name

            # Target layout: output_base_folder/year/MonthName/query
            new_location = os.path.join(output_base_folder, str(year), month_name, query)
            os.makedirs(new_location, exist_ok=True)

            os.rename(file_path, os.path.join(new_location, file_name))
            print(f"Moved file to {new_location}")
        except ValueError:
            print(f"Skipping file due to incorrect date format: {file_name}")

# Pass 2: remove top-level folders that ended up empty
for query in os.listdir(output_base_folder):
    query_path = os.path.join(output_base_folder, query)
    if not os.path.isdir(query_path):
        continue
    if len(os.listdir(query_path)) == 0:
        os.rmdir(query_path)
        print(f"Deleted empty group folder: {query_path}")

print("Cleanup and organization complete!")


In [None]:
import os

# Dictionary mapping standardized query groups to their variants
query_groups = {
    "SP_500": ["SP_500", "S&P_500", "Standard_and_Poor's_500", "^GSPC", "S&P_500_Index", "SPX", "S&P_500_ETF", "S&P500", "S&P_Index", "Standard_&_Poor's_500_Index", "S&P_500_Stock_Market", "US_Stock_Market", "American_Stocks", "USA_Economy", "U.S._Markets", "U.S._Economy", "Wall_Street_Index", "US_Equity_Market", "U.S._Stock_Exchange", "S&P_500_Companies"],
    "NASDAQ": ["NASDAQ", "NASDAQ_Composite", "NASDAQ_Index", "^IXIC", "Nasdaq_Composite_Index", "NASDAQ-100", "Nasdaq_100", "NASDAQ-100_Index", "NASDAQ_stocks", "NASDAQ_Index_ETF", "American_Technology_Stocks", "U.S._Tech_Stocks", "Tech-heavy_Index", "USA_Stock_Market", "Nasdaq_Tech_Index", "NASDAQ_Growth", "Silicon_Valley_Stocks", "NASDAQ_100_Tech", "USA_Technology", "Tech_Stocks_Index"],
    "Dow_Jones": ["Dow_Jones", "DJIA", "Dow_Jones_Industrial_Average", "^DJI", "Dow_Jones_Index", "Dow_Jones_Average", "DJIA_Index", "Dow_Jones_Industrial", "Dow_Jones_Industrial_Stocks", "DJIA_ETF", "US_Blue-Chip_Stocks", "USA_Industrial_Stocks", "U.S._Market_Leaders", "Wall_Street_Benchmark", "Dow_Jones_Companies", "US_Industrials", "American_Economy", "US_Industrial_Market", "US_Stock_Index", "Wall_Street_Giants"],
    "CAC_40": ["CAC_40", "Paris_Stock_Exchange", "Euronext_Paris", "^FCHI", "French_Stock_Market", "Paris_Index", "French_Economy", "France_Stock_Exchange", "CAC_40_Companies", "Paris_Bourse", "French_Markets", "Paris_Exchange", "Euronext_Index", "French_Blue_Chip_Stocks", "French_Equity_Market", "France_Stock_Index", "Paris_Market", "Eurozone_Stocks", "France_Economy", "Eurozone_Market"],
    "FTSE_100": ["FTSE_100", "London_Stock_Exchange", "UK_100_Index", "^FTSE", "FTSE_100_Index", "London_Index", "UK_Stock_Market", "British_Stock_Exchange", "UK_Economy", "FTSE_100_Companies", "London_Market", "UK_Economy_Stocks", "FTSE_Index_ETF", "British_Blue_Chips", "London_Exchange", "UK_Markets", "British_Economy", "UK_Financial_Markets", "London_Stock_Index", "UK_Stock_Exchange", "FTSE_100_Stocks"],
    "EuroStoxx_50": ["^STOXX50E", "EuroStoxx_50", "EuroStoxx_50_Index", "European_Stock_Market", "Eurozone_Stocks", "European_Economy", "Stoxx_Europe_50", "Eurozone_50_Index", "EuroStoxx_Index", "Europe_Market_Leaders", "Eurozone_Leaders", "Top_European_Stocks", "Eurozone_Top_Companies", "European_Blue_Chips", "European_Equity_Market", "Eurozone_Financials", "Eurozone_Benchmark", "European_Blue_Chip_Stocks", "Eurozone_Economic_Index", "European_Market_Index"],
    "Nikkei_225": ["^N225", "Nikkei_225", "Nikkei_Index", "Japanese_Stock_Market", "Japan_Economy", "Nikkei_Average", "Tokyo_Stock_Exchange", "Japan_Top_225_Stocks", "Japan_Stock_Index", "Japanese_Equity_Market", "Nikkei_225_Companies", "Japan's_Leading_Stocks", "Japanese_Financial_Market", "Tokyo_Exchange", "Nikkei_Market", "Japan_Economic_Index", "Japanese_Economy", "Japan's_Stock_Exchange", "Top_Japanese_Companies", "Nikkei_225_ETF"],
    "Hang_Seng": ["^HSI", "Hang_Seng", "Hang_Seng_Index", "Hong_Kong_Stock_Market", "Hong_Kong_Economy", "Hong_Kong_Exchange", "HSI_Index", "Hang_Seng_Index_ETF", "Hong_Kong_Financial_Market", "Chinese_Stock_Market", "HSI_Stocks", "Asia-Pacific_Stocks", "Hong_Kong_Blue_Chips", "Hong_Kong's_Leading_Stocks", "HSI_Index_Stocks", "Asian_Financial_Market", "Hong_Kong_Leading_Companies", "Asian_Market_Leaders", "Hang_Seng_Companies", "Hong_Kong_Stock_Index"],
    "Shanghai_Composite": ["000001.SS", "Shanghai_Composite", "Shanghai_Stock_Exchange", "Chinese_Stock_Market", "China_Economy", "Shanghai_Index", "China_Financial_Market", "Shanghai_Composite_Index", "Shanghai_Exchange", "Chinese_Stock_Index", "China's_Leading_Stocks", "Shanghai_Exchange_Companies", "China_Blue_Chip_Stocks", "Chinese_Equity_Market", "Shanghai_Financial_Index", "China_Benchmark", "Shanghai_Composite_ETF", "China's_Leading_Companies", "Chinese_Market_Leaders", "China's_Economic_Stocks"],
    "Bombay_Sensex": ["^BSESN", "Bombay_Sensex", "S&P_BSE_Sensex", "Indian_Stock_Market", "India_Economy", "BSE_Sensex", "Mumbai_Stock_Exchange", "Sensex_Companies", "Indian_Financial_Market", "Sensex_30", "BSE_30_Index", "India's_Leading_Stocks", "Indian_Market_Leaders", "Indian_Equity_Market", "Bombay_Exchange", "Indian_Stock_Index", "Sensex_Index_ETF", "BSE_India", "Indian_Economy_Stocks", "Bombay_Financial_Index"],
    "Nifty_50": ["^NSEI", "Nifty_50", "National_Stock_Exchange_of_India", "Indian_Stock_Index", "India_Stock_Market", "Nifty_Index", "India_Economy", "Indian_Market_Leaders", "Nifty_50_Stocks", "NSE_India", "Indian_Blue_Chips", "Indian_Financial_Market", "Indian_Stock_Market", "India_50_Index", "Nifty_Index_ETF", "India's_Top_50", "India's_Leading_Stocks", "Nifty_50_Companies", "India_Economic_Stocks", "Indian_Market_Index"],
    "KOSPI": ["^KS11", "KOSPI", "Korea_Composite_Stock_Price_Index", "Korean_Stock_Market", "Korea_Economy", "KOSPI_Index", "South_Korean_Stock_Market", "Korean_Market_Leaders", "KOSPI_200", "South_Korea_Financial_Market", "Korean_Exchange", "KOSPI_ETF", "Korean_Leading_Stocks", "South_Korean_Economy", "Korean_Market_Index", "South_Korea_Stock_Index", "Korean_Economy_Stocks", "South_Korea_Exchange", "Korean_Equity_Market", "Korea_Stock_Index"],
    "Gold": ["Gold", "XAU", "Gold_Price", "Gold_Market", "Precious_Metals", "Gold_Spot", "Gold_Bullion", "Gold_ETF", "Gold_Investment", "Gold_Mining", "Gold_Futures", "Gold_Stocks", "Gold_Index", "Gold_Commodity", "Gold_Trading", "Gold_Bullion_ETF", "Gold_Commodity_Index", "Gold_Market_Trends", "Gold_Investment_Funds", "Gold_Prices_Today"],
    "Silver": ["Silver", "XAG", "Silver_Price", "Silver_Market", "Precious_Metals", "Silver_Spot", "Silver_Bullion", "Silver_ETF", "Silver_Investment", "Silver_Mining", "Silver_Futures", "Silver_Stocks", "Silver_Index", "Silver_Commodity", "Silver_Trading", "Silver_Bullion_ETF", "Silver_Commodity_Index", "Silver_Market_Trends", "Silver_Investment_Funds", "Silver_Prices_Today"],
    "Oil": ["Oil", "Crude_Oil", "WTI", "Brent_Crude", "Oil_Price", "Crude_Oil_Price", "OPEC", "Oil_Futures", "Oil_Market", "Oil_Stocks", "Oil_ETF", "Global_Oil_Supply", "Oil_Trading", "Oil_Production", "Brent_Oil_Futures", "Oil_Price_Index", "Oil_Investment", "Oil_Exploration", "Oil_Trading_Market", "Oil_Price_Trends"]
}

# Loop through each year/month folder and unify text files for each query group.
# Works in two phases per month: first every group's unified file is written,
# only afterwards are the source folders deleted. Some search terms belong to
# more than one group (e.g. "Eurozone_Stocks", "Precious_Metals"); deleting a
# folder as soon as the first group has read it would leave the later groups
# without those links.
for year_folder in os.listdir(output_base_folder):
    year_path = os.path.join(output_base_folder, year_folder)
    if not os.path.isdir(year_path):
        continue
    for month_folder in os.listdir(year_path):
        month_path = os.path.join(year_path, month_folder)
        if not os.path.isdir(month_path):
            continue

        consumed_folders = set()

        # Phase 1: build one unified file per query group
        for query_group, query_list in query_groups.items():
            unified_file_path = os.path.join(month_path, f"{query_group}_united.txt")
            source_folders = [
                folder
                for folder in (os.path.join(month_path, query) for query in dict.fromkeys(query_list))
                if os.path.isdir(folder)
            ]

            # Nothing left to merge (e.g. the cell is run a second time):
            # keep the existing unified file instead of truncating it
            if not source_folders and os.path.exists(unified_file_path):
                print(f"Kept existing unified file: {unified_file_path}")
                continue

            with open(unified_file_path, "w", encoding="utf-8") as unified_file:
                for query_folder_path in source_folders:
                    for file_name in sorted(os.listdir(query_folder_path)):
                        file_path = os.path.join(query_folder_path, file_name)
                        if file_path.endswith(".txt") and os.path.isfile(file_path):
                            with open(file_path, "r", encoding="utf-8") as f:
                                content = f.read()
                            unified_file.write(content + "\n")
            consumed_folders.update(source_folders)
            print(f"Created unified file: {unified_file_path}")

        # Phase 2: remove the merged source files and their folders
        for query_folder_path in sorted(consumed_folders):
            for file_name in os.listdir(query_folder_path):
                file_path = os.path.join(query_folder_path, file_name)
                if file_path.endswith(".txt") and os.path.isfile(file_path):
                    os.remove(file_path)
                    print(f"Deleted {file_path}")
            # os.rmdir only works on empty folders; leave anything else in place
            if not os.listdir(query_folder_path):
                os.rmdir(query_folder_path)
                print(f"Deleted empty folder: {query_folder_path}")
            else:
                print(f"Kept non-empty folder: {query_folder_path}")


In [None]:
import os

def remove_empty_lines(file_path):
    """Rewrite the UTF-8 text file at file_path without its blank (whitespace-only) lines."""
    with open(file_path, "r", encoding="utf-8") as f:
        kept_lines = [line for line in f if line.strip()]
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("".join(kept_lines))
    print(f"Removed empty lines from: {file_path}")

# Loop through each year/month folder and clean text files.
# Both layouts are handled: .txt files still sitting in a query sub-folder,
# and the {group}_united.txt files stored directly in the month folder (the
# unification step writes them there and removes the query sub-folders).
for year_folder in os.listdir(output_base_folder):
    year_path = os.path.join(output_base_folder, year_folder)
    if not os.path.isdir(year_path):
        continue
    for month_folder in os.listdir(year_path):
        month_path = os.path.join(year_path, month_folder)
        if not os.path.isdir(month_path):
            continue
        for entry in os.listdir(month_path):
            entry_path = os.path.join(month_path, entry)
            if os.path.isdir(entry_path):
                # Query sub-folder that has not been unified yet
                for file_name in os.listdir(entry_path):
                    file_path = os.path.join(entry_path, file_name)
                    if file_path.endswith(".txt") and os.path.isfile(file_path):
                        remove_empty_lines(file_path)
            elif entry_path.endswith(".txt") and os.path.isfile(entry_path):
                # Text file directly inside the month folder (e.g. a unified file)
                remove_empty_lines(entry_path)
print("Empty lines removal completed.")


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def extract_article_details(url):
    """Fetch ``url`` and return its page title and the first 500 words of its paragraphs.

    Parameters:
    url (str): Address of the article to download (10 second timeout)

    Returns:
    tuple: ``(title, first_500_words)``. The title is the string of the page's
    ``<title>`` tag, or "No title found" when the page has none. The text is the
    whitespace-split content of all ``<p>`` tags, cut to 500 words and re-joined
    with single spaces. On any ``requests`` failure (including a non-2xx status)
    the error is printed and ``("Error", "Error")`` is returned.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        if soup.title:
            title = soup.title.string
        else:
            title = "No title found"

        # Concatenate every paragraph, then keep only the leading 500 words
        words = " ".join(p.get_text() for p in soup.find_all('p')).split()
        return title, " ".join(words[:500])
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return "Error", "Error"

# Process unified text files to extract article details and save as CSV.
# Each <base>/<year>/<month>/*_united.txt holds one article URL per line; a CSV with the
# same base name (columns URL, Title, First_500_Words) is written next to it.
for year_folder in os.listdir(output_base_folder):
    year_path = os.path.join(output_base_folder, year_folder)
    if os.path.isdir(year_path):
        for month_folder in os.listdir(year_path):
            month_path = os.path.join(year_path, month_folder)
            if os.path.isdir(month_path):
                for file_name in os.listdir(month_path):
                    if file_name.endswith("_united.txt"):
                        file_path = os.path.join(month_path, file_name)
                        with open(file_path, "r", encoding="utf-8") as f:
                            # Skip blank lines: the merge step appends a newline after each
                            # source file's content, so the unified file can contain empty
                            # lines. An empty URL would only trigger a failing request and
                            # add a meaningless "Error" row to the CSV.
                            urls = [line.strip() for line in f if line.strip()]
                        article_data = []
                        for url in urls:
                            title, first_500_words = extract_article_details(url)
                            article_data.append([url, title, first_500_words])
                        csv_file_path = os.path.splitext(file_path)[0] + ".csv"
                        df = pd.DataFrame(article_data, columns=["URL", "Title", "First_500_Words"])
                        df.to_csv(csv_file_path, index=False, encoding="utf-8")
                        print(f"Created CSV for {file_name}: {csv_file_path}")
print("Data extraction and CSV creation completed.")


In [None]:
import shutil
import os

# Source tree holding the scraped CSVs and the target tree used for sentiment analysis
source_base_folder = "google_news_results"
target_base_folder = "sentiment_analysis"
os.makedirs(target_base_folder, exist_ok=True)

# Mirror the <year>/<month> hierarchy under the target folder and move every CSV across.
# Target year/month folders are created even when a month contains no CSV.
for year_folder in os.listdir(source_base_folder):
    year_path = os.path.join(source_base_folder, year_folder)
    if not os.path.isdir(year_path):
        continue
    target_year_path = os.path.join(target_base_folder, year_folder)
    os.makedirs(target_year_path, exist_ok=True)
    for month_folder in os.listdir(year_path):
        month_path = os.path.join(year_path, month_folder)
        if not os.path.isdir(month_path):
            continue
        target_month_path = os.path.join(target_year_path, month_folder)
        os.makedirs(target_month_path, exist_ok=True)
        for file_name in os.listdir(month_path):
            if not file_name.endswith(".csv"):
                continue
            file_path = os.path.join(month_path, file_name)
            target_file_path = os.path.join(target_month_path, file_name)
            shutil.move(file_path, target_file_path)
            print(f"Moved {file_name} to {target_file_path}")
print("CSV files have been moved to the sentiment_analysis folder.")


In [None]:
import pandas as pd
from transformers import pipeline
import os

# FinBERT text-classification pipeline shared by every call to add_sentiment_labels
sentiment_pipeline = pipeline("text-classification", model="ProsusAI/finbert")


def add_sentiment_labels(csv_path):
    """Add FinBERT sentiment columns to the CSV at ``csv_path`` and overwrite it.

    Parameters:
    csv_path (str): CSV file expected to contain a 'First_500_Words' column

    Returns:
    None: Writes 'Sentiment_Label' and 'Sentiment_Score' columns back to the same
    file. Files without a 'First_500_Words' column are reported and left untouched.
    """
    df = pd.read_csv(csv_path)
    if 'First_500_Words' not in df.columns:
        print(f"Skipping {csv_path} (no 'First_500_Words' column found)")
        return

    def classify(text):
        # Non-string cells (e.g. missing values) cannot be scored
        if not isinstance(text, str):
            return "Error", 0.0
        # Only the first 500 characters (not words) are sent to the model
        result = sentiment_pipeline(text[:500])[0]
        return result['label'], result['score']

    classified = [classify(text) for text in df['First_500_Words']]
    df['Sentiment_Label'] = [label for label, _ in classified]
    df['Sentiment_Score'] = [score for _, score in classified]
    df.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"Updated sentiment for {csv_path}")

# Run add_sentiment_labels on every CSV under sentiment_analysis/<year>/<month>/
sentiment_base_folder = "sentiment_analysis"
for year_folder in os.listdir(sentiment_base_folder):
    year_path = os.path.join(sentiment_base_folder, year_folder)
    if not os.path.isdir(year_path):
        continue
    for month_folder in os.listdir(year_path):
        month_path = os.path.join(year_path, month_folder)
        if not os.path.isdir(month_path):
            continue
        for file_name in os.listdir(month_path):
            if not file_name.endswith(".csv"):
                continue
            csv_path = os.path.join(month_path, file_name)
            add_sentiment_labels(csv_path)
print("Sentiment analysis labels and scores have been added.")


In [None]:
import pandas as pd
import os
def calculate_average_score(csv_path):
    """Return the net sentiment score of the articles in ``csv_path``.

    The score is (sum of positive scores - sum of negative scores) divided by the
    number of positive plus negative rows. Rows with any other label are ignored.

    Parameters:
    csv_path (str): CSV with 'Sentiment_Label' and 'Sentiment_Score' columns

    Returns:
    float or None: The net score, or None when a required column is missing or
    the file has no positive or negative rows
    """
    frame = pd.read_csv(csv_path)
    required_columns = ('Sentiment_Label', 'Sentiment_Score')
    if any(column not in frame.columns for column in required_columns):
        print(f"Skipping {csv_path} (required columns missing)")
        return None

    labels = frame['Sentiment_Label']
    scores = frame['Sentiment_Score']
    positive = scores[labels == 'positive']
    negative = scores[labels == 'negative']

    counted = len(positive) + len(negative)
    if counted == 0:
        return None
    return (positive.sum() - negative.sum()) / counted
# Build one monthly_summary.csv per month folder from the per-query sentiment CSVs
sentiment_base_folder = "sentiment_analysis"
for year_folder in os.listdir(sentiment_base_folder):
    year_path = os.path.join(sentiment_base_folder, year_folder)
    if not os.path.isdir(year_path):
        continue
    for month_folder in os.listdir(year_path):
        month_path = os.path.join(year_path, month_folder)
        if not os.path.isdir(month_path):
            continue
        print(f"Processing {month_folder}...")
        summary_data = []
        for file_name in os.listdir(month_path):
            if not file_name.endswith(".csv"):
                continue
            csv_path = os.path.join(month_path, file_name)
            # The query group name is the file name without its "_united.csv" suffix
            query_group_name = file_name.replace("_united.csv", "")
            avg_score = calculate_average_score(csv_path)
            if avg_score is None:
                # Files without usable sentiment rows contribute nothing to the summary
                continue
            print(f"Adding {query_group_name} with score: {avg_score}")
            summary_data.append([query_group_name, avg_score])
        if not summary_data:
            print(f"No data for {month_folder}, skipping.")
            continue
        summary_df = pd.DataFrame(summary_data, columns=["Query Group", "Average Score"])
        summary_csv_path = os.path.join(month_path, "monthly_summary.csv")
        summary_df.to_csv(summary_csv_path, index=False, encoding="utf-8")
        print(f"Created monthly summary for {month_folder}: {summary_csv_path}")
print("Monthly summaries have been created.")


In [None]:
import pandas as pd
import os
import shutil

# Define base folders for final results
sentiment_base_folder = "sentiment_analysis"
final_results_folder = "final_results"
os.makedirs(final_results_folder, exist_ok=True)

# Copy every monthly_summary.csv into final_results/<year>/<month>/ and collect the
# same rows (tagged with their year and month folder names) for the unified matrix
all_months_data = []

for year_folder in os.listdir(sentiment_base_folder):
    year_path = os.path.join(sentiment_base_folder, year_folder)
    if os.path.isdir(year_path):
        target_year_path = os.path.join(final_results_folder, year_folder)
        os.makedirs(target_year_path, exist_ok=True)
        for month_folder in os.listdir(year_path):
            month_path = os.path.join(year_path, month_folder)
            if os.path.isdir(month_path):
                summary_csv_path = os.path.join(month_path, "monthly_summary.csv")
                if os.path.exists(summary_csv_path):
                    target_month_path = os.path.join(target_year_path, month_folder)
                    os.makedirs(target_month_path, exist_ok=True)
                    shutil.copy(summary_csv_path, target_month_path)
                    month_data = pd.read_csv(summary_csv_path)
                    month_data['Month'] = month_folder
                    month_data['Year'] = year_folder
                    all_months_data.append(month_data)

# Create unified matrix if data is available: one row per (Year, Month) and one
# column per query group, holding that group's average score
if all_months_data:
    unified_df = pd.concat(all_months_data)
    unified_matrix = unified_df.pivot_table(index=['Year', 'Month'], columns='Query Group', values='Average Score')
    unified_matrix_path = os.path.join(final_results_folder, "unified_matrix.csv")
    unified_matrix.to_csv(unified_matrix_path, encoding="utf-8")
    print(f"Unified matrix has been created at: {unified_matrix_path}")

    # (Optional) Save a simulated unified matrix as a separate file.
    # This stays inside the branch because unified_matrix only exists when monthly
    # data was found; saving it unconditionally raised NameError on an empty run.
    unified_matrix.to_csv("simulated_unified_matrix.csv", encoding="utf-8")
    print("Simulated unified matrix saved to simulated_unified_matrix.csv")
else:
    print("No data available to create the unified matrix.")


In [None]:
import os
import glob
import pandas as pd
import numpy as np

def _lookup_sentiment(sentiment_df, year, month, query_group):
    """Return the sentiment of ``query_group`` for ``year``/``month``, or 0 when absent.

    Two layouts of ``sentiment_df`` are supported:
    - long: columns Year, Month, Query_Group, Average_Sentiment (one row per group)
    - wide: columns Year, Month plus one column per query group (a pivoted matrix)
    """
    period = sentiment_df[(sentiment_df['Year'] == year) & (sentiment_df['Month'] == month)]

    if 'Query_Group' in sentiment_df.columns:
        rows = period[period['Query_Group'] == query_group]
        if rows.empty:
            return 0  # Default to 0 if no data
        return rows['Average_Sentiment'].values[0]

    # Wide layout: the group is a column; a missing column, row or value means no data
    if period.empty or query_group not in period.columns:
        return 0
    value = period[query_group].values[0]
    return 0 if pd.isna(value) else value


def generate_sim_nlp_vectors(input_dir="organized", output_dir="organized", sentiment_data_path=None):
    """
    Create monthly CSVs with 28-dimensional vectors: 14 volatilities + 14 sentiments (from data if provided, else random).

    Each {yyyy}_{mm}_combined.csv found in <input_dir>/observation/metrics becomes a
    single-row file <output_dir>/sim_nlp/{yyyy}{mm}.csv with columns Vol_<ticker> and
    Sent_<ticker>. Files that are empty or do not have the expected 189 rows are skipped.

    Parameters:
    input_dir (str): Directory containing observation/metrics (default: 'organized')
    output_dir (str): Directory to create sim_nlp folder (default: 'organized')
    sentiment_data_path (str): Path to a sentiment CSV (default: None, uses random values
        drawn uniformly from [-1, 1]). Either the long layout (Year, Month, Query_Group,
        Average_Sentiment) or a pivoted matrix (Year, Month, one column per query group)
        is accepted; missing entries default to 0.

    Returns:
    None: Saves CSVs to sim_nlp. Errors are printed rather than raised.
    """
    # Imported under a local name on purpose: this cell runs `import glob`, which binds
    # the global name `glob` to the module, and calling the module raises TypeError.
    from glob import glob as find_files

    try:
        # Define tickers (in order)
        tickers = [
            'GC=F', 'SI=F', '^DJI', '^IXIC', 'CL=F', '^GSPC', '^STOXX50E',
            '^FCHI', '^FTSE', '^HSI', '000001.SS', '^KS11', '^BSESN', '^NSEI'
        ]
        ticker_count = len(tickers)
        metrics_per_ticker = 7

        # Define paths
        metrics_dir = os.path.join(input_dir, "observation", "metrics")
        sim_nlp_dir = os.path.join(output_dir, "sim_nlp")

        # Create sim_nlp folder
        os.makedirs(sim_nlp_dir, exist_ok=True)
        print(f"Created/Verified folder: {sim_nlp_dir}")

        # Find all combined CSV files; sorted so runs process months in a stable order
        combined_files = sorted(find_files(os.path.join(metrics_dir, "*_combined.csv")))
        if not combined_files:
            print(f"Error: No *_combined.csv files found in {metrics_dir}.")
            return

        # Load sentiment data if provided
        if sentiment_data_path:
            sentiment_df = pd.read_csv(sentiment_data_path)
            # Define mapping from tickers to query group names
            ticker_to_query = {
                'GC=F': 'Gold',
                'SI=F': 'Silver',
                '^DJI': 'Dow Jones',
                '^IXIC': 'NASDAQ',
                'CL=F': 'Crude Oil',
                '^GSPC': 'S&P 500',
                '^STOXX50E': 'EURO STOXX 50',
                '^FCHI': 'CAC 40',
                '^FTSE': 'FTSE 100',
                '^HSI': 'Hang Seng',
                '000001.SS': 'Shanghai Composite',
                '^KS11': 'KOSPI',
                '^BSESN': 'BSE Sensex',
                '^NSEI': 'Nifty 50'
            }
        else:
            print("No sentiment data provided, using random sentiments.")

        # Process each combined CSV
        for file in combined_files:
            # Bound before the try so the error message below can always name the file
            filename = os.path.basename(file)
            try:
                # Extract year and month from filename (e.g., 2003_01_combined.csv)
                year_month = filename.replace("_combined.csv", "")
                year, month = year_month.split("_")
                output_filename = f"{year}{month}.csv"

                # Read CSV (single-column, no header)
                df = pd.read_csv(file, header=None)

                if df.empty:
                    print(f"Warning: {filename} is empty. Skipping.")
                    continue

                # Ensure correct number of rows: 98 metrics + 91 pairwise correlations
                expected_rows = ticker_count * metrics_per_ticker + ticker_count * (ticker_count - 1) // 2
                if len(df) != expected_rows:
                    print(f"Warning: {filename} has {len(df)} rows, expected {expected_rows}. Skipping.")
                    continue

                # One volatility per ticker: rows 2, 9, 16, ... of the metrics section.
                # Assumes volatility is the third of the seven per-ticker metrics.
                volatility_indices = [2 + i * metrics_per_ticker for i in range(ticker_count)]
                volatilities = df.iloc[volatility_indices, 0].values

                # Get sentiments
                if sentiment_data_path:
                    y = int(year)
                    m = int(month)
                    sentiments = []
                    for ticker in tickers:
                        query_group = ticker_to_query.get(ticker, None)
                        if query_group:
                            sentiments.append(_lookup_sentiment(sentiment_df, y, m, query_group))
                        else:
                            sentiments.append(0)  # Default to 0 if no mapping
                    sentiments = np.array(sentiments)
                else:
                    sentiments = np.random.uniform(-1, 1, ticker_count)

                # Combine into 28-dimensional vector
                vector = np.concatenate([volatilities, sentiments])

                # Create column names
                columns = [f"Vol_{ticker}" for ticker in tickers] + [f"Sent_{ticker}" for ticker in tickers]

                # Create single-row DataFrame
                output_df = pd.DataFrame([vector], columns=columns)

                # Save to sim_nlp
                output_file = os.path.join(sim_nlp_dir, output_filename)
                output_df.to_csv(output_file, index=False)
                print(f"Saved {output_file} with 28-dimensional vector")

                # Note if volatilities are all zero
                if np.all(volatilities == 0):
                    print(f"Note: {filename} has all zero volatilities.")

            except Exception as e:
                print(f"Error processing {filename}: {e}")

        print("All sim_nlp CSVs generated successfully.")

    except Exception as e:
        print(f"Error accessing files: {e}")

# Example usage (the sentiment path is a placeholder to be replaced with a real file)
if __name__ == "__main__":
    generate_sim_nlp_vectors(".", ".", sentiment_data_path="path/to/unified_matrix.csv")

# 3. The rest

In [None]:
import os
import shutil

def move_folders_to_metrics_used(input_dir="organized", output_dir="."):
    """
    Create a metrics_used folder and move all folders from input_dir except observation.

    Moves:
    - every sub-folder of input_dir (e.g. combined, price, usage, plots) to metrics_used
    - Leaves observation in input_dir
    - Leaves the metrics_used folder itself alone when it lives inside input_dir
    - Skips folders that already exist in metrics_used

    Parameters:
    input_dir (str): Directory containing organized folders (default: 'organized')
    output_dir (str): Directory to create metrics_used folder (default: current)

    Returns:
    None: Moves folders to metrics_used. Problems are printed rather than raised.
    """
    try:
        # Define paths
        metrics_used_dir = os.path.join(output_dir, "metrics_used")

        # Create metrics_used folder
        os.makedirs(metrics_used_dir, exist_ok=True)
        print(f"Created/Verified folder: {metrics_used_dir}")

        # Check if input_dir exists
        if not os.path.exists(input_dir):
            print(f"Error: Input directory {input_dir} does not exist.")
            return

        # Get list of folders in input_dir
        folders = [f for f in os.listdir(input_dir)
                  if os.path.isdir(os.path.join(input_dir, f))]

        if not folders:
            print(f"Error: No folders found in {input_dir}.")
            return

        # Exclude the observation folder, and the destination itself: when output_dir is
        # (or is inside) input_dir, metrics_used shows up in the listing and moving it
        # into itself can only fail.
        destination = os.path.realpath(metrics_used_dir)
        folders_to_move = [
            f for f in folders
            if f != "observation"
            and os.path.realpath(os.path.join(input_dir, f)) != destination
        ]

        if not folders_to_move:
            print(f"Warning: No folders to move (only observation found in {input_dir}).")
            return

        # Move each folder to metrics_used
        for folder in folders_to_move:
            source_path = os.path.join(input_dir, folder)
            target_path = os.path.join(metrics_used_dir, folder)

            # Skip if folder already exists in target
            if os.path.exists(target_path):
                print(f"Skipped {folder}: Already exists in {metrics_used_dir}")
                continue

            try:
                shutil.move(source_path, target_path)
                print(f"Moved {folder} to {metrics_used_dir}")
            except Exception as e:
                print(f"Error moving {folder}: {e}")

        # Verify observation remains
        observation_path = os.path.join(input_dir, "observation")
        if os.path.exists(observation_path):
            print(f"Confirmed: observation folder remains in {input_dir}")
        else:
            print(f"Warning: observation folder not found in {input_dir}")

        print("All specified folders moved successfully.")

    except Exception as e:
        print(f"Error accessing directories: {e}")

# Example usage
if __name__ == "__main__":
    move_folders_to_metrics_used(".", ".")

# Stable-Baselines3

In [None]:

class CustomPortfolioEnv(gym.Env):
    """
    Custom Gymnasium environment for portfolio management.

    - Observation: 189-dimensional vector from organized/observation/metrics/{yyyy}_{mm}_combined.csv
    - Action: 14-dimensional weight allocations for 14 assets (normalized in step to sum to 1)
    - Reward: 2 * ROI - 0.7 * volatility - 0.5 * MDD, computed over the next month

    An episode starts at the first loaded observation and advances one month per step.
    """
    def __init__(self, price_dir="metrics_used/price", metrics_dir="organized/observation/metrics"):
        """
        Load the daily prices and monthly observation vectors and define the spaces.

        Parameters:
        price_dir (str): Directory containing clean_data.csv (indexed by a "Date" column)
        metrics_dir (str): Directory containing {yyyy}_{mm}_combined.csv observation files

        Raises:
        FileNotFoundError: If clean_data.csv is not found in price_dir
        ValueError: If no valid observation file is found, or fewer than one step is available
        """
        super().__init__()

        # Define tickers
        self.tickers = [
            'GC=F', 'SI=F', '^DJI', '^IXIC', 'CL=F', '^GSPC', '^STOXX50E',
            '^FCHI', '^FTSE', '^HSI', '000001.SS', '^KS11', '^BSESN', '^NSEI'
        ]

        # Load daily price data for reward calculation
        price_file = os.path.join(price_dir, "clean_data.csv")
        if not os.path.exists(price_file):
            raise FileNotFoundError(f"Price file {price_file} not found.")
        self.daily_prices = pd.read_csv(price_file, index_col="Date", parse_dates=True)

        # Load monthly observations (training window: 2003-01 through 2017-12)
        self.metrics_dir = metrics_dir
        months = pd.date_range(start="2003-01-01", end="2017-12-31", freq="ME")
        self.observations = []
        self.month_files = []
        for month in months:
            file_path = os.path.join(metrics_dir, f"{month.year}_{month.month:02d}_combined.csv")
            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path, header=None)
                    if len(df) == 189:  # Ensure correct observation size
                        self.observations.append(df.iloc[:, 0].values.astype(np.float32))
                        self.month_files.append(file_path)
                    else:
                        print(f"Warning: {file_path} has {len(df)} rows, expected 189. Skipping.")
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

        if not self.observations:
            raise ValueError("No valid observation files found.")

        # Define observation and action spaces
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(189,), dtype=np.float32)
        self.action_space = spaces.Box(low=0, high=1, shape=(14,), dtype=np.float32)

        # Month-end labels of the price data, restricted to 2003-01-01 .. 2024-12-31
        self.monthly_last_days = self.daily_prices.resample("ME").last().index.tolist()
        self.monthly_last_days = [d for d in self.monthly_last_days
                                if d >= pd.Timestamp("2003-01-01") and d <= pd.Timestamp("2024-12-31")]

        # Align observations with price data
        # NOTE(review): observations and month-ends are paired purely by position. If an
        # observation file is missing or skipped above, later observations shift against
        # the price months -- confirm the metrics folder has no gaps.
        self.total_steps = min(len(self.observations), len(self.monthly_last_days) - 1)
        if self.total_steps < 1:
            raise ValueError("Insufficient data for training.")

        # Initialize state
        self.current_step = 0

    def reset(self, seed=None, options=None):
        """
        Reset the environment to the initial state.

        Parameters:
        seed (int): Optional seed for the environment RNG and the global NumPy RNG
        options (dict): Accepted for Gymnasium API compatibility; not used

        Returns:
        tuple: (first observation, empty info dict)
        """
        # Seeds self.np_random as the Gymnasium API expects
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed)
        self.current_step = 0
        obs = self.observations[0]
        info = {}
        return obs, info

    @staticmethod
    def _portfolio_metrics(prices, weights):
        """
        Compute (roi, volatility, mdd) of the weighted portfolio over one price window.

        prices is a (days, assets) array and weights holds one value per asset.
        """
        # NOTE(review): the weights multiply raw price levels, so they behave like share
        # counts rather than capital fractions and high-priced assets dominate the result.
        # The earlier version computed the first-day prices without using them, which
        # hints that normalizing by them was intended -- confirm before changing.
        daily_portfolio_values = np.sum(prices * weights, axis=1)

        # Compute ROI
        if daily_portfolio_values[0] > 0:
            roi = (daily_portfolio_values[-1] / daily_portfolio_values[0]) - 1
        else:
            roi = 0

        # Compute daily returns
        if len(daily_portfolio_values) > 1:
            daily_returns = daily_portfolio_values[1:] / daily_portfolio_values[:-1] - 1
        else:
            daily_returns = np.array([0])

        # Compute volatility
        volatility = np.std(daily_returns) if len(daily_returns) > 0 else 0

        # Compute MDD: largest relative drop from the running maximum
        cummax = np.maximum.accumulate(daily_portfolio_values)
        if cummax[0] > 0:
            drawdown = (cummax - daily_portfolio_values) / cummax
        else:
            drawdown = np.zeros_like(daily_portfolio_values)
        mdd = np.max(drawdown) if len(drawdown) > 0 else 0

        return roi, volatility, mdd

    def step(self, action):
        """
        Take an action (weights) and compute the reward for the next month.

        Returns:
        tuple: (observation, reward, done, truncated, info) where info holds the
        roi, volatility and mdd used in the reward

        Raises:
        ValueError: If called after the episode has ended
        """
        if self.current_step >= self.total_steps:
            raise ValueError("Episode is over")

        # Normalize action to sum to 1
        action_sum = np.sum(action)
        if action_sum > 0:
            weights = action / action_sum
        else:
            weights = np.ones(len(self.tickers)) / len(self.tickers)  # Equal weights if invalid

        # Get start and end dates for the month
        start_date = self.monthly_last_days[self.current_step]
        end_date = self.monthly_last_days[self.current_step + 1]

        # Get daily prices for the period
        try:
            daily_prices = self.daily_prices.loc[start_date:end_date, self.tickers]
        except KeyError:
            daily_prices = None

        if daily_prices is None or daily_prices.empty:
            # No usable prices: give a neutral reward. Feeding all-zero prices through the
            # metric formulas produced a NaN reward (0/0), and an empty window raised
            # IndexError.
            print(f"Warning: Missing price data for period {start_date} to {end_date}. Using zeros for reward.")
            roi, volatility, mdd = 0.0, 0.0, 0.0
        else:
            roi, volatility, mdd = self._portfolio_metrics(daily_prices.values, weights)

        # Compute reward
        reward = float(2 * roi - 0.7 * volatility - 0.5 * mdd)

        # Move to next step
        self.current_step += 1

        # Check if done
        done = self.current_step >= self.total_steps
        truncated = False

        # Get next observation (the last one is repeated once the episode is over)
        obs = self.observations[self.current_step] if not done else self.observations[-1]
        info = {"roi": roi, "volatility": volatility, "mdd": mdd}

        return obs, reward, done, truncated, info

    def render(self):
        """Rendering is not implemented for this environment."""
        pass

def train_rl_agents(price_dir="metrics_used/price", metrics_dir="organized/observation/metrics",
                   output_dir="results", seeds=(1, 2, 3, 4, 5), total_timesteps=10000):
    """
    Train PPO, SAC, DDPG, TD3 models with specified seeds and save results.

    For every seed one vectorized CustomPortfolioEnv is created and shared by the four
    algorithms. Each model is trained, evaluated on that same environment for 5
    episodes and saved. A failure of one model is printed and does not stop the others.

    Parameters:
    price_dir (str): Directory containing clean_data.csv
    metrics_dir (str): Directory containing observation vectors
    output_dir (str): Directory to save models and evaluation
    seeds (iterable of int): Random seeds to train with
    total_timesteps (int): Number of timesteps for training

    Saves:
    - Models: results/{model}_seed_{seed}.zip
    - Evaluation: results/evaluation.csv
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created/Verified output directory: {output_dir}")

    # Initialize evaluation results
    evaluation_results = []

    # Define models
    models = {
        "ppo": PPO,
        "sac": SAC,
        "ddpg": DDPG,
        "td3": TD3
    }

    # Factory passed to make_vec_env so each environment is built with the given paths
    def make_env():
        return CustomPortfolioEnv(price_dir=price_dir, metrics_dir=metrics_dir)

    for seed in seeds:
        print(f"\nTraining with seed {seed}")

        # Set random seed for reproducibility
        np.random.seed(seed)
        random.seed(seed)

        # Create environment
        env = make_vec_env(make_env, n_envs=1, seed=seed)

        try:
            for model_name, model_class in models.items():
                print(f"Training {model_name.upper()}...")

                try:
                    # Initialize model
                    model = model_class(
                        policy="MlpPolicy",
                        env=env,
                        verbose=0,
                        seed=seed
                    )

                    # Train model
                    model.learn(total_timesteps=total_timesteps, progress_bar=True)

                    # Evaluate model
                    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)

                    # Save model
                    model_path = os.path.join(output_dir, f"{model_name}_seed_{seed}.zip")
                    model.save(model_path)
                    print(f"Saved model: {model_path}")

                    # Store evaluation results
                    evaluation_results.append({
                        "model": model_name,
                        "seed": seed,
                        "mean_reward": mean_reward,
                        "std_reward": std_reward
                    })

                except Exception as e:
                    print(f"Error training {model_name} with seed {seed}: {e}")
        finally:
            # A fresh environment is built for every seed, so release this one instead
            # of merely resetting it
            env.close()

    # Save evaluation results
    eval_df = pd.DataFrame(evaluation_results)
    eval_path = os.path.join(output_dir, "evaluation.csv")
    eval_df.to_csv(eval_path, index=False)
    print(f"Saved evaluation results: {eval_path}")

if __name__ == "__main__":
    train_rl_agents(
        price_dir="./metrics_used/price",
        metrics_dir="./observation/metrics",
        output_dir="./results",
        seeds=[1, 2, 3, 4, 5],
        total_timesteps=10000
    )

In [None]:

class CustomPortfolioEnv(gym.Env):
    """
    Custom Gymnasium environment for portfolio management backtesting.

    - Observation: 189-dimensional vector from observation/metrics/{yyyy}_{mm}_combined.csv
    - Action: 14-dimensional weight allocations for 14 assets (normalized in step to sum to 1)
    - Reward: 2 * ROI - 0.7 * volatility - 0.5 * MDD, computed over the next month

    The observation of month k is used to choose the weights applied during month k+1.
    """
    def __init__(self, price_dir="metrics_used/price", metrics_dir="observation/metrics", start_month=None, end_month=None):
        """
        Load the daily prices and monthly observation vectors and define the spaces.

        Parameters:
        price_dir (str): Directory containing clean_data.csv (indexed by a "Date" column)
        metrics_dir (str): Directory containing {yyyy}_{mm}_combined.csv observation files
        start_month: Currently unused
        end_month: Currently unused

        Raises:
        FileNotFoundError: If clean_data.csv is not found in price_dir
        ValueError: If the observation file of any backtest month is missing
        """
        super().__init__()

        # NOTE(review): start_month and end_month are accepted but never read; the
        # backtest period below is hard-coded. Confirm the intended period with the
        # callers before wiring these parameters in.

        # Define tickers
        self.tickers = [
            'GC=F', 'SI=F', '^DJI', '^IXIC', 'CL=F', '^GSPC', '^STOXX50E',
            '^FCHI', '^FTSE', '^HSI', '000001.SS', '^KS11', '^BSESN', '^NSEI'
        ]

        # Load daily price data
        price_file = os.path.join(price_dir, "clean_data.csv")
        if not os.path.exists(price_file):
            raise FileNotFoundError(f"Price file {price_file} not found.")
        self.daily_prices = pd.read_csv(price_file, index_col="Date", parse_dates=True)

        # Load monthly observations into a dictionary keyed by "YYYY-MM"
        months = pd.date_range(start="2003-01-01", end="2024-12-31", freq="ME")
        self.observations = {}
        for month in months:
            file_path = os.path.join(metrics_dir, f"{month.year}_{month.month:02d}_combined.csv")
            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path, header=None)
                    if len(df) == 189:
                        self.observations[month.strftime("%Y-%m")] = df.iloc[:, 0].values.astype(np.float32)
                    else:
                        print(f"Warning: {file_path} has {len(df)} rows, expected 189. Skipping.")
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

        # Month-end labels of the price data and their "YYYY-MM" form
        self.monthly_last_days = self.daily_prices.resample("ME").last().index.tolist()
        self.month_list = [d.strftime("%Y-%m") for d in self.monthly_last_days]

        # Define backtest period: every month whose month-end falls between 2003-01-01
        # and 2024-11-01, i.e. 2003-01 through 2024-10
        self.backtest_months = pd.date_range(start="2003-01-01", end="2024-11-01", freq="ME").strftime("%Y-%m").tolist()
        for month in self.backtest_months:
            if month not in self.observations:
                raise ValueError(f"Observation file for {month} not found.")

        # One step per allocation month; the first month only supplies the initial observation
        self.total_steps = len(self.backtest_months) - 1

        # Define spaces
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(189,), dtype=np.float32)
        self.action_space = spaces.Box(low=0, high=1, shape=(14,), dtype=np.float32)

        # Initialize state
        self.current_step = 0

    def reset(self, seed=None, options=None):
        """
        Reset the environment to the first backtest month.

        Parameters:
        seed (int): Optional seed for the environment RNG and the global NumPy RNG
        options (dict): Accepted for Gymnasium API compatibility; not used

        Returns:
        tuple: (observation of the first backtest month, empty info dict)
        """
        # Seeds self.np_random as the Gymnasium API expects
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed)
        self.current_step = 0
        obs = self.observations[self.backtest_months[0]]
        info = {}
        return obs, info

    @staticmethod
    def _portfolio_metrics(prices, weights):
        """
        Compute (roi, volatility, mdd) of the weighted portfolio over one price window.

        prices is a (days, assets) array and weights holds one value per asset.
        """
        # NOTE(review): the weights multiply raw price levels, so they behave like share
        # counts rather than capital fractions and high-priced assets dominate the result.
        # The earlier version computed the first-day prices without using them, which
        # hints that normalizing by them was intended -- confirm before changing.
        daily_portfolio_values = np.sum(prices * weights, axis=1)

        # Compute ROI
        if daily_portfolio_values[0] > 0:
            roi = (daily_portfolio_values[-1] / daily_portfolio_values[0]) - 1
        else:
            roi = 0

        # Compute daily returns
        if len(daily_portfolio_values) > 1:
            daily_returns = daily_portfolio_values[1:] / daily_portfolio_values[:-1] - 1
        else:
            daily_returns = np.array([0])

        # Compute volatility
        volatility = np.std(daily_returns) if len(daily_returns) > 0 else 0

        # Compute MDD: largest relative drop from the running maximum
        cummax = np.maximum.accumulate(daily_portfolio_values)
        if cummax[0] > 0:
            drawdown = (cummax - daily_portfolio_values) / cummax
        else:
            drawdown = np.zeros_like(daily_portfolio_values)
        mdd = np.max(drawdown) if len(drawdown) > 0 else 0

        return roi, volatility, mdd

    def step(self, action):
        """
        Take an action (weights) and compute the reward for the next month.

        Returns:
        tuple: (observation, reward, done, truncated, info) where info holds the
        allocation month and the roi, volatility and mdd used in the reward

        Raises:
        ValueError: If called after the episode has ended
        """
        if self.current_step >= self.total_steps:
            raise ValueError("Episode is over")

        # Normalize action
        action_sum = np.sum(action)
        if action_sum > 0:
            weights = action / action_sum
        else:
            weights = np.ones(len(self.tickers)) / len(self.tickers)

        # Get allocation month (the month for which these weights apply)
        allocation_month = self.backtest_months[self.current_step + 1]

        # Get start and end dates for the allocation month
        start_date = pd.to_datetime(allocation_month + "-01")
        end_date = start_date + pd.offsets.MonthEnd(0)

        # Get daily prices for the period
        try:
            daily_prices = self.daily_prices.loc[start_date:end_date, self.tickers]
        except KeyError:
            daily_prices = None

        if daily_prices is None or daily_prices.empty:
            # No usable prices: report neutral metrics. Feeding all-zero prices through
            # the metric formulas produced a NaN reward (0/0), and an empty window raised
            # IndexError.
            print(f"Warning: Missing price data for period {start_date} to {end_date}. Using zeros.")
            roi, volatility, mdd = 0.0, 0.0, 0.0
        else:
            roi, volatility, mdd = self._portfolio_metrics(daily_prices.values, weights)

        # Compute reward
        reward = float(2 * roi - 0.7 * volatility - 0.5 * mdd)

        # Advance step
        self.current_step += 1

        # Check done
        done = self.current_step >= self.total_steps
        truncated = False

        # Get next observation
        if not done:
            obs = self.observations[self.backtest_months[self.current_step]]
        else:
            # Terminal placeholder; float32 to match the declared observation space
            obs = np.zeros(189, dtype=np.float32)

        # Info
        info = {
            "allocation_month": allocation_month,
            "roi": roi,
            "volatility": volatility,
            "mdd": mdd
        }

        return obs, reward, done, truncated, info

    def render(self):
        """Rendering is not implemented for this environment."""
        pass

def backtest_model(model, env, output_file):
    """
    Backtest a trained RL model for one full episode, recording monthly allocations and performance.

    Parameters:
    model: Trained model exposing predict(obs, deterministic=True) -> (action, state)
    env: Environment exposing reset(), step(weights), tickers, backtest_months and current_step
    output_file (str): Path to save backtest results CSV

    Saves:
    - CSV with columns: month, weight_{ticker}, roi, volatility, mdd, reward
    """
    obs, _ = env.reset()
    n_assets = len(env.tickers)
    records = []
    finished = False
    while not finished:
        # Predict action
        action, _ = model.predict(obs, deterministic=True)
        # Normalize action so the weights sum to 1; fall back to equal weights
        total = np.sum(action)
        weights = action / total if total > 0 else np.ones(n_assets) / n_assets
        # Record allocation; the month must be read before stepping because
        # env.step() advances env.current_step
        record = {"month": env.backtest_months[env.current_step + 1]}
        record.update({f"weight_{ticker}": weights[i] for i, ticker in enumerate(env.tickers)})
        # Take step
        obs, reward, terminated, truncated, info = env.step(weights)
        # Update record
        record.update({
            "roi": info["roi"],
            "volatility": info["volatility"],
            "mdd": info["mdd"],
            "reward": reward
        })
        records.append(record)
        # Stop on either end-of-episode signal so a truncated episode is
        # never stepped past its end
        finished = terminated or truncated
    # Save results
    df = pd.DataFrame(records)
    df.to_csv(output_file, index=False)
    print(f"Saved backtest results to {output_file}")

def run_backtests(price_dir="metrics_used/price", metrics_dir="observation/metrics",
                 results_dir="results", output_dir="backtest_results", seeds=[1, 2, 3, 4, 5]):
    """
    Backtest every saved (algorithm, seed) model and write one CSV per pair.

    Parameters:
    price_dir (str): Directory passed to CustomPortfolioEnv as price_dir
    metrics_dir (str): Directory passed to CustomPortfolioEnv as metrics_dir
    results_dir (str): Directory containing trained models named {model}_seed_{seed}.zip
    output_dir (str): Directory to save backtest results
    seeds (list): List of random seeds

    Saves:
    - Backtest results: {output_dir}/{model}/seed_{seed}.csv
    """
    # Algorithm name -> Stable Baselines 3 class used to load the checkpoint
    algorithms = {
        "ppo": PPO,
        "sac": SAC,
        "ddpg": DDPG,
        "td3": TD3
    }

    for algo_name, algo_cls in algorithms.items():
        algo_out_dir = os.path.join(output_dir, algo_name)
        os.makedirs(algo_out_dir, exist_ok=True)
        for seed in seeds:
            checkpoint = os.path.join(results_dir, f"{algo_name}_seed_{seed}.zip")
            if os.path.exists(checkpoint):
                # A failure for one pair is reported and does not stop the others
                try:
                    loaded = algo_cls.load(checkpoint)
                    fresh_env = CustomPortfolioEnv(
                        price_dir=price_dir,
                        metrics_dir=metrics_dir
                    )
                    csv_path = os.path.join(algo_out_dir, f"seed_{seed}.csv")
                    backtest_model(loaded, fresh_env, csv_path)
                except Exception as e:
                    print(f"Error backtesting {algo_name} seed {seed}: {e}")
            else:
                print(f"Model {checkpoint} not found. Skipping. Please re-run training for this seed.")

# Entry point: run the backtests with the default directories and seeds.
if __name__ == "__main__":
    run_backtests()

In [None]:

# Define directories and parameters
PRICE_DIR = "metrics_used/price"  # must contain clean_data.csv (daily prices)
METRICS_DIR = "metrics_used/sim_nlp"  # one observation CSV per month, named YYYYMM.csv
RESULTS_DIR = "results_NLP"  # trained models are saved here as {model}_seed_{seed}.zip
BACKTEST_DIR = "backtest_results_NLP"  # backtest CSVs are written to {model}/seed_{seed}.csv
SEEDS = [1, 2, 3, 4, 5]  # one model per algorithm is trained for each seed
TOTAL_TIMESTEPS = 20000  # passed to model.learn() for every model
# Month bounds are "YYYY-MM" strings passed to get_obs_months(), which does
# NOT include the end month itself in the list it returns.
TRAIN_START_MONTH = "2003-01"
TRAIN_END_MONTH = "2017-11"
# The backtest window starts at the same month as training, so it covers the
# training period as well as the months after it.
BACKTEST_START_MONTH = "2003-01"
BACKTEST_END_MONTH = "2024-11"

# Define the list of tickers (14 assets); the order fixes the meaning of each
# action component and of the weight_{ticker} columns in the backtest CSVs.
TICKERS = [
    'GC=F', 'SI=F', '^DJI', '^IXIC', 'CL=F', '^GSPC', '^STOXX50E',
    '^FCHI', '^FTSE', '^HSI', '000001.SS', '^KS11', '^BSESN', '^NSEI'
]

class CustomPortfolioEnv(gym.Env):
    """Custom Gym environment for portfolio optimization with NLP vectors.

    An episode walks through ``obs_months``: at step ``t`` the agent receives
    the observation vector of ``obs_months[t]`` and its allocation is scored on
    the daily prices of ``obs_months[t + 1]``. An episode therefore has
    ``len(obs_months) - 1`` steps.

    NOTE(review): the portfolio value is computed as ``sum(price * weight)``
    on raw price levels, so assets quoted at higher levels dominate the value.
    Confirm this is intended (as opposed to weighting prices normalized by the
    first day of the month) before changing it; trained results depend on it.
    """

    def __init__(self, price_dir, metrics_dir, obs_months):
        """Load daily prices and the per-month observation vectors.

        Parameters:
        price_dir (str): Directory containing clean_data.csv (indexed by a "Date" column)
        metrics_dir (str): Directory containing one observation CSV per month, named YYYYMM.csv
        obs_months (list): Ordered "YYYY-MM" strings defining the episode

        Raises:
        FileNotFoundError: If clean_data.csv is not found in price_dir
        """
        super().__init__()
        self.tickers = TICKERS

        # Load daily price data
        price_file = os.path.join(price_dir, "clean_data.csv")
        if not os.path.exists(price_file):
            raise FileNotFoundError(f"Price file {price_file} not found.")
        self.daily_prices = pd.read_csv(price_file, index_col="Date", parse_dates=True)

        # Load observation vectors. Months whose file is missing, unreadable or
        # malformed are only reported here; looking them up later in reset()
        # or step() raises KeyError.
        self.observations = {}
        for month in obs_months:
            file_path = os.path.join(metrics_dir, f"{month[:4]}{month[5:]}.csv")
            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path)
                    if df.shape[1] == 28:
                        # Only the first data row of the file is used
                        self.observations[month] = df.iloc[0].values.astype(np.float32)
                    else:
                        print(f"Warning: {file_path} has {df.shape[1]} columns, expected 28.")
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
            else:
                print(f"Warning: Observation file for {month} not found.")

        self.obs_months = obs_months
        self.total_steps = len(obs_months) - 1  # Steps = number of observations - 1

        # Define observation and action spaces
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(28,), dtype=np.float32)
        self.action_space = spaces.Box(low=0, high=1, shape=(len(self.tickers),), dtype=np.float32)

        self.current_step = 0

    def reset(self, seed=None, options=None):
        """Reset the environment to the initial state.

        Parameters:
        seed (int | None): Seeds the environment RNG and, as before, NumPy's global RNG
        options (dict | None): Accepted for Gymnasium API compatibility; unused

        Returns:
        tuple: (observation of the first month, empty info dict)
        """
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed)
        self.current_step = 0
        obs = self.observations[self.obs_months[0]]
        return obs, {}

    def step(self, action):
        """Perform one step: allocate weights and compute reward.

        Parameters:
        action (array-like): Non-negative raw weights, one per ticker

        Returns:
        tuple: (next_obs, reward, done, truncated, info) where info holds
        allocation_month, roi, volatility and mdd

        Raises:
        ValueError: If called after the episode has ended
        """
        if self.current_step >= self.total_steps:
            raise ValueError("Episode has ended.")

        # Normalize weights to sum to 1; fall back to equal weights
        action = np.asarray(action)
        n_assets = len(self.tickers)
        action_sum = np.sum(action)
        weights = action / action_sum if action_sum > 0 else np.ones(n_assets) / n_assets

        # Determine the allocation month (next month)
        allocation_month = self.obs_months[self.current_step + 1]
        start_date = pd.to_datetime(allocation_month + "-01")
        end_date = start_date + pd.offsets.MonthEnd(0)

        # Extract daily prices for the allocation month. Slicing by date does
        # not raise when no row falls in the window (it returns an empty
        # frame), so both the KeyError case and the empty case are treated as
        # "no data".
        try:
            daily_prices = self.daily_prices.loc[start_date:end_date, self.tickers]
        except KeyError:
            daily_prices = None

        if daily_prices is None or daily_prices.empty:
            print(f"Warning: Missing price data for {start_date} to {end_date}. Using zeros.")
            # Neutral outcome: computing the metrics on an all-zero series
            # would divide 0 by 0 and turn the reward into NaN
            roi, volatility, mdd = 0.0, 0.0, 0.0
        else:
            daily_values = np.sum(daily_prices.values * weights, axis=1)
            roi, volatility, mdd = self._performance(daily_values)

        # Compute reward
        reward = 2 * roi - 0.7 * volatility - 0.5 * mdd

        # Advance step
        self.current_step += 1
        done = self.current_step >= self.total_steps
        truncated = False

        # Next observation; the terminal placeholder matches the space dtype
        if done:
            next_obs = np.zeros(28, dtype=np.float32)
        else:
            next_obs = self.observations[self.obs_months[self.current_step]]

        info = {"allocation_month": allocation_month, "roi": roi, "volatility": volatility, "mdd": mdd}
        return next_obs, reward, done, truncated, info

    @staticmethod
    def _performance(daily_values):
        """Return (roi, volatility, mdd) for a non-empty series of daily portfolio values."""
        # Return over the whole window
        roi = (daily_values[-1] / daily_values[0] - 1) if daily_values[0] > 0 else 0

        # Standard deviation of day-over-day returns
        if len(daily_values) > 1:
            daily_returns = daily_values[1:] / daily_values[:-1] - 1
        else:
            daily_returns = np.array([0])
        volatility = np.std(daily_returns)

        # Maximum drawdown relative to the running peak
        cummax = np.maximum.accumulate(daily_values)
        drawdown = (cummax - daily_values) / cummax if cummax[0] > 0 else np.zeros_like(daily_values)
        mdd = np.max(drawdown)

        return roi, volatility, mdd

def get_obs_months(start_month, end_month):
    """Generate the month strings from start_month up to, but not including, end_month.

    Parameters:
    start_month (str): First month, as "YYYY-MM"
    end_month (str): Exclusive upper bound, as "YYYY-MM"

    Returns:
    list: "YYYY-MM" strings in chronological order; empty when start_month >= end_month

    NOTE(review): end_month has always been excluded (the previous
    implementation generated month-end dates up to the FIRST day of
    end_month). That output is preserved on purpose because the row-count
    checks elsewhere in this file (261 rows) depend on it. Confirm whether the
    end month was meant to be included.
    """
    # Period arithmetic avoids the "ME" date_range alias, which only exists in
    # pandas >= 2.2, while producing the same months.
    months = pd.period_range(start=start_month, end=end_month, freq="M")
    return months[:-1].strftime("%Y-%m").tolist()

def train_models():
    """Train one model per (seed, algorithm) pair and save each to RESULTS_DIR.

    All models are trained on the same vectorized environment, built from the
    months returned by get_obs_months(TRAIN_START_MONTH, TRAIN_END_MONTH).

    Saves:
    - Trained models: {RESULTS_DIR}/{model}_seed_{seed}.zip
    """
    training_months = get_obs_months(TRAIN_START_MONTH, TRAIN_END_MONTH)

    def build_env():
        return CustomPortfolioEnv(PRICE_DIR, METRICS_DIR, training_months)

    vec_env = make_vec_env(build_env, n_envs=1)
    os.makedirs(RESULTS_DIR, exist_ok=True)

    algorithm_names = ("ppo", "sac", "ddpg", "td3")
    for seed in SEEDS:
        # Seed the global RNGs once per seed, before any model is built
        np.random.seed(seed)
        random.seed(seed)
        for algo in algorithm_names:
            # The algorithm class is looked up by its upper-cased name
            algo_cls = globals()[algo.upper()]
            agent = algo_cls("MlpPolicy", vec_env, verbose=0, seed=seed)
            agent.learn(total_timesteps=TOTAL_TIMESTEPS)
            save_path = os.path.join(RESULTS_DIR, f"{algo}_seed_{seed}.zip")
            agent.save(save_path)
            print(f"Trained and saved {algo} with seed {seed} to {save_path}")

def backtest_model(model, env, output_file):
    """Run backtest for a single model and save results.

    Parameters:
    model: Trained model exposing predict(obs, deterministic=True) -> (action, state)
    env: Environment exposing reset(), step(action), tickers, obs_months and current_step
    output_file (str): Path to save backtest results CSV

    Saves:
    - CSV with columns: month, weight_{ticker}, roi, volatility, mdd, reward
    """
    obs, _ = env.reset()
    # Use the environment's own ticker list so the recorded columns always
    # match the assets the environment actually trades
    tickers = env.tickers
    records = []
    finished = False

    while not finished:
        action, _ = model.predict(obs, deterministic=True)
        total = np.sum(action)
        weights = action / total if total > 0 else np.ones(len(tickers)) / len(tickers)
        # Read the month before stepping: env.step() advances env.current_step
        record = {"month": env.obs_months[env.current_step + 1]}
        record.update({f"weight_{ticker}": weights[i] for i, ticker in enumerate(tickers)})
        # The raw action is passed on; the environment applies the same
        # normalization as the weights recorded above
        obs, reward, terminated, truncated, info = env.step(action)
        record.update({"roi": info["roi"], "volatility": info["volatility"], "mdd": info["mdd"], "reward": reward})
        records.append(record)
        # Stop on either end-of-episode signal
        finished = terminated or truncated

    df = pd.DataFrame(records)
    df.to_csv(output_file, index=False)
    print(f"Backtest results saved to {output_file}")

def run_backtests():
    """Backtest every saved (algorithm, seed) model on one shared environment.

    The environment is built once from the months returned by
    get_obs_months(BACKTEST_START_MONTH, BACKTEST_END_MONTH) and reused for
    every model.

    Saves:
    - Backtest results: {BACKTEST_DIR}/{model}/seed_{seed}.csv
    """
    backtest_months = get_obs_months(BACKTEST_START_MONTH, BACKTEST_END_MONTH)
    shared_env = CustomPortfolioEnv(PRICE_DIR, METRICS_DIR, backtest_months)

    for algo in ("ppo", "sac", "ddpg", "td3"):
        algo_dir = os.path.join(BACKTEST_DIR, algo)
        os.makedirs(algo_dir, exist_ok=True)
        for seed in SEEDS:
            checkpoint = os.path.join(RESULTS_DIR, f"{algo}_seed_{seed}.zip")
            if os.path.exists(checkpoint):
                # A failure for one pair is reported and does not stop the others
                try:
                    # The algorithm class is looked up by its upper-cased name
                    loaded = globals()[algo.upper()].load(checkpoint)
                    csv_path = os.path.join(algo_dir, f"seed_{seed}.csv")
                    backtest_model(loaded, shared_env, csv_path)
                except Exception as e:
                    print(f"Error backtesting {algo} seed {seed}: {e}")
            else:
                print(f"Model {checkpoint} not found. Skipping.")

# Entry point: train every model first, then backtest the saved models.
if __name__ == "__main__":
    print("Starting training phase...")
    train_models()
    print("\nStarting backtesting phase...")
    run_backtests()
    print("Pipeline completed.")

# Meta

In [None]:
import os

def create_meta_folders():
    """
    Ensure the Meta folder and its NLP_obs and Metrics_obs subfolders exist.

    Folders are created relative to the current working directory; existing
    folders are left untouched. Any error is printed rather than raised.

    Parameters:
    None

    Returns:
    None
    """
    root = "Meta"
    # Parent first, then the two subfolders
    targets = (
        root,
        os.path.join(root, "NLP_obs"),
        os.path.join(root, "Metrics_obs"),
    )
    try:
        for folder in targets:
            os.makedirs(folder, exist_ok=True)
            print(f"Created/Verified folder: {folder}")
        print("Folder structure created successfully.")
    except Exception as e:
        print(f"Error creating folders: {e}")

# Entry point: create the Meta folder structure.
if __name__ == "__main__":
    create_meta_folders()

In [None]:
import pandas as pd
import os

def merge_backtest_results(backtest_dir="backtest_results_NLP", output_dir="Meta/NLP_obs",
                           seeds=(1, 2, 3, 4, 5), expected_rows=261):
    """
    Merge backtest CSV files for each agent in backtest_dir, combining all seeds side by side.

    Every non-'month' column of a seed file gets the suffix _seed_{seed}. The
    'month' column is kept once, from the first seed file that is actually
    loaded (not necessarily the first entry of seeds).

    Parameters:
    backtest_dir (str): Directory containing {agent}/seed_{seed}.csv backtest results
    output_dir (str): Directory to save merged CSV files
    seeds (sequence): Seeds to process
    expected_rows (int): Required row count per seed file; files with another count are skipped

    Saves:
    - Merged CSV files in output_dir as {agent}_merged.csv
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created/Verified output directory: {output_dir}")

        # Define agents (subfolders of backtest_dir)
        agents = ["ppo", "sac", "ddpg", "td3"]

        # Process each agent
        for agent in agents:
            agent_dir = os.path.join(backtest_dir, agent)
            if not os.path.exists(agent_dir):
                print(f"Agent directory {agent_dir} not found. Skipping.")
                continue

            # Frames to place side by side, in seed order
            merged_dfs = []

            # Process each seed
            for seed in seeds:
                csv_file = os.path.join(agent_dir, f"seed_{seed}.csv")
                if not os.path.exists(csv_file):
                    print(f"CSV file {csv_file} not found. Skipping seed {seed} for agent {agent}.")
                    continue

                try:
                    df = pd.read_csv(csv_file)

                    # Frames are aligned by row position, so every file must
                    # have the same number of rows
                    if len(df) != expected_rows:
                        print(f"Warning: {csv_file} has {len(df)} rows, expected {expected_rows}. Skipping.")
                        continue

                    # Tag every data column with its seed
                    df = df.rename(columns={col: f"{col}_seed_{seed}" for col in df.columns if col != 'month'})

                    # Keep 'month' only on the first frame actually loaded, so
                    # it survives even when earlier seeds were skipped
                    if merged_dfs:
                        df = df.drop(columns=['month'])

                    merged_dfs.append(df)

                except Exception as e:
                    print(f"Error processing {csv_file}: {e}")
                    continue

            # Check if any DataFrames were loaded
            if not merged_dfs:
                print(f"No valid CSV files found for agent {agent}. Skipping.")
                continue

            # Place the frames side by side
            merged_df = pd.concat(merged_dfs, axis=1)

            # Every seed should contribute as many data columns as the first one
            data_columns = sum(col != 'month' for col in merged_dfs[0].columns)
            expected_columns = 1 + data_columns * len(merged_dfs)  # 1 for 'month'
            if len(merged_df.columns) != expected_columns:
                print(f"Warning: Merged DataFrame for {agent} has {len(merged_df.columns)} columns, expected {expected_columns}.")

            # Save the merged DataFrame
            output_file = os.path.join(output_dir, f"{agent}_merged.csv")
            merged_df.to_csv(output_file, index=False)
            print(f"Merged CSV for agent {agent} saved to {output_file} with {len(merged_df)} rows and {len(merged_df.columns)} columns.")

        print("Merging process completed successfully.")

    except Exception as e:
        print(f"Error during merging process: {e}")

# Entry point: merge the per-seed backtest CSVs using the default directories.
if __name__ == "__main__":
    merge_backtest_results()

In [None]:
import pandas as pd
import os

def merge_nlp_obs_results(input_dir="Meta/NLP_obs", output_dir="Meta/NLP_obs", expected_rows=261):
    """
    Merge the agent-specific merged CSV files in input_dir into a single CSV file.

    Every non-'month' column of an agent file gets the suffix _{agent}. The
    'month' column is kept once, from the first agent file that is actually
    loaded (not necessarily "ppo").

    Parameters:
    input_dir (str): Directory containing the {agent}_merged.csv files
    output_dir (str): Directory to save the final merged CSV
    expected_rows (int): Required row count per agent file; files with another count are skipped

    Saves:
    - Merged CSV file as nlp_obs_unclean.csv in output_dir
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created/Verified output directory: {output_dir}")

        # Define agents
        agents = ["ppo", "sac", "ddpg", "td3"]

        # Frames to place side by side, in agent order
        merged_dfs = []

        # Process each agent's merged CSV
        for agent in agents:
            csv_file = os.path.join(input_dir, f"{agent}_merged.csv")
            if not os.path.exists(csv_file):
                print(f"CSV file {csv_file} not found. Skipping agent {agent}.")
                continue

            try:
                df = pd.read_csv(csv_file)

                # Frames are aligned by row position, so every file must have
                # the same number of rows
                if len(df) != expected_rows:
                    print(f"Warning: {csv_file} has {len(df)} rows, expected {expected_rows}. Skipping.")
                    continue

                # Tag every data column with its agent
                df = df.rename(columns={col: f"{col}_{agent}" for col in df.columns if col != 'month'})

                # Keep 'month' only on the first frame actually loaded, so it
                # survives even when earlier agents were skipped
                if merged_dfs:
                    df = df.drop(columns=['month'])

                merged_dfs.append(df)

            except Exception as e:
                print(f"Error processing {csv_file}: {e}")
                continue

        # Check if any DataFrames were loaded
        if not merged_dfs:
            print("No valid CSV files found to merge.")
            return

        # Place the frames side by side
        merged_df = pd.concat(merged_dfs, axis=1)

        # Every agent should contribute as many data columns as the first one
        data_columns = sum(col != 'month' for col in merged_dfs[0].columns)
        expected_columns = 1 + data_columns * len(merged_dfs)  # 1 for 'month'
        if len(merged_df.columns) != expected_columns:
            print(f"Warning: Merged DataFrame has {len(merged_df.columns)} columns, expected {expected_columns}.")

        # Save the final merged DataFrame
        output_file = os.path.join(output_dir, "nlp_obs_unclean.csv")
        merged_df.to_csv(output_file, index=False)
        print(f"Final merged CSV saved to {output_file} with {len(merged_df)} rows and {len(merged_df.columns)} columns.")

        print("Merging process completed successfully.")

    except Exception as e:
        print(f"Error during merging process: {e}")

# Entry point: merge the per-agent CSVs using the default directories.
if __name__ == "__main__":
    merge_nlp_obs_results()

In [None]:
import pandas as pd
import os

def clean_nlp_obs_results(input_file="Meta/NLP_obs/nlp_obs_unclean.csv", output_file="Meta/NLP_obs/nlp_obs_clean.csv",
                          expected_rows=261):
    """
    Clean the merged NLP observation CSV by keeping only the 'month' and weight columns.

    Shape mismatches are reported as warnings only; the cleaned file is still written.

    Parameters:
    input_file (str): Path to the input merged CSV file (nlp_obs_unclean.csv)
    output_file (str): Path to save the cleaned CSV file (nlp_obs_clean.csv)
    expected_rows (int): Row count to check against; defaults to the 261 rows
        that the merge steps in this file require of their inputs

    Saves:
    - Cleaned CSV file with only 'month' and weight columns in output_file
    """
    try:
        # Read the input CSV
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file {input_file} not found.")

        df = pd.read_csv(input_file)

        if len(df) != expected_rows:
            print(f"Warning: {input_file} has {len(df)} rows, expected {expected_rows}.")

        # 1 month + (14 weights + roi, volatility, mdd, reward) × 5 seeds × 4 agents = 361
        expected_columns = 1 + (14 + 4) * 5 * 4
        if len(df.columns) != expected_columns:
            print(f"Warning: {input_file} has {len(df.columns)} columns, expected {expected_columns}.")

        # Identify columns to keep: 'month' and all columns containing 'weight'
        columns_to_keep = ['month'] + [col for col in df.columns if 'weight' in col]

        # Create the cleaned DataFrame
        cleaned_df = df[columns_to_keep]

        # Verify the number of columns (should be 1 + (14 weights × 5 seeds × 4 agents) = 281)
        expected_cleaned_columns = 1 + (14 * 5 * 4)  # 1 month + 14 weights × 5 seeds × 4 agents
        if len(cleaned_df.columns) != expected_cleaned_columns:
            print(f"Warning: Cleaned DataFrame has {len(cleaned_df.columns)} columns, expected {expected_cleaned_columns}.")

        # Save the cleaned DataFrame; a bare filename has no directory to create
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        cleaned_df.to_csv(output_file, index=False)
        print(f"Cleaned CSV saved to {output_file} with {len(cleaned_df)} rows and {len(cleaned_df.columns)} columns.")

        print("Cleaning process completed successfully.")

    except Exception as e:
        print(f"Error during cleaning process: {e}")

# Entry point: clean the merged NLP observation CSV using the default paths.
if __name__ == "__main__":
    clean_nlp_obs_results()

In [None]:
import pandas as pd
import os

def merge_backtest_results(backtest_dir="backtest_results", output_dir="Meta/Metrics_obs",
                           seeds=(1, 2, 3, 4, 5), expected_rows=261):
    """
    Merge backtest CSV files for each agent in backtest_dir, combining all seeds side by side.

    Every non-'month' column of a seed file gets the suffix _seed_{seed}. The
    'month' column is kept once, from the first seed file that is actually
    loaded (not necessarily the first entry of seeds).

    Parameters:
    backtest_dir (str): Directory containing {agent}/seed_{seed}.csv backtest results (backtest_results)
    output_dir (str): Directory to save merged CSV files (Meta/Metrics_obs)
    seeds (sequence): Seeds to process
    expected_rows (int): Required row count per seed file; files with another count are skipped

    Saves:
    - Merged CSV files in output_dir as {agent}_merged.csv
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created/Verified output directory: {output_dir}")

        # Define agents (subfolders of backtest_dir)
        agents = ["ppo", "sac", "ddpg", "td3"]

        # Process each agent
        for agent in agents:
            agent_dir = os.path.join(backtest_dir, agent)
            if not os.path.exists(agent_dir):
                print(f"Agent directory {agent_dir} not found. Skipping.")
                continue

            # Frames to place side by side, in seed order
            merged_dfs = []

            # Process each seed
            for seed in seeds:
                csv_file = os.path.join(agent_dir, f"seed_{seed}.csv")
                if not os.path.exists(csv_file):
                    print(f"CSV file {csv_file} not found. Skipping seed {seed} for agent {agent}.")
                    continue

                try:
                    df = pd.read_csv(csv_file)

                    # Frames are aligned by row position, so every file must
                    # have the same number of rows
                    if len(df) != expected_rows:
                        print(f"Warning: {csv_file} has {len(df)} rows, expected {expected_rows}. Skipping.")
                        continue

                    # Tag every data column with its seed
                    df = df.rename(columns={col: f"{col}_seed_{seed}" for col in df.columns if col != 'month'})

                    # Keep 'month' only on the first frame actually loaded, so
                    # it survives even when earlier seeds were skipped
                    if merged_dfs:
                        df = df.drop(columns=['month'])

                    merged_dfs.append(df)

                except Exception as e:
                    print(f"Error processing {csv_file}: {e}")
                    continue

            # Check if any DataFrames were loaded
            if not merged_dfs:
                print(f"No valid CSV files found for agent {agent}. Skipping.")
                continue

            # Place the frames side by side
            merged_df = pd.concat(merged_dfs, axis=1)

            # Every seed should contribute as many data columns as the first one
            data_columns = sum(col != 'month' for col in merged_dfs[0].columns)
            expected_columns = 1 + data_columns * len(merged_dfs)  # 1 for 'month'
            if len(merged_df.columns) != expected_columns:
                print(f"Warning: Merged DataFrame for {agent} has {len(merged_df.columns)} columns, expected {expected_columns}.")

            # Save the merged DataFrame
            output_file = os.path.join(output_dir, f"{agent}_merged.csv")
            merged_df.to_csv(output_file, index=False)
            print(f"Merged CSV for agent {agent} saved to {output_file} with {len(merged_df)} rows and {len(merged_df.columns)} columns.")

        print("Merging process completed successfully.")

    except Exception as e:
        print(f"Error during merging process: {e}")

# Entry point: merge the per-seed backtest CSVs using the default directories.
if __name__ == "__main__":
    merge_backtest_results()

In [None]:
import pandas as pd
import os

def merge_nlp_obs_results(input_dir="Meta/Metrics_obs", output_dir="Meta/Metrics_obs", expected_rows=261):
    """
    Merge the agent-specific merged CSV files in input_dir into a single CSV file.

    Every non-'month' column of an agent file gets the suffix _{agent}. The
    'month' column is kept once, from the first agent file that is actually
    loaded (not necessarily "ppo").

    Parameters:
    input_dir (str): Directory containing the {agent}_merged.csv files (Meta/Metrics_obs)
    output_dir (str): Directory to save the final merged CSV (Meta/Metrics_obs)
    expected_rows (int): Required row count per agent file; files with another count are skipped

    Saves:
    - Merged CSV file as nlp_obs_unclean.csv in output_dir (the file name is
      the same as in the NLP pipeline; only the directory differs)
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created/Verified output directory: {output_dir}")

        # Define agents
        agents = ["ppo", "sac", "ddpg", "td3"]

        # Frames to place side by side, in agent order
        merged_dfs = []

        # Process each agent's merged CSV
        for agent in agents:
            csv_file = os.path.join(input_dir, f"{agent}_merged.csv")
            if not os.path.exists(csv_file):
                print(f"CSV file {csv_file} not found. Skipping agent {agent}.")
                continue

            try:
                df = pd.read_csv(csv_file)

                # Frames are aligned by row position, so every file must have
                # the same number of rows
                if len(df) != expected_rows:
                    print(f"Warning: {csv_file} has {len(df)} rows, expected {expected_rows}. Skipping.")
                    continue

                # Tag every data column with its agent
                df = df.rename(columns={col: f"{col}_{agent}" for col in df.columns if col != 'month'})

                # Keep 'month' only on the first frame actually loaded, so it
                # survives even when earlier agents were skipped
                if merged_dfs:
                    df = df.drop(columns=['month'])

                merged_dfs.append(df)

            except Exception as e:
                print(f"Error processing {csv_file}: {e}")
                continue

        # Check if any DataFrames were loaded
        if not merged_dfs:
            print("No valid CSV files found to merge.")
            return

        # Place the frames side by side
        merged_df = pd.concat(merged_dfs, axis=1)

        # Every agent should contribute as many data columns as the first one
        data_columns = sum(col != 'month' for col in merged_dfs[0].columns)
        expected_columns = 1 + data_columns * len(merged_dfs)  # 1 for 'month'
        if len(merged_df.columns) != expected_columns:
            print(f"Warning: Merged DataFrame has {len(merged_df.columns)} columns, expected {expected_columns}.")

        # Save the final merged DataFrame
        output_file = os.path.join(output_dir, "nlp_obs_unclean.csv")
        merged_df.to_csv(output_file, index=False)
        print(f"Final merged CSV saved to {output_file} with {len(merged_df)} rows and {len(merged_df.columns)} columns.")

        print("Merging process completed successfully.")

    except Exception as e:
        print(f"Error during merging process: {e}")

# Entry point: merge the per-agent CSVs using the default directories.
if __name__ == "__main__":
    merge_nlp_obs_results()

In [None]:
import pandas as pd
import os

def clean_nlp_obs_results(input_file="Meta/Metrics_obs/nlp_obs_unclean.csv", output_file="Meta/Metrics_obs/metrics_obs_clean.csv",
                          expected_rows=261):
    """
    Clean the merged metrics observation CSV by keeping only the 'month' and weight columns.

    Shape mismatches are reported as warnings only; the cleaned file is still written.

    Parameters:
    input_file (str): Path to the input merged CSV file (Meta/Metrics_obs/nlp_obs_unclean.csv)
    output_file (str): Path to save the cleaned CSV file (Meta/Metrics_obs/metrics_obs_clean.csv)
    expected_rows (int): Row count to check against; defaults to the 261 rows
        that the merge steps in this file require of their inputs

    Saves:
    - Cleaned CSV file with only 'month' and weight columns in output_file
    """
    try:
        # Read the input CSV
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file {input_file} not found.")

        df = pd.read_csv(input_file)

        if len(df) != expected_rows:
            print(f"Warning: {input_file} has {len(df)} rows, expected {expected_rows}.")

        # 1 month + (14 weights + roi, volatility, mdd, reward) × 5 seeds × 4 agents = 361
        expected_columns = 1 + (14 + 4) * 5 * 4
        if len(df.columns) != expected_columns:
            print(f"Warning: {input_file} has {len(df.columns)} columns, expected {expected_columns}.")

        # Identify columns to keep: 'month' and all columns containing 'weight'
        columns_to_keep = ['month'] + [col for col in df.columns if 'weight' in col]

        # Create the cleaned DataFrame
        cleaned_df = df[columns_to_keep]

        # Verify the number of columns (should be 1 + (14 weights × 5 seeds × 4 agents) = 281)
        expected_cleaned_columns = 1 + (14 * 5 * 4)  # 1 month + 14 weights × 5 seeds × 4 agents
        if len(cleaned_df.columns) != expected_cleaned_columns:
            print(f"Warning: Cleaned DataFrame has {len(cleaned_df.columns)} columns, expected {expected_cleaned_columns}.")

        # Save the cleaned DataFrame; a bare filename has no directory to create
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        cleaned_df.to_csv(output_file, index=False)
        print(f"Cleaned CSV saved to {output_file} with {len(cleaned_df)} rows and {len(cleaned_df.columns)} columns.")

        print("Cleaning process completed successfully.")

    except Exception as e:
        print(f"Error during cleaning process: {e}")

# Entry point: clean the merged metrics observation CSV using the default paths.
if __name__ == "__main__":
    clean_nlp_obs_results()

In [None]:
import pandas as pd
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import os

# Ticker symbols used by the environment and the backtest in this cell.
# The order matters: position i of the action/weight vector refers to
# TICKERS[i].
TICKERS = [
    'GC=F',
    'SI=F',
    '^DJI',
    '^IXIC',
    'CL=F',
    '^GSPC',
    '^STOXX50E',
    '^FCHI',
    '^FTSE',
    '^HSI',
    '000001.SS',
    '^KS11',
    '^BSESN',
    '^NSEI',
]

class CustomMetaEnv(gym.Env):
    """Custom Gym environment for meta-agent portfolio optimization.

    One step is one month listed in ``csv_path``. The observation is that
    month's row (every column after 'month'), the action holds one logit per
    ticker that is turned into allocation weights with a softmax, and the
    reward is ``2 * roi - 0.7 * volatility - 0.5 * mdd`` computed from the
    daily prices of that month.

    NOTE(review): the portfolio value is ``sum(weight * price)`` on the raw
    price columns, so tickers with large price levels dominate unless the
    price file is already normalised per ticker - TODO confirm.
    """

    def __init__(self, csv_path="Meta/Metrics_obs/metrics_obs_clean.csv", price_path="metrics_used/price/clean_data.csv"):
        """
        Load the observations and the prices and define the spaces.

        Parameters:
        csv_path (str): CSV with a 'month' column followed by the observation columns
        price_path (str): Price CSV indexed by a 'Date' column, one column per ticker

        Raises:
        ValueError: If a ticker is missing from the price data or the
            observation CSV has no rows
        """
        super().__init__()

        # Load observation data: first column is the month, the rest is the observation vector
        df = pd.read_csv(csv_path)
        self.months = df['month'].tolist()
        self.observations = df.iloc[:, 1:].astype(np.float32).values

        # Load price data; a sorted index keeps the per-month date slice in
        # step() valid even when the CSV rows are not in chronological order
        self.price_data = pd.read_csv(price_path, index_col="Date", parse_dates=True).sort_index()
        self.tickers = TICKERS

        # Validate tickers
        for ticker in self.tickers:
            if ticker not in self.price_data.columns:
                raise ValueError(f"Ticker {ticker} not in price data")

        self.total_steps = len(self.months)
        if self.total_steps == 0:
            # Fail early with a clear message instead of an IndexError in reset()
            raise ValueError(f"No observation rows found in {csv_path}")

        # Derive the shapes from the data so the declared spaces always match
        # what reset()/step() actually return
        self.obs_dim = self.observations.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32)
        # Finite bounds on the logits
        self.action_space = spaces.Box(low=-10, high=10, shape=(len(self.tickers),), dtype=np.float32)

        self.current_step = 0

    @staticmethod
    def _softmax(action):
        """Return softmax(action); the max is subtracted first so large logits cannot overflow."""
        logits = np.asarray(action)
        exp_logits = np.exp(logits - np.max(logits))
        return exp_logits / np.sum(exp_logits)

    @staticmethod
    def _finite_or_zero(value):
        """Return value as a float, or 0.0 when it is NaN/inf, so the reward stays finite."""
        value = float(value)
        return value if np.isfinite(value) else 0.0

    def reset(self, seed=None, options=None):
        """
        Start a new episode at the first month.

        Parameters:
        seed (int or None): Seed forwarded to gym.Env.reset and to NumPy's global RNG
        options (dict or None): Accepted for Gymnasium API compatibility; not used

        Returns:
        tuple: (first observation, empty info dict)
        """
        super().reset(seed=seed)
        if seed is not None:
            # Kept from the original implementation, which seeded the global RNG
            np.random.seed(seed)
        self.current_step = 0
        obs = self.observations[0]
        return obs, {}

    def step(self, action):
        """
        Apply the allocation for the current month and move to the next one.

        Parameters:
        action (array-like): One logit per ticker

        Returns:
        tuple: (next_obs, reward, terminated, truncated, info) where info
            holds 'allocation_month', 'roi', 'volatility' and 'mdd'

        Raises:
        ValueError: If called after the last month has been processed
        """
        if self.current_step >= self.total_steps:
            raise ValueError("Episode has ended.")

        # Normalize action to sum to 1 using softmax
        weights = self._softmax(action)

        # Current month and its first/last calendar day
        month = self.months[self.current_step]
        start_date = pd.to_datetime(f"{month}-01")
        end_date = start_date + pd.offsets.MonthEnd(0)

        # Extract daily prices; a date slice returns an empty frame (it does
        # not raise) when the month has no rows, so both cases are handled
        try:
            daily_prices = self.price_data.loc[start_date:end_date, self.tickers]
        except KeyError:
            daily_prices = pd.DataFrame(columns=self.tickers)

        if daily_prices.empty:
            print(f"Warning: Missing price data for {start_date} to {end_date}. Using zeros.")
            roi = volatility = mdd = 0.0
        else:
            # Daily portfolio values
            daily_values = (daily_prices * weights).sum(axis=1)

            # ROI over the month
            first_value = daily_values.iloc[0]
            roi = (daily_values.iloc[-1] / first_value - 1) if first_value > 0 else 0

            # Volatility needs at least two returns for a sample standard deviation
            daily_returns = daily_values.pct_change().dropna()
            volatility = daily_returns.std() if len(daily_returns) > 1 else 0

            # Maximum Drawdown (MDD); 0/0 yields NaN, which is mapped to 0 below
            cummax = daily_values.cummax()
            drawdown = (cummax - daily_values) / cummax
            mdd = drawdown.max()

            roi = self._finite_or_zero(roi)
            volatility = self._finite_or_zero(volatility)
            mdd = self._finite_or_zero(mdd)

        # Compute reward
        reward = 2 * roi - 0.7 * volatility - 0.5 * mdd

        # Advance step
        self.current_step += 1
        terminated = self.current_step >= self.total_steps
        truncated = False

        # Next observation; zeros (same dtype/shape as the space) after the last month
        if terminated:
            next_obs = np.zeros(self.obs_dim, dtype=np.float32)
        else:
            next_obs = self.observations[self.current_step]

        info = {"allocation_month": month, "roi": roi, "volatility": volatility, "mdd": mdd}
        return next_obs, reward, terminated, truncated, info

    def render(self):
        """No rendering is implemented."""
        pass

def train_meta_agent():
    """
    Train a PPO meta-agent on CustomMetaEnv and save it.

    Uses a single vectorized environment, seed 1, a 256-256-256 MLP and
    20000 training timesteps; the model is saved to
    meta_results/meta_agent_seed1.

    Returns:
    PPO: The trained model
    """
    def _build_env():
        # Factory handed to make_vec_env; builds the env with its defaults
        return CustomMetaEnv()

    vec_env = make_vec_env(_build_env, n_envs=1, seed=1)

    ppo_settings = {
        "policy_kwargs": {'net_arch': [256, 256, 256]},
        "verbose": 1,
        "seed": 1,
    }
    agent = PPO("MlpPolicy", vec_env, **ppo_settings)
    agent.learn(total_timesteps=20000)

    # Persist the trained model
    save_dir = "meta_results"
    os.makedirs(save_dir, exist_ok=True)
    model_path = "meta_results/meta_agent_seed1"
    agent.save(model_path)
    print(f"Meta-agent trained and saved to {model_path}.zip")
    return agent

def backtest_meta_agent(model, env, output_file="Meta/Metrics_obs/backtest_meta_seed_1.csv"):
    """
    Backtest the trained meta-agent over one episode and save the results.

    Parameters:
    model: Object with a predict(obs, deterministic=True) method returning (action, state)
    env: Environment exposing months, current_step, reset() and step(); its
        'tickers' attribute names the weight columns (falls back to TICKERS)
    output_file (str): Path of the CSV to write

    Saves:
    - One row per month with 'month', one 'weight_<ticker>' column per ticker,
      'roi', 'volatility', 'mdd' and 'reward'
    """
    # Use the ticker order of the environment that actually consumes the action
    tickers = env.tickers if hasattr(env, "tickers") else TICKERS

    obs, _ = env.reset()
    done = False
    records = []

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        # Normalize action to sum to 1 using softmax (max subtracted to avoid overflow)
        logits = np.asarray(action)
        exp_logits = np.exp(logits - np.max(logits))
        weights = exp_logits / np.sum(exp_logits)

        # Read the month before step() advances current_step
        record = {"month": env.months[env.current_step]}
        for i, ticker in enumerate(tickers):
            record[f"weight_{ticker}"] = weights[i]

        obs, reward, terminated, truncated, info = env.step(action)
        # Either flag ends the episode; ignoring 'truncated' could loop forever
        done = terminated or truncated

        record.update({
            "roi": info["roi"],
            "volatility": info["volatility"],
            "mdd": info["mdd"],
            "reward": reward
        })
        records.append(record)

    # Save backtest results
    df = pd.DataFrame(records)
    output_dir = os.path.dirname(output_file)
    if output_dir:
        # dirname is '' for a bare filename, which os.makedirs rejects
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_file, index=False)
    print(f"Backtest results saved to {output_file} with {len(df)} rows and {len(df.columns)} columns.")

def run_pipeline():
    """Train the meta-agent, then backtest it on a freshly built environment."""
    print("Starting training phase...")
    trained_model = train_meta_agent()

    print("\nStarting backtesting phase...")
    # A new, non-vectorized environment is used for the backtest
    backtest_meta_agent(trained_model, CustomMetaEnv())

    print("Pipeline completed.")


# Run the whole pipeline when the cell/module is executed directly.
if __name__ == "__main__":
    run_pipeline()

In [None]:
import pandas as pd
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import os

# Ticker symbols used by the environment and the backtest in this cell.
# The order matters: position i of the action/weight vector refers to
# TICKERS[i].
TICKERS = [
    'GC=F',
    'SI=F',
    '^DJI',
    '^IXIC',
    'CL=F',
    '^GSPC',
    '^STOXX50E',
    '^FCHI',
    '^FTSE',
    '^HSI',
    '000001.SS',
    '^KS11',
    '^BSESN',
    '^NSEI',
]

class CustomMetaEnv(gym.Env):
    """Custom Gym environment for meta-agent portfolio optimization (NLP observations).

    One step is one month listed in ``csv_path``. The observation is that
    month's row (every column after 'month'), the action holds one logit per
    ticker that is turned into allocation weights with a softmax, and the
    reward is ``2 * roi - 0.7 * volatility - 0.5 * mdd`` computed from the
    daily prices of that month.

    NOTE(review): the portfolio value is ``sum(weight * price)`` on the raw
    price columns, so tickers with large price levels dominate unless the
    price file is already normalised per ticker - TODO confirm.
    """

    def __init__(self, csv_path="Meta/NLP_obs/nlp_obs_clean.csv", price_path="metrics_used/price/clean_data.csv"):
        """
        Load the observations and the prices and define the spaces.

        Parameters:
        csv_path (str): CSV with a 'month' column followed by the observation columns
        price_path (str): Price CSV indexed by a 'Date' column, one column per ticker

        Raises:
        ValueError: If a ticker is missing from the price data or the
            observation CSV has no rows
        """
        super().__init__()

        # Load observation data: first column is the month, the rest is the observation vector
        df = pd.read_csv(csv_path)
        self.months = df['month'].tolist()
        self.observations = df.iloc[:, 1:].astype(np.float32).values

        # Load price data; a sorted index keeps the per-month date slice in
        # step() valid even when the CSV rows are not in chronological order
        self.price_data = pd.read_csv(price_path, index_col="Date", parse_dates=True).sort_index()
        self.tickers = TICKERS

        # Validate tickers
        for ticker in self.tickers:
            if ticker not in self.price_data.columns:
                raise ValueError(f"Ticker {ticker} not in price data")

        self.total_steps = len(self.months)
        if self.total_steps == 0:
            # Fail early with a clear message instead of an IndexError in reset()
            raise ValueError(f"No observation rows found in {csv_path}")

        # Derive the shapes from the data so the declared spaces always match
        # what reset()/step() actually return
        self.obs_dim = self.observations.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32)
        # Finite bounds on the logits
        self.action_space = spaces.Box(low=-10, high=10, shape=(len(self.tickers),), dtype=np.float32)

        self.current_step = 0

    @staticmethod
    def _softmax(action):
        """Return softmax(action); the max is subtracted first so large logits cannot overflow."""
        logits = np.asarray(action)
        exp_logits = np.exp(logits - np.max(logits))
        return exp_logits / np.sum(exp_logits)

    @staticmethod
    def _finite_or_zero(value):
        """Return value as a float, or 0.0 when it is NaN/inf, so the reward stays finite."""
        value = float(value)
        return value if np.isfinite(value) else 0.0

    def reset(self, seed=None, options=None):
        """
        Start a new episode at the first month.

        Parameters:
        seed (int or None): Seed forwarded to gym.Env.reset and to NumPy's global RNG
        options (dict or None): Accepted for Gymnasium API compatibility; not used

        Returns:
        tuple: (first observation, empty info dict)
        """
        super().reset(seed=seed)
        if seed is not None:
            # Kept from the original implementation, which seeded the global RNG
            np.random.seed(seed)
        self.current_step = 0
        obs = self.observations[0]
        return obs, {}

    def step(self, action):
        """
        Apply the allocation for the current month and move to the next one.

        Parameters:
        action (array-like): One logit per ticker

        Returns:
        tuple: (next_obs, reward, terminated, truncated, info) where info
            holds 'allocation_month', 'roi', 'volatility' and 'mdd'

        Raises:
        ValueError: If called after the last month has been processed
        """
        if self.current_step >= self.total_steps:
            raise ValueError("Episode has ended.")

        # Normalize action to sum to 1 using softmax
        weights = self._softmax(action)

        # Current month and its first/last calendar day
        month = self.months[self.current_step]
        start_date = pd.to_datetime(f"{month}-01")
        end_date = start_date + pd.offsets.MonthEnd(0)

        # Extract daily prices; a date slice returns an empty frame (it does
        # not raise) when the month has no rows, so both cases are handled
        try:
            daily_prices = self.price_data.loc[start_date:end_date, self.tickers]
        except KeyError:
            daily_prices = pd.DataFrame(columns=self.tickers)

        if daily_prices.empty:
            print(f"Warning: Missing price data for {start_date} to {end_date}. Using zeros.")
            roi = volatility = mdd = 0.0
        else:
            # Daily portfolio values
            daily_values = (daily_prices * weights).sum(axis=1)

            # ROI over the month
            first_value = daily_values.iloc[0]
            roi = (daily_values.iloc[-1] / first_value - 1) if first_value > 0 else 0

            # Volatility needs at least two returns for a sample standard deviation
            daily_returns = daily_values.pct_change().dropna()
            volatility = daily_returns.std() if len(daily_returns) > 1 else 0

            # Maximum Drawdown (MDD); 0/0 yields NaN, which is mapped to 0 below
            cummax = daily_values.cummax()
            drawdown = (cummax - daily_values) / cummax
            mdd = drawdown.max()

            roi = self._finite_or_zero(roi)
            volatility = self._finite_or_zero(volatility)
            mdd = self._finite_or_zero(mdd)

        # Compute reward
        reward = 2 * roi - 0.7 * volatility - 0.5 * mdd

        # Advance step
        self.current_step += 1
        terminated = self.current_step >= self.total_steps
        truncated = False

        # Next observation; zeros (same dtype/shape as the space) after the last month
        if terminated:
            next_obs = np.zeros(self.obs_dim, dtype=np.float32)
        else:
            next_obs = self.observations[self.current_step]

        info = {"allocation_month": month, "roi": roi, "volatility": volatility, "mdd": mdd}
        return next_obs, reward, terminated, truncated, info

    def render(self):
        """No rendering is implemented."""
        pass

def train_meta_agent():
    """
    Train a PPO meta-agent on the NLP-observation CustomMetaEnv and save it.

    Uses a single vectorized environment, seed 1, a 256-256-256 MLP and
    20000 training timesteps; the model is saved to
    meta_results/meta_agent_seed1_nlp.

    Returns:
    PPO: The trained model
    """
    def _build_env():
        # Factory handed to make_vec_env; builds the env with its defaults
        return CustomMetaEnv()

    vec_env = make_vec_env(_build_env, n_envs=1, seed=1)

    ppo_settings = {
        "policy_kwargs": {'net_arch': [256, 256, 256]},
        "verbose": 1,
        "seed": 1,
    }
    agent = PPO("MlpPolicy", vec_env, **ppo_settings)
    agent.learn(total_timesteps=20000)

    # Persist the trained model
    save_dir = "meta_results"
    os.makedirs(save_dir, exist_ok=True)
    model_path = "meta_results/meta_agent_seed1_nlp"
    agent.save(model_path)
    print(f"Meta-agent trained and saved to {model_path}.zip")
    return agent

def backtest_meta_agent(model, env, output_file="Meta/NLP_obs/backtest_meta_seed_1.csv"):
    """
    Backtest the trained meta-agent over one episode and save the results.

    Parameters:
    model: Object with a predict(obs, deterministic=True) method returning (action, state)
    env: Environment exposing months, current_step, reset() and step(); its
        'tickers' attribute names the weight columns (falls back to TICKERS)
    output_file (str): Path of the CSV to write

    Saves:
    - One row per month with 'month', one 'weight_<ticker>' column per ticker,
      'roi', 'volatility', 'mdd' and 'reward'
    """
    # Use the ticker order of the environment that actually consumes the action
    tickers = env.tickers if hasattr(env, "tickers") else TICKERS

    obs, _ = env.reset()
    done = False
    records = []

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        # Normalize action to sum to 1 using softmax (max subtracted to avoid overflow)
        logits = np.asarray(action)
        exp_logits = np.exp(logits - np.max(logits))
        weights = exp_logits / np.sum(exp_logits)

        # Read the month before step() advances current_step
        record = {"month": env.months[env.current_step]}
        for i, ticker in enumerate(tickers):
            record[f"weight_{ticker}"] = weights[i]

        obs, reward, terminated, truncated, info = env.step(action)
        # Either flag ends the episode; ignoring 'truncated' could loop forever
        done = terminated or truncated

        record.update({
            "roi": info["roi"],
            "volatility": info["volatility"],
            "mdd": info["mdd"],
            "reward": reward
        })
        records.append(record)

    # Save backtest results
    df = pd.DataFrame(records)
    output_dir = os.path.dirname(output_file)
    if output_dir:
        # dirname is '' for a bare filename, which os.makedirs rejects
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_file, index=False)
    print(f"Backtest results saved to {output_file} with {len(df)} rows and {len(df.columns)} columns.")

def run_pipeline():
    """Train the meta-agent, then backtest it on a freshly built environment."""
    print("Starting training phase...")
    trained_model = train_meta_agent()

    print("\nStarting backtesting phase...")
    # A new, non-vectorized environment is used for the backtest
    backtest_meta_agent(trained_model, CustomMetaEnv())

    print("Pipeline completed.")


# Run the whole pipeline when the cell/module is executed directly.
if __name__ == "__main__":
    run_pipeline()

# SUPER

In [None]:
import os

def create_super_folder():
    """
    Make sure a folder named SUPER exists in the current working directory.

    Parameters:
    None

    Returns:
    None: The folder is created if missing; any error is printed, not raised
    """
    folder_name = "SUPER"
    try:
        # exist_ok=True makes the call a no-op when the folder is already there
        os.makedirs(folder_name, exist_ok=True)
    except Exception as err:
        print(f"Error creating SUPER folder: {err}")
        return

    print(f"Created/Verified folder: {folder_name}")
    print("SUPER folder created successfully.")


# Create the folder when the cell/module is executed directly.
if __name__ == "__main__":
    create_super_folder()

In [None]:
import os
import shutil

def _copy_matching_csvs(source_dir, dest_dir, prefix, required_text):
    """Copy CSVs of source_dir whose name contains required_text into dest_dir as prefix+name; return the failure count."""
    failures = 0
    # sorted() only makes the log order deterministic
    for filename in sorted(os.listdir(source_dir)):
        if not filename.endswith(".csv") or required_text not in filename:
            continue
        src_path = os.path.join(source_dir, filename)
        dest_path = os.path.join(dest_dir, f"{prefix}{filename}")
        try:
            shutil.copy2(src_path, dest_path)
            print(f"Copied {src_path} to {dest_path}")
        except OSError as e:
            # One failed copy must not stop the remaining files
            failures += 1
            print(f"Error copying {src_path} to {dest_path}: {e}")
    return failures


def copy_backtests_to_super():
    """
    Copy backtest files from Meta/NLP_obs and Meta/Metrics_obs to the SUPER folder.

    Only CSV files whose name contains 'backtest_meta_seed_1' are copied, for
    both sources; they are stored as 'nlp_<name>' and 'metrics_<name>'.
    Other CSVs in the source folders are left out.

    Parameters:
    None

    Returns:
    None: Copies files to the SUPER folder; problems are printed, not raised
    """
    # Define source and destination directories
    nlp_source_dir = "Meta/NLP_obs"
    metrics_source_dir = "Meta/Metrics_obs"
    dest_dir = "SUPER"
    backtest_marker = "backtest_meta_seed_1"

    # Validate source directories
    for source_dir in (nlp_source_dir, metrics_source_dir):
        if not os.path.exists(source_dir):
            print(f"Source directory {source_dir} does not exist.")
            return

    # Validate destination directory
    if not os.path.exists(dest_dir):
        print(f"Destination directory {dest_dir} does not exist. Please create it first.")
        return

    try:
        failures = _copy_matching_csvs(nlp_source_dir, dest_dir, "nlp_", backtest_marker)
        failures += _copy_matching_csvs(metrics_source_dir, dest_dir, "metrics_", backtest_marker)
    except OSError as e:
        # Raised by os.listdir, e.g. when a source path is not a directory
        print(f"Error during copy process: {e}")
        return

    if failures:
        print(f"Copy process completed with {failures} error(s).")
    else:
        print("Copy process completed successfully.")


# Copy the files when the cell/module is executed directly.
if __name__ == "__main__":
    copy_backtests_to_super()

In [None]:
import os

# Files in SUPER that should be removed if they are present.
files_to_delete = [
    "SUPER/nlp_ddpg_merged.csv",
    "SUPER/nlp_nlp_obs_clean.csv",
    "SUPER/nlp_nlp_obs_unclean.csv",
    "SUPER/nlp_ppo_merged.csv",
    "SUPER/nlp_td3_merged.csv",
    "SUPER/nlp_sac_merged.csv",
]

for file in files_to_delete:
    # Report and move on when the file is not there
    if not os.path.exists(file):
        print(f"Not found: {file}")
        continue
    os.remove(file)
    print(f"Deleted: {file}")


In [None]:
import pandas as pd
import os

def concatenate_super_backtests(super_dir="SUPER", output_file="SUPER/super_weights_clean.csv"):
    """
    Join the two backtest CSV files in the SUPER folder on 'month', keeping only the weight columns.

    The weight columns of the NLP file get the suffix '_nlp', those of the
    Metrics file the suffix '_metrics'. Rows are matched by their 'month'
    value (not by row position), in the order of the NLP file; months present
    in only one file are dropped with a warning.

    Parameters:
    super_dir (str): Directory containing the backtest CSV files (SUPER)
    output_file (str): Path to save the joined and cleaned CSV file (SUPER/super_weights_clean.csv)

    Saves:
    - CSV file with 'month' and the suffixed weight columns in output_file

    Any error is printed, not raised.
    """
    try:
        # Define the input files
        nlp_file = os.path.join(super_dir, "nlp_backtest_meta_seed_1.csv")
        metrics_file = os.path.join(super_dir, "metrics_backtest_meta_seed_1.csv")

        # Validate input files
        if not os.path.exists(nlp_file):
            raise FileNotFoundError(f"NLP backtest file {nlp_file} not found.")
        if not os.path.exists(metrics_file):
            raise FileNotFoundError(f"Metrics backtest file {metrics_file} not found.")

        # Read the CSV files
        nlp_df = pd.read_csv(nlp_file)
        metrics_df = pd.read_csv(metrics_file)

        # Expected number of rows (263 months: Feb 2003 to Dec 2024)
        expected_rows = 263
        if len(nlp_df) != expected_rows or len(metrics_df) != expected_rows:
            print(f"Warning: Expected {expected_rows} rows. NLP has {len(nlp_df)}, Metrics has {len(metrics_df)}.")

        # Keep only 'month' and weight columns, tagged with their source
        weight_cols = [col for col in nlp_df.columns if col.startswith("weight_")]
        nlp_df = nlp_df[['month'] + weight_cols].rename(
            columns={col: f"{col}_nlp" for col in weight_cols}
        )
        metrics_df = metrics_df[['month'] + weight_cols].rename(
            columns={col: f"{col}_metrics" for col in weight_cols}
        )

        # Join on 'month' so rows stay aligned even if the two files are
        # ordered differently; validate rejects duplicated months, which
        # would otherwise multiply rows
        concatenated_df = nlp_df.merge(metrics_df, on='month', how='inner', validate='one_to_one')

        dropped = max(len(nlp_df), len(metrics_df)) - len(concatenated_df)
        if dropped:
            print(f"Warning: {dropped} month(s) are not present in both files and were dropped.")

        # Verify the number of columns (should be 1 + (14 weights × 2 sources) = 29)
        expected_columns = 1 + (14 * 2)
        if len(concatenated_df.columns) != expected_columns:
            print(f"Warning: Concatenated DataFrame has {len(concatenated_df.columns)} columns, expected {expected_columns}.")

        # Save the result; dirname is '' for a bare filename, which os.makedirs rejects
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        concatenated_df.to_csv(output_file, index=False)
        print(f"Concatenated and cleaned CSV saved to {output_file} with {len(concatenated_df)} rows and {len(concatenated_df.columns)} columns.")

        print("Concatenation process completed successfully.")

    except Exception as e:
        print(f"Error during concatenation process: {e}")


# Build the combined file when the cell/module is executed directly.
if __name__ == "__main__":
    concatenate_super_backtests()

In [None]:
import pandas as pd
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import os

# Ticker symbols used by the environment and the backtest in this cell.
# The order matters: position i of the action/weight vector refers to
# TICKERS[i].
TICKERS = [
    'GC=F',
    'SI=F',
    '^DJI',
    '^IXIC',
    'CL=F',
    '^GSPC',
    '^STOXX50E',
    '^FCHI',
    '^FTSE',
    '^HSI',
    '000001.SS',
    '^KS11',
    '^BSESN',
    '^NSEI',
]

class CustomSuperMetaEnv(gym.Env):
    """Custom Gym environment for super meta-agent portfolio optimization.

    One step is one month listed in ``csv_path``. The observation is that
    month's row (every column after 'month'), the action holds one logit per
    ticker that is turned into allocation weights with a softmax, and the
    reward is ``2 * roi - 0.7 * volatility - 0.5 * mdd`` computed from the
    daily prices of that month.

    NOTE(review): the portfolio value is ``sum(weight * price)`` on the raw
    price columns, so tickers with large price levels dominate unless the
    price file is already normalised per ticker - TODO confirm.
    """

    def __init__(self, csv_path="SUPER/super_weights_clean.csv", price_path="metrics_used/price/clean_data.csv"):
        """
        Load the observations and the prices and define the spaces.

        Parameters:
        csv_path (str): CSV with a 'month' column followed by the observation columns
        price_path (str): Price CSV indexed by a 'Date' column, one column per ticker

        Raises:
        ValueError: If a ticker is missing from the price data or the
            observation CSV has no rows
        """
        super().__init__()

        # Load observation data: first column is the month, the rest is the observation vector
        df = pd.read_csv(csv_path)
        self.months = df['month'].tolist()
        self.observations = df.iloc[:, 1:].astype(np.float32).values

        # Load price data; a sorted index keeps the per-month date slice in
        # step() valid even when the CSV rows are not in chronological order
        self.price_data = pd.read_csv(price_path, index_col="Date", parse_dates=True).sort_index()
        self.tickers = TICKERS

        # Validate tickers
        for ticker in self.tickers:
            if ticker not in self.price_data.columns:
                raise ValueError(f"Ticker {ticker} not in price data")

        self.total_steps = len(self.months)
        if self.total_steps == 0:
            # Fail early with a clear message instead of an IndexError in reset()
            raise ValueError(f"No observation rows found in {csv_path}")

        # Derive the shapes from the data so the declared spaces always match
        # what reset()/step() actually return
        self.obs_dim = self.observations.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32)
        # Finite bounds on the logits
        self.action_space = spaces.Box(low=-10, high=10, shape=(len(self.tickers),), dtype=np.float32)

        self.current_step = 0

    @staticmethod
    def _softmax(action):
        """Return softmax(action); the max is subtracted first so large logits cannot overflow."""
        logits = np.asarray(action)
        exp_logits = np.exp(logits - np.max(logits))
        return exp_logits / np.sum(exp_logits)

    @staticmethod
    def _finite_or_zero(value):
        """Return value as a float, or 0.0 when it is NaN/inf, so the reward stays finite."""
        value = float(value)
        return value if np.isfinite(value) else 0.0

    def reset(self, seed=None, options=None):
        """
        Start a new episode at the first month.

        Parameters:
        seed (int or None): Seed forwarded to gym.Env.reset and to NumPy's global RNG
        options (dict or None): Accepted for Gymnasium API compatibility; not used

        Returns:
        tuple: (first observation, empty info dict)
        """
        super().reset(seed=seed)
        if seed is not None:
            # Kept from the original implementation, which seeded the global RNG
            np.random.seed(seed)
        self.current_step = 0
        obs = self.observations[0]
        return obs, {}

    def step(self, action):
        """
        Apply the allocation for the current month and move to the next one.

        Parameters:
        action (array-like): One logit per ticker

        Returns:
        tuple: (next_obs, reward, terminated, truncated, info) where info
            holds 'allocation_month', 'roi', 'volatility' and 'mdd'

        Raises:
        ValueError: If called after the last month has been processed
        """
        if self.current_step >= self.total_steps:
            raise ValueError("Episode has ended.")

        # Normalize action to sum to 1 using softmax
        weights = self._softmax(action)

        # Current month and its first/last calendar day
        month = self.months[self.current_step]
        start_date = pd.to_datetime(f"{month}-01")
        end_date = start_date + pd.offsets.MonthEnd(0)

        # Extract daily prices; a date slice returns an empty frame (it does
        # not raise) when the month has no rows, so both cases are handled
        try:
            daily_prices = self.price_data.loc[start_date:end_date, self.tickers]
        except KeyError:
            daily_prices = pd.DataFrame(columns=self.tickers)

        if daily_prices.empty:
            print(f"Warning: Missing price data for {start_date} to {end_date}. Using zeros.")
            roi = volatility = mdd = 0.0
        else:
            # Daily portfolio values
            daily_values = (daily_prices * weights).sum(axis=1)

            # ROI over the month
            first_value = daily_values.iloc[0]
            roi = (daily_values.iloc[-1] / first_value - 1) if first_value > 0 else 0

            # Volatility needs at least two returns for a sample standard deviation
            daily_returns = daily_values.pct_change().dropna()
            volatility = daily_returns.std() if len(daily_returns) > 1 else 0

            # Maximum Drawdown (MDD); 0/0 yields NaN, which is mapped to 0 below
            cummax = daily_values.cummax()
            drawdown = (cummax - daily_values) / cummax
            mdd = drawdown.max()

            roi = self._finite_or_zero(roi)
            volatility = self._finite_or_zero(volatility)
            mdd = self._finite_or_zero(mdd)

        # Compute reward
        reward = 2 * roi - 0.7 * volatility - 0.5 * mdd

        # Advance step
        self.current_step += 1
        terminated = self.current_step >= self.total_steps
        truncated = False

        # Next observation; zeros (same dtype/shape as the space) after the last month
        if terminated:
            next_obs = np.zeros(self.obs_dim, dtype=np.float32)
        else:
            next_obs = self.observations[self.current_step]

        info = {"allocation_month": month, "roi": roi, "volatility": volatility, "mdd": mdd}
        return next_obs, reward, terminated, truncated, info

    def render(self):
        """No rendering is implemented."""
        pass

def train_super_meta_agent():
    """
    Train a PPO super meta-agent on CustomSuperMetaEnv and save it.

    Uses a single vectorized environment, seed 1, a 256-256-256 MLP and
    1000 training timesteps; the model is saved to
    results/super_meta_agent_seed1.

    Returns:
    PPO: The trained model
    """
    def _build_env():
        # Factory handed to make_vec_env; builds the env with its defaults
        return CustomSuperMetaEnv()

    vec_env = make_vec_env(_build_env, n_envs=1, seed=1)

    ppo_settings = {
        "policy_kwargs": {'net_arch': [256, 256, 256]},
        "verbose": 1,
        "seed": 1,
    }
    agent = PPO("MlpPolicy", vec_env, **ppo_settings)
    agent.learn(total_timesteps=1000)

    # Persist the trained model
    save_dir = "results"
    os.makedirs(save_dir, exist_ok=True)
    model_path = "results/super_meta_agent_seed1"
    agent.save(model_path)
    print(f"Super meta-agent trained and saved to {model_path}.zip")
    return agent

def backtest_super_meta_agent(model, env, output_file="SUPER/backtest_super_meta_seed_1.csv"):
    """
    Backtest the trained super meta-agent over one episode and save the results.

    Parameters:
    model: Object with a predict(obs, deterministic=True) method returning (action, state)
    env: Environment exposing months, current_step, reset() and step(); its
        'tickers' attribute names the weight columns (falls back to TICKERS)
    output_file (str): Path of the CSV to write

    Saves:
    - One row per month with 'month', one 'weight_<ticker>' column per ticker,
      'roi', 'volatility', 'mdd' and 'reward'
    """
    # Use the ticker order of the environment that actually consumes the action
    tickers = env.tickers if hasattr(env, "tickers") else TICKERS

    obs, _ = env.reset()
    done = False
    records = []

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        # Normalize action to sum to 1 using softmax (max subtracted to avoid overflow)
        logits = np.asarray(action)
        exp_logits = np.exp(logits - np.max(logits))
        weights = exp_logits / np.sum(exp_logits)

        # Read the month before step() advances current_step
        record = {"month": env.months[env.current_step]}
        for i, ticker in enumerate(tickers):
            record[f"weight_{ticker}"] = weights[i]

        obs, reward, terminated, truncated, info = env.step(action)
        # Either flag ends the episode; ignoring 'truncated' could loop forever
        done = terminated or truncated

        record.update({
            "roi": info["roi"],
            "volatility": info["volatility"],
            "mdd": info["mdd"],
            "reward": reward
        })
        records.append(record)

    # Save backtest results
    df = pd.DataFrame(records)
    output_dir = os.path.dirname(output_file)
    if output_dir:
        # dirname is '' for a bare filename, which os.makedirs rejects
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_file, index=False)
    print(f"Backtest results saved to {output_file} with {len(df)} rows and {len(df.columns)} columns.")

def run_pipeline():
    """Train the super meta-agent, then backtest it on a freshly built environment."""
    print("Starting training phase...")
    trained_model = train_super_meta_agent()

    print("\nStarting backtesting phase...")
    # A new, non-vectorized environment is used for the backtest
    backtest_super_meta_agent(trained_model, CustomSuperMetaEnv())

    print("Pipeline completed.")


# Run the whole pipeline when the cell/module is executed directly.
if __name__ == "__main__":
    run_pipeline()

In [None]:
import os
import shutil

def organize_all_backtests():
    """
    Gather every backtest CSV under ALL_BACKTEST.

    Creates ALL_BACKTEST with the subfolders Meta, Super, Metrics and NLP, then copies:
    - backtest_results/<agent>/seed_<n>.csv      -> ALL_BACKTEST/Metrics/<agent>/seed_<n>.csv
    - backtest_results_NLP/<agent>/seed_<n>.csv  -> ALL_BACKTEST/NLP/<agent>/seed_<n>.csv
    - Meta/Metrics_obs/backtest_meta_seed_1.csv  -> ALL_BACKTEST/Meta/meta/seed_1.csv
    - Meta/NLP_obs/backtest_meta_seed_1.csv      -> ALL_BACKTEST/Meta/meta/seed_1_nlp.csv
    - SUPER/backtest_super_meta_seed_1.csv       -> ALL_BACKTEST/Super/super_meta/seed_1.csv
    Missing sources are reported and skipped.

    Parameters:
    None

    Returns:
    None: Creates the folder structure and copies files; errors are printed, not raised
    """
    try:
        # Main directory and its four subfolders
        root = "ALL_BACKTEST"
        os.makedirs(root, exist_ok=True)
        print(f"Created/Verified folder: {root}")

        for group in ("Meta", "Super", "Metrics", "NLP"):
            group_path = os.path.join(root, group)
            os.makedirs(group_path, exist_ok=True)
            print(f"Created/Verified folder: {group_path}")

        agent_names = ("ppo", "sac", "ddpg", "td3")
        seed_ids = (1, 2, 3, 4, 5)

        # Per-agent, per-seed files of the Metrics and NLP pipelines
        pipelines = (
            ("Metrics", "backtest_results"),
            ("NLP", "backtest_results_NLP"),
        )
        for label, source_root in pipelines:
            if not os.path.exists(source_root):
                print(f"{label} source directory {source_root} does not exist.")
                continue
            for agent in agent_names:
                target_dir = os.path.join(root, label, agent)
                os.makedirs(target_dir, exist_ok=True)
                for seed in seed_ids:
                    file_name = f"seed_{seed}.csv"
                    origin = os.path.join(source_root, agent, file_name)
                    if not os.path.exists(origin):
                        print(f"File {origin} does not exist. Skipping.")
                        continue
                    target = os.path.join(target_dir, file_name)
                    shutil.copy2(origin, target)
                    print(f"Copied {origin} to {target}")

        # Single-file pipelines: (label, source file, destination folder, destination name)
        meta_target_dir = os.path.join(root, "Meta", "meta")
        single_files = (
            ("Meta", os.path.join("Meta/Metrics_obs", "backtest_meta_seed_1.csv"), meta_target_dir, "seed_1.csv"),
            ("Meta", os.path.join("Meta/NLP_obs", "backtest_meta_seed_1.csv"), meta_target_dir, "seed_1_nlp.csv"),
            ("Super", os.path.join("SUPER", "backtest_super_meta_seed_1.csv"), os.path.join(root, "Super", "super_meta"), "seed_1.csv"),
        )
        for label, origin, target_dir, target_name in single_files:
            os.makedirs(target_dir, exist_ok=True)
            if os.path.exists(origin):
                target = os.path.join(target_dir, target_name)
                shutil.copy2(origin, target)
                print(f"Copied {origin} to {target}")
            else:
                print(f"{label} backtest file {origin} does not exist. Skipping.")

        print("Backtest organization process completed successfully.")

    except Exception as err:
        print(f"Error during backtest organization process: {err}")


# Organize the files when the cell/module is executed directly.
if __name__ == "__main__":
    organize_all_backtests()

In [None]:
import pandas as pd
import os

def create_roi_matrix(all_backtest_dir="ALL_BACKTEST", output_file="ALL_BACKTEST/ROI.csv", expected_rows=261):
    """
    Create a matrix with rows as months and columns as models and seeds,
    where each cell contains the ROI of the model at that month.

    Parameters:
    all_backtest_dir (str): Directory containing the backtest files (ALL_BACKTEST)
    output_file (str): Path to save the ROI matrix (ALL_BACKTEST/ROI.csv)
    expected_rows (int or None): Row count a backtest file must have to be
        included; files with another length are skipped with a warning.
        Pass None to accept any length.

    Saves:
    - ROI matrix as a CSV file in output_file (nothing is written when no
      input file qualifies)

    Any error is printed, not raised.
    """
    try:
        agents = ["ppo", "sac", "ddpg", "td3"]
        seeds = [1, 2, 3, 4, 5]

        # (file path, column identifier) for every backtest: Metrics and NLP
        # agents first, then the single-seed Meta and Super files
        all_files = []
        for pipeline in ("Metrics", "NLP"):
            for agent in agents:
                for seed in seeds:
                    all_files.append(
                        (f"{all_backtest_dir}/{pipeline}/{agent}/seed_{seed}.csv", f"{pipeline}_{agent}_{seed}")
                    )
        all_files += [
            # Meta Metrics
            (f"{all_backtest_dir}/Meta/meta/seed_1.csv", "Meta_meta_metrics_1"),
            # Meta NLP
            (f"{all_backtest_dir}/Meta/meta/seed_1_nlp.csv", "Meta_meta_nlp_1"),
            # Super
            (f"{all_backtest_dir}/Super/super_meta/seed_1.csv", "Super_super_meta_1"),
        ]

        # Read each usable file and tag its rows with the model identifier
        dfs = []
        for file_path, identifier in all_files:
            if not os.path.exists(file_path):
                print(f"File {file_path} does not exist. Skipping.")
                continue

            df = pd.read_csv(file_path)
            # Validate the number of rows
            if expected_rows is not None and len(df) != expected_rows:
                print(f"Warning: {file_path} has {len(df)} rows, expected {expected_rows}. Skipping.")
                continue
            # Validate the presence of required columns
            if 'month' not in df.columns or 'roi' not in df.columns:
                print(f"Warning: 'month' or 'roi' column not found in {file_path}. Skipping.")
                continue

            dfs.append(df[['month', 'roi']].assign(model_seed=identifier))

        if not dfs:
            print("No dataframes to concatenate. Exiting.")
            return

        # Long format -> one column per model, one row per month, sorted by month
        combined_df = pd.concat(dfs, ignore_index=True)
        roi_matrix = combined_df.pivot(index='month', columns='model_seed', values='roi')
        roi_matrix = roi_matrix.sort_index()

        # Save to CSV; dirname is '' for a bare filename, which os.makedirs rejects
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        roi_matrix.to_csv(output_file)
        print(f"ROI matrix saved to {output_file} with {len(roi_matrix)} rows and {len(roi_matrix.columns)} columns.")

        print("ROI matrix creation completed successfully.")

    except Exception as e:
        print(f"Error during ROI matrix creation: {e}")


# Build the ROI matrix when the cell/module is executed directly.
if __name__ == "__main__":
    create_roi_matrix()

In [None]:
import pandas as pd
import os
import numpy as np

def create_portfolio_value_matrix(all_backtest_dir="ALL_BACKTEST", output_file="ALL_BACKTEST/portfolio_value.csv", expected_rows=261):
    """
    Create a matrix with rows as months and columns as models and seeds,
    where each cell contains the cumulative portfolio value of the model at that month.

    The cumulative value is the running product of (1 + roi), taken in the row
    order of each backtest file (rows are not re-sorted before compounding).

    Parameters:
    all_backtest_dir (str): Directory containing the backtest files (default: ALL_BACKTEST)
    output_file (str): Path to save the portfolio value matrix
        (default: ALL_BACKTEST/portfolio_value.csv). A bare file name is
        written to the current working directory.
    expected_rows (int or None): Number of rows a backtest file must have to be
        included (default: 261). Pass None to disable the check.

    Saves:
    - Portfolio value matrix as a CSV file in output_file

    Notes:
    - Missing files, files with an unexpected row count and files without the
      'month' / 'roi' columns are skipped with a printed message.
    - Any exception is caught and printed; the function always returns None.
    """
    try:
        agents = ["ppo", "sac", "ddpg", "td3"]
        seeds = [1, 2, 3, 4, 5]

        # (file path, column identifier) pairs: every agent/seed run of the
        # Metrics and NLP families, followed by the single meta and super runs.
        all_files = [
            (f"{all_backtest_dir}/{family}/{agent}/seed_{seed}.csv", f"{family}_{agent}_{seed}")
            for family in ["Metrics", "NLP"]
            for agent in agents
            for seed in seeds
        ] + [
            # Meta Metrics
            (f"{all_backtest_dir}/Meta/meta/seed_1.csv", "Meta_meta_metrics_1"),
            # Meta NLP
            (f"{all_backtest_dir}/Meta/meta/seed_1_nlp.csv", "Meta_meta_nlp_1"),
            # Super
            (f"{all_backtest_dir}/Super/super_meta/seed_1.csv", "Super_super_meta_1"),
        ]

        # Read each usable file and compute its cumulative ROI
        dfs = []
        for file_path, identifier in all_files:
            if not os.path.exists(file_path):
                print(f"File {file_path} does not exist. Skipping.")
                continue

            df = pd.read_csv(file_path)

            # Validate the number of rows
            if expected_rows is not None and len(df) != expected_rows:
                print(f"Warning: {file_path} has {len(df)} rows, expected {expected_rows}. Skipping.")
                continue

            # Validate the presence of required columns
            if 'month' not in df.columns or 'roi' not in df.columns:
                print(f"Warning: 'month' or 'roi' column not found in {file_path}. Skipping.")
                continue

            # Compound the per-row returns into a growth factor
            df['cumulative_roi'] = (1 + df['roi']).cumprod()
            df['model_seed'] = identifier
            dfs.append(df[['month', 'cumulative_roi', 'model_seed']])

        if not dfs:
            print("No dataframes to concatenate. Exiting.")
            return

        combined_df = pd.concat(dfs, ignore_index=True)

        # One row per month, one column per model/seed identifier
        portfolio_value_matrix = combined_df.pivot(index='month', columns='model_seed', values='cumulative_roi')

        # Sort the index (months)
        portfolio_value_matrix = portfolio_value_matrix.sort_index()

        # Save to CSV. os.makedirs("") raises, so only create the parent
        # directory when output_file actually has one.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        portfolio_value_matrix.to_csv(output_file)
        print(f"Portfolio value matrix saved to {output_file} with {len(portfolio_value_matrix)} rows and {len(portfolio_value_matrix.columns)} columns.")

        print("Portfolio value matrix creation completed successfully.")

    except Exception as e:
        # Top-level reporting boundary for the notebook cell: report, don't raise.
        print(f"Error during portfolio value matrix creation: {e}")

# Entry point: build the portfolio value matrix with the default directory and
# output path when this code is executed as the main module.
if __name__ == "__main__":
    create_portfolio_value_matrix()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

def plot_portfolio_value_evolution(csv_file="ALL_BACKTEST/portfolio_value.csv", output_file="ALL_BACKTEST/portfolio_value_evolution.png", expected_rows=263):
    """
    Plot the portfolio value evolution for all model-seed combinations in a single graph.

    Parameters:
    csv_file (str): Path to the portfolio value CSV file (must contain a 'month' column, used as the index)
    output_file (str): Path to save the plot image. A bare file name is
        written to the current working directory.
    expected_rows (int or None): Row count the CSV is expected to have (default: 263).
        A mismatch only prints a warning; pass None to disable the check.

    Saves:
    - Plot image as a PNG file

    Notes:
    - Any exception (including a missing csv_file) is caught and printed;
      the function always returns None.
    """
    try:
        # Check if the CSV file exists
        if not os.path.exists(csv_file):
            raise FileNotFoundError(f"The file {csv_file} does not exist. Please generate it using create_portfolio_value_matrix.py.")

        # Read the CSV file
        df = pd.read_csv(csv_file, index_col="month")

        # Validate the number of rows (warning only, plotting continues).
        # NOTE(review): create_portfolio_value_matrix only accepts input files
        # with 261 rows; confirm 263 is the intended size of the combined matrix.
        if expected_rows is not None and len(df) != expected_rows:
            print(f"Warning: {csv_file} has {len(df)} rows, expected {expected_rows}.")

        # Create the plot: one line per model/seed column
        plt.figure(figsize=(15, 10))
        for column in df.columns:
            plt.plot(df.index, df[column], label=column)

        # Customize the plot
        plt.xlabel("Month")
        plt.ylabel("Portfolio Value")
        plt.title("Portfolio Value Evolution for All Models and Seeds")
        # Legend sits to the right of the axes, outside the plotting area
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=8)
        plt.xticks(rotation=45)
        plt.grid(True)
        plt.tight_layout()

        # Save the plot. os.makedirs("") raises, so only create the parent
        # directory when output_file actually has one.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        plt.savefig(output_file)
        print(f"Plot saved to {output_file}")

        # Show the plot (optional, for interactive environments)
        plt.show()

        print("Plotting completed successfully.")

    except Exception as e:
        # Top-level reporting boundary for the notebook cell: report, don't raise.
        print(f"Error during plotting: {e}")

# Entry point: draw the combined portfolio value plot with the default input
# and output paths when this code is executed as the main module.
if __name__ == "__main__":
    plot_portfolio_value_evolution()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

def plot_model_seeds(df, models, title, output_file):
    """
    Draw one line per matching model/seed column and write the figure to disk.

    Parameters:
    df (DataFrame): Portfolio values; the index is used as the x-axis
    models (list): Column-name prefixes; every column starting with one of them is plotted
    title (str): Title shown above the plot
    output_file (str): Path the figure is saved to

    Saves:
    - Plot image at output_file (the figure is closed afterwards)
    """
    plt.figure(figsize=(12, 8))

    # Prefixes are handled in the order given, columns in DataFrame order.
    selected = [name for prefix in models for name in df.columns if name.startswith(prefix)]
    for name in selected:
        plt.plot(df.index, df[name], label=name)

    plt.title(title)
    plt.xlabel("Month")
    plt.ylabel("Portfolio Value")
    # Legend sits to the right of the axes, outside the plotting area
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=8)
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.savefig(output_file)
    plt.close()
    print(f"Plot saved to {output_file}")

def plot_meta_super(df, title, output_file):
    """
    Draw one line per meta/super agent column and write the figure to disk.

    Parameters:
    df (DataFrame): Portfolio values; the index is used as the x-axis and every
        column whose name starts with 'Meta_' or 'Super_' is plotted
    title (str): Title shown above the plot
    output_file (str): Path the figure is saved to

    Saves:
    - Plot image at output_file (the figure is closed afterwards)
    """
    plt.figure(figsize=(12, 8))

    for name in df.columns:
        # Skip everything that is neither a meta nor a super agent column
        if not name.startswith(('Meta_', 'Super_')):
            continue
        plt.plot(df.index, df[name], label=name)

    plt.title(title)
    plt.xlabel("Month")
    plt.ylabel("Portfolio Value")
    # Legend sits to the right of the axes, outside the plotting area
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=8)
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.savefig(output_file)
    plt.close()
    print(f"Plot saved to {output_file}")

def create_plots(csv_file=None, plot_dir="ALL_BACKTEST/plot"):
    """
    Create and save plots for portfolio value evolution in specified time periods,
    with 2018-2024 plots normalized to start at 1 in January 2018.

    Parameters:
    csv_file (str or None): Path to the portfolio value CSV file (must contain a
        'month' column, used as the index). When None, "SUPER/portfolio_value.csv"
        is used if it exists, otherwise "ALL_BACKTEST/portfolio_value.csv"
        (the default output path of create_portfolio_value_matrix).
    plot_dir (str): Directory under which the "2003-2024" and "2018-2024"
        sub-directories are created (default: ALL_BACKTEST/plot)

    Saves:
    - In each sub-directory: one PNG per model (ppo, sac, td3, ddpg) covering its
      Metrics and NLP seeds, plus one PNG for the meta and super agents

    Raises:
    FileNotFoundError: If no portfolio value CSV file is found
    ValueError: If the month '2018-01' is not present in the data
    """
    # Resolve the input file before touching the file system, so a missing
    # input does not leave empty plot directories behind.
    if csv_file is None:
        candidates = ("SUPER/portfolio_value.csv", "ALL_BACKTEST/portfolio_value.csv")
    else:
        candidates = (csv_file,)
    csv_path = next((path for path in candidates if os.path.exists(path)), None)
    if csv_path is None:
        raise FileNotFoundError(f"File {' or '.join(candidates)} not found.")

    df = pd.read_csv(csv_path, index_col="month")

    # Verify that '2018-01' exists in the index
    if '2018-01' not in df.index:
        raise ValueError("The date '2018-01' is not found in the data.")

    # Keep 2018-01 onwards and rescale every column by its January 2018 value,
    # so each series starts at 1 in the recent-period plots.
    df_recent = df.loc['2018-01':]
    normalized_df = df_recent / df_recent.iloc[0]

    # Create the output directories if they don't exist
    full_period_dir = os.path.join(plot_dir, "2003-2024")
    recent_period_dir = os.path.join(plot_dir, "2018-2024")
    os.makedirs(full_period_dir, exist_ok=True)
    os.makedirs(recent_period_dir, exist_ok=True)

    # Define models to plot
    models = ["ppo", "sac", "td3", "ddpg"]

    # (data, output directory, file-name tag, title suffix) for each period
    periods = [
        (df, full_period_dir, "2003-2024", "2003-2024"),
        (normalized_df, recent_period_dir, "2018-2024", "2018-2024, Normalized to Start at 1 in Jan 2018"),
    ]

    for data, period_dir, tag, label in periods:
        # One plot per model, combining its Metrics and NLP seeds
        for model in models:
            plot_model_seeds(
                data,
                [f"Metrics_{model}", f"NLP_{model}"],
                f"Portfolio Value Evolution - {model.upper()} Seeds ({label})",
                os.path.join(period_dir, f"{model}_seeds_{tag}.png")
            )

        plot_meta_super(
            data,
            f"Portfolio Value Evolution - Meta and Super Agents ({label})",
            os.path.join(period_dir, f"meta_super_{tag}.png")
        )

    print("All plots generated successfully.")

# Entry point: generate every plot when this code is executed as the main
# module; any exception raised by create_plots() is printed instead of propagating.
if __name__ == "__main__":
    try:
        create_plots()
    except Exception as e:
        print(f"Error: {e}")