In [1]:
import pandas as pd
import numpy as np
import quantstats as qs # Keep for potential later use or plotting, but not core to the request
import warnings
from tqdm.auto import tqdm
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr # Needed for IC calculation

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore', category=UserWarning) # Quantstats might issue user warnings
# --- Function Definitions ---

# Step 1: Load Data (Unchanged - Ensure 'open', 'high', 'close', 'pct_chg' exist)
def load_data(cb_path, index_path):
    """Loads CB and index data, ensures DatetimeIndex."""
    print("--- Step 1: Loading Data ---")
    try:
        df = pd.read_parquet(cb_path)
        index_df = pd.read_parquet(index_path) # Index data might not be needed unless comparing results later

        # Ensure index_df has DatetimeIndex (optional but good practice)
        if index_df is not None and not isinstance(index_df.index, pd.DatetimeIndex):
            index_df.index = pd.to_datetime(index_df.index)

        # Ensure df has correct MultiIndex with DatetimeIndex for trade_date
        required_levels = ['code', 'trade_date']
        if all(level in df.index.names for level in required_levels):
            date_level_idx = df.index.names.index('trade_date')
            if not isinstance(df.index.levels[date_level_idx], pd.DatetimeIndex):
                 df.index = df.index.set_levels(pd.to_datetime(df.index.levels[date_level_idx]), level='trade_date')
        else: # Try setting index if columns exist
            if all(col in df.columns for col in required_levels):
                 df['trade_date'] = pd.to_datetime(df['trade_date'])
                 df = df.set_index(required_levels)
            else: raise ValueError("CB data missing 'code' or 'trade_date' for index.")

        # --- Add check for required return calculation columns ---
        required_cols = ['open', 'high', 'close', 'pct_chg'] # pct_chg is raw % change from prev close to current close
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
             raise ValueError(f"Required columns for return calculation missing: {missing_cols}")
        # --- End Check ---

        print(f"Loaded CB data shape: {df.shape}")
        if index_df is not None:
            print(f"Loaded Index data shape: {index_df.shape}")
        return df, index_df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None, None

# Step 2: Filter Data (Unchanged - But ensure factors needed are just the ones to analyze)
def filter_data(df, start_date, end_date, filter_rules, factors_to_analyze):
    """Applies date range and custom filters."""
    print("--- Step 2: Filtering Data ---")
    if df is None: return None

    # Check if factor columns exist BEFORE filtering
    missing_factors = [f for f in factors_to_analyze if f not in df.columns]
    if missing_factors:
        print(f"Error: Required factor columns missing from data: {missing_factors}")
        return None

    # Date filtering
    try:
        if 'trade_date' not in df.index.names:
             raise KeyError("'trade_date' not found in DataFrame index levels.")
        trade_date_level = df.index.get_level_values('trade_date')
        date_mask = (trade_date_level >= start_date) & (trade_date_level <= end_date)
        df_filtered = df[date_mask].copy()
        if df_filtered.empty: raise ValueError("No data remaining after date filtering.")
        print(f"Filtered by date: {start_date} to {end_date}. Shape: {df_filtered.shape}")
    except Exception as e:
        print(f"Error during date filtering: {e}")
        return None

    # Apply standard filters (Redemption, Listing days)
    df_filtered['filter_out'] = False
    redeem_statuses = ['已公告强赎', '公告到期赎回', '公告实施强赎', '公告提示强赎', '已满足强赎条件']
    if 'is_call' in df_filtered.columns: df_filtered.loc[df_filtered['is_call'].isin(redeem_statuses), 'filter_out'] = True
    if 'list_days' in df_filtered.columns: df_filtered.loc[df_filtered['list_days'] <= 3, 'filter_out'] = True

    # Apply custom filters
    print("Applying custom filters...")
    for rule in filter_rules:
        try:
            print(f" - Applying: {rule}")
            matching_indices = df_filtered.query(rule).index
            df_filtered.loc[matching_indices, 'filter_out'] = True
        except Exception as e:
            print(f"  - Warning: Could not apply filter rule '{rule}'. Error: {e}")

    # --- IMPORTANT: Filter out rows where any needed factor is NaN before calculating returns ---
    # We need valid factor values *today* to correlate with *tomorrow's* return
    print("Filtering rows with NaN in factor values...")
    nan_mask = df_filtered[factors_to_analyze].isna().any(axis=1)
    df_filtered.loc[nan_mask, 'filter_out'] = True

    eligible_count = len(df_filtered[~df_filtered['filter_out']])
    print(f"Filtering complete. Eligible bond-days: {eligible_count}")
    if eligible_count == 0:
        print("Warning: No bonds eligible after applying all filters.")
    return df_filtered

# --- NEW Step 3: Calculate Multiple Forward Returns ---
def calculate_multiple_fwd_returns(df, pulse_percentages):
    """
    Calculates various next-day return metrics for each bond.
    - 'fwd_ret_close': Raw percentage change from current close to next close.
    - 'fwd_ret_pulse_X': Return based on pulse stop-profit logic at X%.
    """
    print("--- Step 3: Calculating Multiple Forward Returns ---")
    if df is None: return None
    required_cols = ['open', 'high', 'close', 'pct_chg']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: Missing required columns for return calc: {required_cols}")
        return None
    if not isinstance(df.index, pd.MultiIndex) or 'code' not in df.index.names:
        print("Error: DataFrame needs MultiIndex with 'code' level for forward returns.")
        return None

    df_with_fwd = df.copy()
    grouped = df_with_fwd.groupby(level='code')

    # Get next day's data
    df_with_fwd['next_open'] = grouped['open'].shift(-1)
    df_with_fwd['next_high'] = grouped['high'].shift(-1)
    df_with_fwd['next_close'] = grouped['close'].shift(-1)
    # Note: 'pct_chg' is the change from T-1 close to T close.
    # We need T close to T+1 close. Calculate manually or use shifted pct_chg.
    # Using shifted pct_chg assumes it represents close-to-close.
    df_with_fwd['fwd_ret_close'] = grouped['pct_chg'].shift(-1)

    # Calculate pulse returns
    current_close = df_with_fwd['close']
    next_open = df_with_fwd['next_open']
    next_high = df_with_fwd['next_high']
    next_close = df_with_fwd['next_close'] # Not strictly needed if using fwd_ret_close
    raw_next_day_ret = df_with_fwd['fwd_ret_close'] # (next_close - current_close) / current_close

    # Handle cases where next day data is missing (last day for a bond)
    valid_next_day = next_open.notna() & next_high.notna() & current_close.notna() & (current_close > 0)

    for pct in pulse_percentages:
        ret_col_name = f'fwd_ret_pulse_{pct:.1f}' # e.g., fwd_ret_pulse_2.5
        stop_profit_pct = pct / 100.0
        threshold_price = current_close * (1 + stop_profit_pct)

        # Initialize return column with NaN
        df_with_fwd[ret_col_name] = np.nan

        # Calculate returns only where next day data is valid
        # Condition 1: Triggered at open
        cond_open_trig = valid_next_day & (next_open >= threshold_price)
        df_with_fwd.loc[cond_open_trig, ret_col_name] = (next_open[cond_open_trig] - current_close[cond_open_trig]) / current_close[cond_open_trig]

        # Condition 2: Triggered intraday (high >= threshold, but open < threshold)
        cond_intra_trig = valid_next_day & ~cond_open_trig & (next_high >= threshold_price)
        df_with_fwd.loc[cond_intra_trig, ret_col_name] = stop_profit_pct

        # Condition 3: Not triggered, use close-to-close return
        cond_no_trig = valid_next_day & ~cond_open_trig & ~cond_intra_trig
        df_with_fwd.loc[cond_no_trig, ret_col_name] = raw_next_day_ret[cond_no_trig] # Use pre-calculated raw return

    # Clean up intermediate columns if desired
    # df_with_fwd = df_with_fwd.drop(columns=['next_open', 'next_high', 'next_close'])

    return_cols = ['fwd_ret_close'] + [f'fwd_ret_pulse_{pct:.1f}' for pct in pulse_percentages]
    nan_counts = df_with_fwd[return_cols].isna().sum()
    print(f"Calculated forward returns. Example NaN counts:\n{nan_counts}")
    return df_with_fwd, return_cols


# --- NEW Step 4: Analyze Factor vs. Each Return Type Relationship (IC/IR) ---
def analyze_factor_return_relationships(df, factors, return_cols):
    """
    Calculates Information Coefficient (IC) and Information Ratio (IR)
    for each factor against each specified forward return column.
    """
    print(f"--- Step 4: Analyzing Factor Relationships with {len(return_cols)} Return Types ---")
    if df is None:
        print("Error: DataFrame is missing.")
        return None
    if not isinstance(df.index, pd.MultiIndex) or 'trade_date' not in df.index.names:
        print("Error: DataFrame needs MultiIndex with 'trade_date' level for IC calc.")
        return None

    all_ic_results = {}
    daily_ic_data = {} # Optional: Store all daily ICs for plotting if needed

    # --- Use eligible rows only ---
    # Filter_out was applied before return calculation.
    # Now, we need rows that were eligible AND have valid return values for the specific return column being analyzed.
    df_eligible_base = df[~df['filter_out']].copy()

    if df_eligible_base.empty:
        print("Warning: No eligible bond-days found based on initial filters.")
        return pd.DataFrame()

    # Iterate through each type of forward return
    for return_col in tqdm(return_cols, desc="Analyzing Return Types"):
        if return_col not in df_eligible_base.columns:
            print(f"Warning: Return column '{return_col}' not found. Skipping.")
            continue

        print(f"\n-- Analyzing Factors vs. Return: '{return_col}' --")
        # Drop rows where *this specific* return is NaN for this analysis pass
        df_analysis = df_eligible_base.dropna(subset=[return_col])

        if df_analysis.empty:
            print(f"Warning: No eligible data with valid '{return_col}' for IC calculation.")
            continue

        # Group by date to calculate daily IC
        grouped = df_analysis.groupby(level='trade_date')
        num_days = len(grouped)
        print(f"Analyzing {num_days} days for '{return_col}'...")

        # Iterate through each factor for the current return type
        for factor in factors:
            if factor not in df_analysis.columns:
                print(f"Warning: Factor '{factor}' not found. Skipping for '{return_col}'.")
                continue

            # Function to safely calculate Spearman correlation per day
            def safe_spearman(group):
                # Drop NaNs *within the group* for this specific factor and return_col
                # Factor NaNs should have been filtered already, but double-check
                group_cleaned = group[[factor, return_col]].dropna()
                if len(group_cleaned) < 5: # Need sufficient pairs for meaningful correlation (adjust as needed)
                    return np.nan
                try:
                    corr, p_val = spearmanr(group_cleaned[factor], group_cleaned[return_col])
                    return corr
                except ValueError: # Handle cases like zero variance
                    return np.nan

            # Apply the function to each day's group
            try:
                daily_ic = grouped.apply(safe_spearman)
                daily_ic_clean = daily_ic.dropna() # Store non-NaN ICs for stats

                # Store daily IC series if needed later
                # daily_ic_data[(factor, return_col)] = daily_ic_clean

                if daily_ic_clean.empty:
                    print(f" - Factor '{factor}': No valid daily ICs calculated.")
                    mean_ic, std_ic, ir, ic_positive_ratio, num_obs = np.nan, np.nan, np.nan, np.nan, 0
                elif len(daily_ic_clean) < 2:
                    # print(f" - Factor '{factor}': Only 1 valid daily IC. Cannot calculate Std Dev/IR.")
                    mean_ic = daily_ic_clean.mean()
                    std_ic = np.nan
                    ir = np.nan
                    ic_positive_ratio = (daily_ic_clean > 0).mean()
                    num_obs = len(daily_ic_clean)
                else:
                    mean_ic = daily_ic_clean.mean()
                    std_ic = daily_ic_clean.std()
                    ir = mean_ic / std_ic if std_ic != 0 and not np.isnan(std_ic) else np.nan # Avoid division by zero/NaN
                    ic_positive_ratio = (daily_ic_clean > 0).mean()
                    num_obs = len(daily_ic_clean)

                all_ic_results[(factor, return_col)] = {
                    'Mean IC': mean_ic,
                    'IC Std Dev': std_ic,
                    'IR (IC Mean/Std)': ir,
                    'IC > 0 Ratio': ic_positive_ratio,
                    'Num Observations (Days)': num_obs
                }
                # Optional: Print summary per factor
                # print(f"   - Factor '{factor}': Mean IC={mean_ic:.4f}, IR={ir:.4f}, Obs={num_obs}")

            except Exception as e:
                print(f"Error calculating IC for factor '{factor}' vs '{return_col}': {e}")
                all_ic_results[(factor, return_col)] = {k: np.nan for k in ['Mean IC', 'IC Std Dev', 'IR (IC Mean/Std)', 'IC > 0 Ratio', 'Num Observations (Days)']}

    print("\nIC/IR calculation complete for all factor/return pairs.")
    if not all_ic_results:
        return pd.DataFrame()

    # Format results into a DataFrame
    results_df = pd.DataFrame.from_dict(all_ic_results, orient='index')
    results_df.index = pd.MultiIndex.from_tuples(results_df.index, names=['Factor', 'Return Type'])
    results_df = results_df.sort_index()

    return results_df


# Step 5: Analyze Factor Correlation (Optional, Unchanged)
def analyze_factor_correlation(df, factors):
    """Calculates and displays the correlation matrix for the selected factors."""
    print("--- Step 5: Analyzing Factor Correlation (Optional) ---")
    if df is None or 'filter_out' not in df.columns: return None
    if not isinstance(df.index, pd.MultiIndex) or not all(name in df.index.names for name in ['code', 'trade_date']):
         print(f"Error: DataFrame index is not the expected MultiIndex for correlation. Index: {df.index}")
         return None

    # Use data *before* forward returns were added, but after filtering
    df_eligible = df[~df['filter_out']].copy()
    if df_eligible.empty: print("Warning: No eligible bonds for correlation."); return None
    if len(df_eligible) < 2: print("Warning: Less than 2 data points for correlation."); return None

    missing_factors = [f for f in factors if f not in df_eligible.columns]
    if missing_factors: print(f"Warning: Factors missing for correlation: {missing_factors}");
    present_factors = [f for f in factors if f in df_eligible.columns]
    if len(present_factors) < 2: print("Warning: Need at least 2 factors for correlation."); return None

    factor_data = df_eligible[present_factors]
    factor_data = factor_data.replace([np.inf, -np.inf], np.nan)
    # Drop rows with NaNs only in the columns used for this specific calculation
    factor_data = factor_data.dropna() # Drop rows if *any* factor is NaN

    if len(factor_data) < 2: print("Warning: Less than 2 valid data points after NaN drop for correlation."); return None
    if len(factor_data) > 50000: # Limit size for performance if dataset is huge
        print(f"Sampling {50000} rows for factor correlation calculation...")
        factor_data = factor_data.sample(50000, random_state=42)


    print("Calculating Spearman rank correlation matrix...")
    try:
        correlation_matrix = factor_data.corr(method='spearman')
        print("Factor Correlation Matrix:")
        plt.figure(figsize=(max(6, len(present_factors)*0.8), max(5, len(present_factors)*0.6))) # Adjust size
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, annot_kws={"size": 8})
        plt.title('Factor Spearman Rank Correlation Heatmap')
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()
        return correlation_matrix
    except Exception as corr_e:
         print(f"Error calculating or plotting correlation: {corr_e}")
         return None


# --- Main Execution Function (Simplified) ---
def run_simplified_factor_analysis(config):
    """Orchestrates the simplified factor vs. return analysis."""

    # Step 1: Load Data
    df_cb_raw, _ = load_data(config['cb_data_path'], config.get('index_data_path')) # Index not strictly needed now
    if df_cb_raw is None: return None

    # Extract config parameters
    start_date = config['start_date']
    end_date = config['end_date']
    filters = config['filters']
    factors_to_analyze = config['factors_to_analyze'] # List of factor names
    pulse_percentages = config['pulse_percentages']

    # Step 2: Filter Data
    df_filtered = filter_data(df_cb_raw, start_date, end_date, filters, factors_to_analyze)
    if df_filtered is None or df_filtered[~df_filtered['filter_out']].empty:
        print("Stopping analysis due to filtering issues or no eligible data.")
        return None

    # Step 3: Calculate Multiple Forward Returns
    df_with_returns, return_cols = calculate_multiple_fwd_returns(df_filtered, pulse_percentages)
    if df_with_returns is None:
         print("Stopping analysis: Failed to calculate forward returns.")
         return None

    # Step 4: Analyze Factor vs. Each Return Type Relationship
    ic_results_df = analyze_factor_return_relationships(df_with_returns, factors_to_analyze, return_cols)

    # Step 5: Analyze Factor Correlation (Optional)
    factor_correlation_matrix = None
    if config.get('analyze_factor_correlation', False): # Add a flag in config
        # Pass df_filtered (before returns were added) to avoid issues if return calc failed partially
        factor_correlation_matrix = analyze_factor_correlation(df_filtered, factors_to_analyze)


    # --- Step 6: Report Results ---
    print("\n" + "="*30 + " Simplified Factor Analysis Report " + "="*30)

    # --- IC / IR Results ---
    print("\n--- Factor vs. Return Relationship Analysis (IC/IR) ---")
    if ic_results_df is not None and not ic_results_df.empty:
        display(ic_results_df.style.format({
            'Mean IC': '{:.4f}',
            'IC Std Dev': '{:.4f}',
            'IR (IC Mean/Std)': '{:.3f}',
            'IC > 0 Ratio': '{:.1%}',
            'Num Observations (Days)': '{:,.0f}'
        }))

        # --- Highlight Strongest Relationships ---
        print("\n--- Strongest Relationships (Highest Absolute Mean IC per Return Type) ---")
        # Group by return type and find the factor with max abs Mean IC
        idx_max_abs_ic = ic_results_df.loc[ic_results_df.groupby(level='Return Type')['Mean IC'].idxmax(skipna=True)]
        idx_min_abs_ic = ic_results_df.loc[ic_results_df.groupby(level='Return Type')['Mean IC'].idxmin(skipna=True)]

        # Combine and select the one with larger absolute value
        strongest_ic = {}
        for ret_type in return_cols:
             max_row = idx_max_abs_ic.loc[idx_max_abs_ic.index.get_level_values('Return Type') == ret_type]
             min_row = idx_min_abs_ic.loc[idx_min_abs_ic.index.get_level_values('Return Type') == ret_type]

             best_row = None
             if not max_row.empty and not min_row.empty:
                 if abs(max_row['Mean IC'].iloc[0]) >= abs(min_row['Mean IC'].iloc[0]):
                     best_row = max_row
                 else:
                     best_row = min_row
             elif not max_row.empty:
                 best_row = max_row
             elif not min_row.empty:
                 best_row = min_row

             if best_row is not None:
                strongest_ic[ret_type] = best_row

        if strongest_ic:
            summary_df = pd.concat(strongest_ic.values())
            display(summary_df.style.format({
                'Mean IC': '{:.4f}', 'IC Std Dev': '{:.4f}', 'IR (IC Mean/Std)': '{:.3f}',
                'IC > 0 Ratio': '{:.1%}', 'Num Observations (Days)': '{:,.0f}'
            }))
        else:
            print("Could not determine strongest relationships.")


        print("\n--- Strongest Relationships (Highest Absolute IR per Return Type) ---")
        # Similar logic for IR
        # Need to handle NaN IRs carefully
        ic_results_df_ir = ic_results_df.copy()
        ic_results_df_ir['Abs IR'] = ic_results_df_ir['IR (IC Mean/Std)'].abs()
        idx_max_abs_ir = ic_results_df_ir.loc[ic_results_df_ir.groupby(level='Return Type')['Abs IR'].idxmax(skipna=True)]

        if not idx_max_abs_ir.empty:
             display(idx_max_abs_ir.drop(columns=['Abs IR']).style.format({ # Display original columns
                 'Mean IC': '{:.4f}', 'IC Std Dev': '{:.4f}', 'IR (IC Mean/Std)': '{:.3f}',
                 'IC > 0 Ratio': '{:.1%}', 'Num Observations (Days)': '{:,.0f}'
             }))
        else:
             print("Could not determine strongest relationships based on IR (potentially all NaNs).")


    else:
        print("Factor vs. Return relationship results are not available.")

    # --- Factor Correlation Results ---
    if config.get('analyze_factor_correlation', False):
        print("\n--- Factor Correlation Matrix ---")
        if factor_correlation_matrix is not None:
            print("(See heatmap plot above)")
        else:
            print("Factor correlation matrix could not be calculated.")

    print("\n" + "="*30 + " Analysis Complete " + "="*30)

    # Return key results
    return {
        "factor_return_ic_ir": ic_results_df,
        "factor_correlation": factor_correlation_matrix,
        "final_data_with_returns": df_with_returns # Include data for inspection
    }

# --- Example Configuration (Simplified) ---
CONFIG = {
    'cb_data_path': 'cb_data.parquet',       # Replace with your actual path
    'index_data_path': 'index_data.parquet', # Optional, not used in core analysis now
    'start_date': '2022-01-01',
    'end_date': '2023-12-31',
    'filters': [
        "`转股溢价率` < 0.5",   # Example: Premium < 50%
        "`剩余规模` > 0.1",     # Example: Remaining size > 0.1 Billion
        "`close` < 150",       # Example: Price < 150
        # Add more filter rules as strings usable by df.query()
    ],
    'factors_to_analyze': [
        'ytm',              # Example: Yield to maturity
        '转股溢价率',        # Example: Conversion premium
        '剩余规模',         # Example: Remaining size
        'volume_ratio',     # Example: Volume ratio (ensure this exists)
        'double_low',       # Example: Double low value (ensure this exists)
        # Add all factor column names you want to analyze
    ],
    'pulse_percentages': [2.5, 2.8, 3.0, 3.8, 5.0, 5.8, 8.0], # Define the stop-profit thresholds
    'analyze_factor_correlation': True # Set to True to run factor correlation analysis
}


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration ---
CONFIG = {
    # File Paths
    'cb_data_path': '/Users/yiwei/Desktop/git/cb_data_with_factors2.pq', # CHANGE TO YOUR PATH
    'index_data_path': '/Users/yiwei/Desktop/git/index.pq', # CHANGE TO YOUR PATH

    # Analysis Settings
    'start_date': '20220801',
    'end_date': '20240101',   # Adjust end date as needed
    'num_bins': 5,           # Number of quantiles (e.g., 5 for quintiles)
    'benchmark_col': 'index_jsl',

    # Stop-Profit & Commission (for return calculation)
    'stop_profit_pct': 0.03,
    'commission_rate': 2 / 1000,

    # Data Filtering Rules (applied first)
    'filters': [
        "close < 102",
        "close > 155", # Example: wider price range for analysis
        "left_years < 0.5",
        "amount < 500",  # Example: lower liquidity threshold
        # Add other essential filters if needed (like redeem status, list_days)
        # "is_call.isin(['已公告强赎', '公告到期赎回', '公告实施强赎', '公告提示强赎', '已满足强赎条件']) == False", # Example keeping only non-redeem
        # "list_days > 3"
    ],

    # Factors to Analyze and their Weights for the Composite Score
    # Keys: Factor column names from df.
    # Values: Weight (positive means higher factor value -> better score contribution,
    #         negative means lower factor value -> better score contribution).
    # The ranking logic internally handles this direction based on weight sign.
    'factors_and_weights': {
        'ytm': 1.0,           # Higher YTM is better
        'conv_prem': -1.0,    # Lower premium is better
        'turnover_5': 1.5,    # Higher turnover might indicate interest
        'bond_prem': -1.0,    # Lower bond premium (closer to pure bond value) might be safer
        'theory_bias': -1,
        # Add other factors you want to combine
    }
}


# Run the simplified analysis
analysis_results = run_simplified_factor_analysis(CONFIG)

    # You can access results like:
    # if analysis_results:
    #     ic_ir_df = analysis_results["factor_return_ic_ir"]
    #     if ic_ir_df is not None:
    #          print("\n --- IC/IR Results Head ---")
    #          display(ic_ir_df.head())

--- Step 1: Loading Data ---
Loaded CB data shape: (593654, 374)
Loaded Index data shape: (1765, 8)


KeyError: 'factors_to_analyze'