In [None]:
import ast
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import KernelPCA

def _write_csv(df, output_csv):
    """Write *df* to *output_csv* (no index), creating the parent directory if needed."""
    # Only path-like targets have a parent directory; file-like buffers are
    # handed to pandas untouched.
    if isinstance(output_csv, (str, os.PathLike)):
        parent = os.path.dirname(output_csv)
        if parent:
            os.makedirs(parent, exist_ok=True)
    df.to_csv(output_csv, index=False)


def kernel_pca_contexts(
    input_csv='data.csv',
    output_csv='data_kpca.csv',
    n_context_columns=30,
    dim_out=4
):
    """
    Reduce the dimensionality of per-row context vectors with Kernel PCA.

    1. Reads a CSV that has columns:
       - 'num_option' (how many contexts are valid in this row; NaN counts as 0)
       - 'context_0', 'context_1', ...
         each is either NaN or a string like "[0.75, 0.75, ...]"
    2. Collects all valid contexts across all rows into one matrix
       (N x 9 is the expected shape; other lengths trigger a warning).
    3. Fits an RBF Kernel PCA on that matrix and projects it to dim_out dims.
    4. Writes each transformed vector back into the cell it came from,
       formatted as a list string.
    5. Saves the DataFrame to output_csv (without the index), creating the
       parent directory of output_csv when it does not exist yet.

    Cells that are NaN or blank are skipped silently. Cells that cannot be
    parsed as a Python literal, or that do not parse to a list/tuple, are
    reported with a warning and left unchanged in the output.

    If no valid context is found, the input data is written to output_csv
    unchanged.

    Parameters
    ----------
    input_csv : path of the CSV file to read.
    output_csv : path of the CSV file to write.
    n_context_columns : currently unused. The contexts inspected per row are
        determined by 'num_option' and by which 'context_i' columns exist.
        The parameter is kept so existing callers continue to work.
    dim_out : number of components requested from KernelPCA.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the collected contexts do not all have the same length, so they
        cannot be stacked into a single matrix.
    """
    # 1. Read the CSV
    df = pd.read_csv(input_csv)
    available_columns = set(df.columns)  # hoisted: O(1) membership per lookup

    # 2. Collect all contexts across rows, remembering which
    #    (row label, context index) each vector came from.
    all_contexts = []
    mapping = []

    for row_idx, row in df.iterrows():
        num_opt = int(row['num_option']) if not pd.isna(row['num_option']) else 0

        for c in range(num_opt):
            col_name = f"context_{c}"
            if col_name not in available_columns:
                continue  # the row claims more contexts than there are columns

            val = row[col_name]
            if pd.isna(val):
                continue  # no content in this context
            val_str = str(val).strip()
            if not val_str:
                continue  # empty string

            # Parse the string (e.g. "[0.75, 0.75, 0.3333, 1, 0, 0, 0, 0, 0]").
            # literal_eval only accepts Python literals, so it is safe on CSV
            # content, but it raises on anything malformed.
            try:
                parsed_list = ast.literal_eval(val_str)
            except (ValueError, SyntaxError) as exc:
                print(
                    f"Warning: row={row_idx}, col={col_name} could not be "
                    f"parsed ({exc}); skipping"
                )
                continue

            if not isinstance(parsed_list, (list, tuple)):
                print(
                    f"Warning: row={row_idx}, col={col_name} is not a list "
                    f"(got {type(parsed_list).__name__}); skipping"
                )
                continue

            # Expecting each context to have exactly 9 floats
            if len(parsed_list) != 9:
                print(f"Warning: row={row_idx}, col={col_name} has unexpected length {len(parsed_list)}")

            all_contexts.append(parsed_list)
            mapping.append((row_idx, c))

    # If no contexts were found there is nothing to transform.
    if not all_contexts:
        print("No valid contexts found. Saving original CSV without changes.")
        _write_csv(df, output_csv)
        return

    # Ragged input cannot form a 2-D matrix; fail with an explicit message
    # instead of NumPy's generic "inhomogeneous shape" error.
    lengths = sorted({len(vec) for vec in all_contexts})
    if len(lengths) > 1:
        raise ValueError(
            f"Contexts have inconsistent lengths {lengths}; "
            "cannot stack them into a single matrix for Kernel PCA."
        )

    # 3. Convert to a NumPy array (N x context_length)
    X = np.array(all_contexts, dtype=float)

    # 4. Perform Kernel PCA (reduce to dim_out)
    kpca = KernelPCA(n_components=dim_out, kernel='rbf')
    X_kpca = kpca.fit_transform(X)  # shape: N x dim_out

    # 5. Place the transformed vectors back into the DataFrame,
    #    stored as a string, e.g. "[0.123, 0.456, ...]"
    for i, (row_idx, context_idx) in enumerate(mapping):
        new_vec = X_kpca[i, :]
        df.loc[row_idx, f"context_{context_idx}"] = str(new_vec.tolist())

    # 6. Save to a new CSV file
    _write_csv(df, output_csv)
    print(f"Saved dimension-reduced contexts to {output_csv}")

# Example usage
# Example usage: project every context down to `d` dimensions and write the
# result under a directory named after the chosen dimension.
d = 6
kernel_pca_contexts(
    dim_out=d,                   # desired output dimension
    n_context_columns=30,        # maximum number of context columns
    input_csv='../data/user_data_filtered.csv',        # input CSV
    output_csv=f'd={d}/user_data_filtered.csv',        # output CSV with reduced contexts
)


Saved dimension-reduced contexts to d=6/user_data_filtered.csv


: 