In [6]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
# from scipy.stats import pearsonr
# import itertools
import os,sys 
current_dir = os.getcwd()
# project_root = os.path.dirname(current_dir)  
project_root = "/Users/rb666/Projects/MOSAIC" #/raid/home/rbeaute/Projects/MOSAIC"
sys.path.append(project_root)


print(f"cwd: {os.getcwd()}")
print(f"project_root: {project_root}")
print(f"configs exists: {os.path.exists(os.path.join(project_root, 'configs'))}")

# import nltk
# nltk.download('stopwords')
from configs.dreamachine2 import config
# from src.preprocessor import preproc
from preproc.preprocessor import preproc

EMBEDDING_MODEL_NAME = config.transformer_model
print(f"Transformer model: {EMBEDDING_MODEL_NAME}")

cwd: /Users/rb666/Projects/MOSAIC/EVAL/dreamachine/stability_tests
project_root: /Users/rb666/Projects/MOSAIC
configs exists: True
Transformer model: Qwen/Qwen3-Embedding-0.6B


In [None]:
dataset = "DREAMACHINE"
condition = "DL"  # "HS" or "DL"
sentences = True

print(f"Current working directory: {os.getcwd()}")
BOX_DIR = os.path.join(os.path.expanduser("~"), "Library", "CloudStorage", "Box-Box", "TMDATA")
print(f"Retrieving data from BOX, locally stored at: {BOX_DIR}")
DATA_DIR = os.path.join(BOX_DIR, dataset)
print(f"Data directory: {DATA_DIR}")
results_dir = os.path.join(project_root, "EVAL",dataset.lower())

reports_path = os.path.join(DATA_DIR, f"{condition}_reflections_APIcleaned.csv")
print("Using data from:", reports_path)

# load data and divide into sentences if needed
df_reports = pd.read_csv(reports_path)['cleaned_reflection'].dropna().reset_index(drop=True)
df_reports

#preproc reports using preproc function in src
df_reports = preproc(df_reports)
print(f"Loaded and preprocessed {len(df_reports)} sentences for condition '{condition}'.")

Current working directory: /Users/rb666/Projects/MOSAIC/EVAL/dreamachine/stability_tests
Retrieving data from BOX, locally stored at: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA
Data directory: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/DREAMACHINE
Using data from: /Users/rb666/Library/CloudStorage/Box-Box/TMDATA/DREAMACHINE/DL_reflections_APIcleaned.csv

Successfully loaded and processed 205 sentences.
After removing short sentences, 198 sentences remain.
After removing duplicates, 198 remain.
Loaded and preprocessed 198 sentences for condition 'DL'.


In [None]:
# --- PARAMETER SELECTION (from optuna results) ---

if condition == "DL":
    print("Using MANUAL optimal parameters for Deep Listening (DL)...")
    chosen_params = {
        'n_components': 7,
        'n_neighbors': 8,
        'min_dist': 0.04,
        'min_cluster_size': 10,
        'min_samples': 9
    }
elif condition == "HS":
    print("Using MANUAL optimal parameters for High Sensory (HS)...")
    chosen_params = {
        'n_components': 20,
        'n_neighbors': 26,
        'min_dist': 0.015,
        'min_cluster_size': 10,
        'min_samples': 8
    }

print("Parameters loaded:")
print(chosen_params)

Using MANUAL optimal parameters for Deep Listening (DL)...
Parameters loaded:
{'n_components': 7, 'n_neighbors': 8, 'min_dist': 0.04, 'min_cluster_size': 10, 'min_samples': 9}


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# --- CONFIGURATION ---
NUM_BOOTSTRAPS = 100    # As requested by reviewer
SAMPLE_FRAC = 0.80      # 80% subsample
top_n_words = 15





# --- 1. SETUP MODELS (Fixed Parameters) ---
# We use the exact parameters found by Optuna (chosen_params)
# This ensures we test DATA stability, not parameter stability.

embedding_model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B") 

def create_model_instance():
    umap_model = UMAP(
        n_neighbors=int(chosen_params['n_neighbors']),
        n_components=int(chosen_params['n_components']),
        min_dist=float(chosen_params['min_dist']),
        metric='cosine',
        random_state=42  # Fixed seed for UMAP to isolate data variance
    )
    
    hdbscan_model = HDBSCAN(
        min_cluster_size=int(chosen_params['min_cluster_size']),
        min_samples=int(chosen_params['min_samples']),
        prediction_data=True
    )
    
    vectorizer_model = CountVectorizer(stop_words="english") 

    return BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        top_n_words=top_n_words,
        nr_topics="auto",
        verbose=False
    )





In [None]:


# --- 2. TRAIN REFERENCE MODEL (100% Data) ---
print("Training Reference Model on 100% of data...")
sentences_array = df_reports
embeddings_ref = embedding_model.encode(sentences_array, show_progress_bar=True)

ref_model = create_model_instance()
topics_ref, probs_ref = ref_model.fit_transform(sentences_array, embeddings_ref)

# Get Reference Centroids (Topic Embeddings)
# Note: topic_embeddings_ is a list where index corresponds to topic ID + 1 (usually)
# Use get_topic_info() to map ID to actual embedding safely
ref_info = ref_model.get_topic_info()
ref_topics = ref_info[ref_info['Topic'] != -1]['Topic'].tolist() # Exclude outliers

# Extract actual embeddings for valid topics
ref_centroids = []
for tid in ref_topics:
    #  retrieves the embedding vector for topic ID 'tid'
    ref_centroids.append(ref_model.topic_embeddings_[tid + ref_model._outliers]) 

ref_centroids = np.array(ref_centroids)
print(f"Reference Model has {len(ref_topics)} topics.")


# --- 3. BOOTSTRAP LOOP ---
print(f"\nStarting {NUM_BOOTSTRAPS} Bootstrap Iterations ({SAMPLE_FRAC*100}% sample)...")

similarity_scores = {tid: [] for tid in ref_topics} # Store scores for each topic separately

for i in range(NUM_BOOTSTRAPS):
    #  sample indices so we can grab the corresponding pre-computed embeddings (faster)
    n_samples = int(len(sentences_array) * SAMPLE_FRAC)
    indices = np.random.choice(len(sentences_array), n_samples, replace=False)
    
    subset_sentences = [sentences_array[j] for j in indices]
    subset_embeddings = embeddings_ref[indices]
    
    # train new model on subset
    bs_model = create_model_instance()
    bs_model.fit(subset_sentences, subset_embeddings)
    
    # extract new centroids
    bs_info = bs_model.get_topic_info()
    bs_valid_topics = bs_info[bs_info['Topic'] != -1]['Topic'].tolist()
    
    if not bs_valid_topics:
        print(f"Iter {i}: No topics found. Skipping.")
        continue
        
    bs_centroids = np.array([bs_model.topic_embeddings_[tid + bs_model._outliers] for tid in bs_valid_topics])
    
    # compare: find best match for each reference topic
    # calculate similarity matrix (Rows: Ref Topics, Cols: Bootstrap Topics)
    sim_matrix = cosine_similarity(ref_centroids, bs_centroids)
    
    # for each reference topic, find the max similarity in the new model
    for idx, topic_id in enumerate(ref_topics):
        best_match_score = np.max(sim_matrix[idx]) # Best match found in this bootstrap run
        similarity_scores[topic_id].append(best_match_score)
        
    if (i+1) % 10 == 0:
        print(f"Completed {i+1}/{NUM_BOOTSTRAPS} runs...")

# --- 4. RESULTS AGGREGATION ---
print("\n--- BOOTSTRAP ROBUSTNESS RESULTS ---")
final_scores = []
for tid in ref_topics:
    scores = similarity_scores[tid]
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    final_scores.append(mean_score)
    print(f"Topic {tid}: Mean Sim = {mean_score:.3f} (+/- {std_score:.3f})")

grand_mean = np.mean(final_scores)
grand_std = np.std(final_scores)

print(f"\nGLOBAL STABILITY (Average Cosine Similarity): {grand_mean:.3f} (+/- {grand_std:.3f})")


Training Reference Model on 100% of data...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Reference Model has 7 topics.

Starting 100 Bootstrap Iterations (80.0% sample)...
Completed 10/100 runs...
Completed 20/100 runs...
Completed 30/100 runs...
Completed 40/100 runs...
Completed 50/100 runs...
Completed 60/100 runs...
Completed 70/100 runs...
Completed 80/100 runs...
Completed 90/100 runs...
Completed 100/100 runs...

--- BOOTSTRAP ROBUSTNESS RESULTS ---
Topic 0: Mean Sim = 0.969 (+/- 0.023)
Topic 1: Mean Sim = 0.956 (+/- 0.020)
Topic 2: Mean Sim = 0.980 (+/- 0.025)
Topic 3: Mean Sim = 0.894 (+/- 0.060)
Topic 4: Mean Sim = 0.911 (+/- 0.039)
Topic 5: Mean Sim = 0.900 (+/- 0.048)
Topic 6: Mean Sim = 0.906 (+/- 0.032)

GLOBAL STABILITY (Average Cosine Similarity): 0.931 (+/- 0.033)
