# Helper Functions

## Monte Carlo Functions

### DGP

In [1]:
import pandas as pd

In [None]:
from itertools import product
import pandas as pd
from tqdm import tqdm

def expand_grid(dictionary):
    """Expand a nested parameter dictionary into a DataFrame of all combinations.

    `dictionary` maps a section name to a dict of ``{param_name: [values, ...]}``.
    Each section is expanded into the Cartesian product of its value lists, and
    the per-section tables are then cross-joined, so every returned row is one
    complete parameter combination.

    The last section seeds the join, so its columns come first. A column name
    that repeats in a later-joined section receives a ``_<section>`` suffix.
    """
    # One small table per section: the Cartesian product of that section's values.
    section_frames = {
        section: pd.DataFrame(list(product(*params.values())), columns=params.keys())
        for section, params in dictionary.items()
    }

    # Start from the last section, then cross-join the remaining ones in order.
    _, grid_frame = section_frames.popitem()
    for section, frame in section_frames.items():
        grid_frame = grid_frame.merge(frame, how="cross", suffixes=("", f"_{section}"))

    return grid_frame

# Sample input:
# dictionary = {
#     'umap' : {
#         'n_neighbors' : [5, 10, 15, 30],
#         'min_dist' : [0.0, 0.1],
#         'n_components' : [5, 10],
#     },
#     'hdbscan' : {
#         'min_cluster_size' : [20, 30, 40],
#         'min_samples' : [1, 10],
#     },
#     'embedding' : {
#         'type': ['bert', 'sbert']
#     }
# }

In [None]:
from random import sample
import random
# Seed Python's global RNG once so that the `sample(...)` call used by
# generate_responses below draws the same adjectives on every fresh run.
random.seed(23)

def generate_responses(df, distribution = 'uniform', n_ideal = None, noise_rate = 0, n_per_group = None):
    """
    Draw a simulated response set from `df` under the given data-generating conditions.

    Parameters
    ----------
    df : pd.DataFrame
        Source responses; must contain an 'adj' column holding the true group
        (adjective) of every row.
    distribution : {'uniform', 'z'}
        'uniform': optionally cap every adjective at `n_per_group` rows, then
        keep the rows of `n_ideal` randomly sampled adjectives.
        'z': build several variants whose adjectives have decreasing row
        counts, following the count thresholds hard-coded below. `noise_rate`
        and `n_per_group` are not used in this mode.
    n_ideal : int, optional
        Number of adjectives (ideal lives) to keep. A falsy value keeps all.
    noise_rate : float
        'uniform' only. Adds ``int(noise_rate * len(subset))`` rows drawn from
        adjectives that were NOT selected (docs outside the true clusters).
    n_per_group : int, optional
        'uniform' only. Maximum number of rows kept per adjective
        (original note: max = 60).

    Returns
    -------
    pd.DataFrame
        For 'uniform': the selected rows (plus noise rows, if requested).
    dict[str, pd.DataFrame]
        For 'z': variant id ('0', '1', ...) -> rows of that variant.

    Raises
    ------
    ValueError
        If `distribution` is unknown, or if `noise_rate` is set without
        `n_ideal` (no adjective is left over to serve as noise).
    """
    if distribution == 'z':
        # Target row counts per adjective, from most to least frequent.
        ds = [790, 400, 280, 210, 170, 150, 130, 110, 100, 90, 90, 80, 80, 70, 70]

        # Invariant across thresholds, so computed once.
        counts = df['adj'].value_counts()

        # For every threshold d, list the adjectives whose count lies in
        # [d, d + 10]; the i-th such adjective is assigned to variant str(i).
        adjectives_by_variant = {}
        for d in ds:
            z_adj = counts.loc[lambda s: (s >= d) & (s <= d + 10)].index.tolist()
            for i, a in enumerate(z_adj):
                adjectives_by_variant.setdefault(str(i), []).append(a)

        groups = {}
        for variant, adjectives in adjectives_by_variant.items():
            if n_ideal:
                adjectives = adjectives[:n_ideal]
            groups[variant] = df[df['adj'].isin(adjectives)].reset_index(drop=True)

        return groups

    if distribution != 'uniform':
        raise ValueError(f"Unknown distribution {distribution!r}; expected 'uniform' or 'z'.")

    # Cap the number of rows per adjective. This is gated on n_per_group itself:
    # gating on n_ideal compared the counter against None whenever n_per_group
    # was left at its default.
    if n_per_group is not None:
        df = df[df.groupby('adj').cumcount() < n_per_group]

    df = df.reset_index(drop=True)

    # Sample adjectives (uses the global `random` state).
    if n_ideal:
        adj = sample(list(df['adj'].unique()), n_ideal)
        subset = df[df['adj'].isin(adj)].copy()
    else:
        adj = None
        subset = df.copy()

    # Noise: rows whose adjective is not among the selected ones.
    if noise_rate:
        if adj is None:
            raise ValueError(
                "noise_rate requires n_ideal: with every adjective selected "
                "there are no out-of-cluster rows to draw noise from."
            )
        selected = set(adj)
        noise = [a for a in df['adj'].unique() if a not in selected]
        n_noise = int(noise_rate * len(subset))
        noise_rows = df[df['adj'].isin(noise)].sample(n=n_noise, replace=False, random_state=23)
        subset = pd.concat([subset, noise_rows])

    return subset

In [3]:
def key_from_params(params: dict):
    """Turn a parameter dict into a hashable, order-independent key.

    Entries whose value is None are dropped; the remaining (name, value) pairs
    are sorted and returned as a tuple, which makes the result usable as a
    dictionary key.
    """
    kept_pairs = [(name, value) for name, value in params.items() if value is not None]
    kept_pairs.sort()  # stable key regardless of the dict's insertion order
    return tuple(kept_pairs)

def params_from_key(key) -> dict:
    """Rebuild a parameter dict from a key made of (name, value) pairs."""
    restored = {}
    restored.update(key)
    return restored

In [4]:
def dgp(df, grid, subsets):
    """Generate one response subset per DGP parameter combination.

    `grid` is expanded with `expand_grid`; each resulting row is passed as
    keyword arguments to `generate_responses`. Results are stored in-place in
    `subsets`, keyed by `key_from_params(...)`; nothing is returned.

    For the 'z' distribution `generate_responses` returns a dict of variants;
    only variants "0", "1" and "2" are kept, each stored under a key that
    additionally carries a 'variant' entry.
    """
    for params in expand_grid(grid).to_dict(orient='records'):
        generated = generate_responses(df, **params)

        if params['distribution'] != 'z':
            subsets[key_from_params(params)] = generated
            continue

        # Stop at the first variant id above 2.
        for variant, frame in generated.items():
            if int(variant) > 2:
                break
            subsets[key_from_params(params | {'variant': variant})] = frame

### LDA

In [None]:

# LDA

import re
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that `stopwords.words('english')` reads below.
nltk.download('stopwords')
from tqdm import tqdm
# Register `progress_apply` on pandas objects (used by run_lda and cos_sim).
tqdm.pandas()

# English stop words as a set; _preprocess_text filters tokens against it.
stop_words = set(stopwords.words('english'))

def _preprocess_text(text):
    """Tokenise `text` for LDA.

    The text is lower-cased, digit runs and punctuation are removed, and the
    remainder is split on whitespace. Tokens found in the module-level
    `stop_words` set, as well as the word "life", are discarded.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punct = re.sub(r'[^\w\s]', '', without_digits)

    # Stop-word set extended with the custom word "life".
    blocked = stop_words | {"life"}

    return [token for token in without_punct.split() if token not in blocked]


def _get_dominant_topic(lda_model, doc):
    """Return the 1-based id of the highest-probability topic of `doc`.

    `lda_model.get_document_topics(doc)` is expected to yield
    (topic_id, probability) pairs; the id of the pair with the largest
    probability is returned, shifted by one.
    """
    scored_topics = lda_model.get_document_topics(doc)
    winner = max(scored_topics, key=lambda pair: pair[1])
    return winner[0] + 1  # 1-based topic index


def run_lda(df, params, n_true):
    """
    Fit an LDA model on df['response'] and label every document with its dominant topic.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'response' column of raw text. The input is not
        modified; a copy receives the new 'cluster' column.
    params : dict
        Must contain 'num_topics':
          - 'true'  -> use `n_true` topics
          - 'auto'  -> not implemented yet
          - anything else is passed to LdaModel as the number of topics
    n_true : int
        True number of groups; used only when params['num_topics'] == 'true'.

    Returns
    -------
    dict with keys
        'df'        : copy of `df` with a 'cluster' column (1-based topic ids)
        'topics'    : list of the same cluster labels, one per document
        'lda_model' : the fitted LdaModel

    Raises
    ------
    NotImplementedError
        If params['num_topics'] == 'auto'. Previously this path fell through
        and failed later with a NameError on `clusters`.
    """
    # Resolve the topic count first so that unsupported settings fail before
    # any preprocessing work is done.
    num_topics = params['num_topics']
    if num_topics == 'auto':
        raise NotImplementedError("num_topics='auto' is not implemented for LDA.")
    if num_topics == 'true':
        num_topics = n_true

    df = df.copy()

    cleaned_text = df['response'].progress_apply(_preprocess_text)
    dictionary = Dictionary(cleaned_text)
    corpus = [dictionary.doc2bow(text) for text in cleaned_text]

    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        random_state=23,  # fixed seed for reproducible topics
        num_topics=num_topics
    )
    clusters = [_get_dominant_topic(lda_model, doc) for doc in corpus]

    df['cluster'] = clusters

    output = {
        'df': df,
        'topics': clusters,
        'lda_model': lda_model
    }

    return output


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Kmeans

In [7]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

def run_kmeans(df, embedding_colname, n_true, n_clusters):
    """
    Cluster the embeddings in df[embedding_colname] with KMeans (run through BERTopic).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'response' column (documents) and `embedding_colname`
        (one embedding vector per row).
    embedding_colname : str
        Name of the embedding column.
    n_true : int
        True number of groups.
    n_clusters : {'true', 'auto'}
        'true' -> fit once with `n_true` clusters.
        'auto' -> fit with n_true and n_true +/- 2, +/- 4 clusters and keep the
                  fit with the highest silhouette score.

    Returns
    -------
    dict with keys
        'topics' : cluster label per document
        'model'  : the `run_bertopic` output of the selected fit
        'info_criteria_test' : ('auto' only) list of
            {'n', 'silhouette', 'model'} dicts, one per candidate

    Raises
    ------
    ValueError
        If `n_clusters` is neither 'true' nor 'auto' (previously this ended in
        an UnboundLocalError on `final`).
    """
    if n_clusters not in ('auto', 'true'):
        raise ValueError(f"n_clusters must be 'auto' or 'true', got {n_clusters!r}.")

    docs = list(df['response'])
    embeddings = np.vstack(df[embedding_colname])  # built once, reused for every fit

    def _fit(k):
        # KMeans is run through BERTopic so the output has the same structure
        # as the other embedding-based methods.
        return run_bertopic(
            docs,
            embeddings,
            {'kmeans': {'n_clusters': k, 'random_state': 23}},
            representation_model=None,
            bertopic_kwargs={},
        )

    if n_clusters == 'true':
        fitted = _fit(n_true)
        return {
            'topics': fitted['topics'],
            'model': fitted
        }

    # 'auto': silhouette test over two candidates on each side of n_true
    # (step 2), plus n_true itself. The silhouette score needs at least two
    # clusters, so smaller candidates are dropped.
    before = [n_true - x*2 for x in range(1, 3)]
    after = [n_true + x*2 for x in range(1, 3)]
    candidates = [k for k in before + after + [n_true] if k >= 2]
    if not candidates:
        raise ValueError(f"No valid cluster counts to test for n_true={n_true!r}.")

    outputs = []
    for n in candidates:
        fitted = _fit(n)
        outputs.append({
            'n': n,
            'silhouette': silhouette_score(embeddings, fitted['topics']),
            'model': fitted
        })

    # Pick best model (first candidate wins on ties).
    best_model = max(outputs, key=lambda x: x['silhouette'])
    return {
        'topics': best_model['model']['topics'],
        'model': best_model['model'],
        'info_criteria_test': outputs
    }

### BERTopic

In [8]:
import numpy as np
import inspect
from typing import Dict, Optional

from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

def _build_umap(umap_params: Optional[Dict]) -> object:
    """Build the dimensionality-reduction step handed to BERTopic.

    With parameters, returns ``UMAP(**umap_params)``. With ``None``, returns a
    ``BaseDimensionalityReduction`` instance, which disables dimensionality
    reduction.
    """
    if umap_params is not None:
        return UMAP(**umap_params)
    return BaseDimensionalityReduction()

def _build_clusterer(hdbscan_params: Optional[Dict], kmeans_params: Optional[Dict]) -> object:
    """Return exactly one clusterer; raise if none or both are provided."""
    provided = sum(p is not None for p in (hdbscan_params, kmeans_params))
    if provided == 0:
        raise ValueError("You must provide either 'hdbscan' or 'kmeans' parameters.")
    if provided > 1:
        raise ValueError("Please provide only one of 'hdbscan' or 'kmeans' parameters, not both.")

    if hdbscan_params is not None:
        return HDBSCAN(**hdbscan_params)

    return KMeans(**kmeans_params)

def _make_topic_model(umap_model, cluster_model, representation_model=None, bertopic_kwargs: Optional[Dict]=None) -> BERTopic:
    """Instantiate BERTopic around pre-built components.

    `embedding_model` is set to None. The clusterer, whatever its type, is
    handed to BERTopic through its `hdbscan_model` argument. Any entries in
    `bertopic_kwargs` are forwarded to the BERTopic constructor unchanged.
    """
    extra_kwargs = bertopic_kwargs if bertopic_kwargs else {}

    topic_model = BERTopic(
        embedding_model=None,
        umap_model=umap_model,
        hdbscan_model=cluster_model,
        representation_model=representation_model,
        **extra_kwargs
    )
    return topic_model

def run_bertopic(
    docs,
    embeddings,
    grid: Optional[Dict[str, Dict]] = None,
    *,
    representation_model=None,
    bertopic_kwargs: Optional[Dict] = None,
):
    """
    Run BERTopic with optional UMAP and exactly one clusterer (HDBSCAN or KMeans).

    Parameters
    ----------
    docs : list[str]
        The original documents.
    embeddings : array-like
        Precomputed embeddings aligned with `docs`. Converted via np.array.
    grid : dict
        Parameter grid with optional keys:
          - 'umap':   dict of UMAP parameters (omit or None to disable DR)
          - 'hdbscan': dict of HDBSCAN parameters (mutually exclusive with 'kmeans')
          - 'kmeans': dict of KMeans parameters (mutually exclusive with 'hdbscan')
        If both 'umap' and 'hdbscan' are given as EMPTY dicts, a default
        ``BERTopic(representation_model=None)`` is used instead.
    representation_model : optional
        A BERTopic-compatible representation model (e.g., bertopic.representation.OpenAI(...)).
        Pass None to disable GPT-based representations.
    bertopic_kwargs : optional dict
        Extra kwargs forwarded to BERTopic (e.g., verbose=True, calculate_probabilities=True, etc.)

    Returns
    -------
    dict with keys
        'model'      : the fitted BERTopic instance
        'topics'     : topic id per document
        'probs'      : probabilities returned by fit_transform
        'topic_info' : copy of get_topic_info() taken right after fitting
        'fig'        : visualize_documents figure
        'hierarchy'  : visualize_hierarchy figure
        'rep_docs'   : topic info with up to 10 representative documents per
                       topic spread over columns rep1, rep2, ...

    Raises
    ------
    ValueError
        From `_build_clusterer` when neither or both of 'hdbscan'/'kmeans'
        are supplied.
    """
    grid = grid or {}
    umap_params    = grid.get("umap")
    hdbscan_params = grid.get("hdbscan")
    kmeans_params  = grid.get("kmeans")

    umap_model    = _build_umap(umap_params)
    cluster_model = _build_clusterer(hdbscan_params, kmeans_params)

    # "Both given and both empty" selects BERTopic's defaults. The None checks
    # matter: calling len() on a missing section raised TypeError for every
    # grid without 'umap' or 'hdbscan' (e.g. KMeans-only grids).
    use_bertopic_defaults = (
        umap_params is not None and len(umap_params) == 0
        and hdbscan_params is not None and len(hdbscan_params) == 0
    )

    if use_bertopic_defaults:
        topic_model = BERTopic(representation_model = None)
    else:
        topic_model = _make_topic_model(
            umap_model=umap_model,
            cluster_model=cluster_model,
            representation_model=representation_model,
            bertopic_kwargs=bertopic_kwargs,
        )

    embeddings_np = np.array(embeddings)

    topics, probs = topic_model.fit_transform(docs, embeddings_np)

    output = {}
    output['model'] = topic_model
    output['topics'] = topics
    output['probs'] = probs
    output['topic_info'] = topic_model.get_topic_info().copy()

    # Get Figures
    output['fig'] = topic_model.visualize_documents(docs = docs, embeddings = embeddings_np)
    output['hierarchy'] = topic_model.visualize_hierarchy()

    # Get top 10 rep docs
    # Topics and docs combined, required by the internal functions below.
    doc_topic = pd.DataFrame({
        'Topic': topic_model.topics_,
        'ID': range(len(topic_model.topics_)),
        'Document': docs,
    })

    # NOTE(review): the two calls below are private BERTopic methods; their
    # signatures and return arity may differ between versions -- confirm
    # against the pinned BERTopic release.
    topic_model._create_topic_vectors(doc_topic, embeddings_np)  # populate topic embeddings
    repr_docs, _, _, _ = topic_model._extract_representative_docs(
        topic_model.c_tf_idf_,
        doc_topic,
        topic_model.topic_representations_,
        nr_samples=1000,
        nr_repr_docs=10
    )
    topic_model.representative_docs_ = repr_docs

    # Re-read topic info now that representative_docs_ has been replaced, and
    # spread each topic's list of documents over columns rep1, rep2, ...
    topic_info = topic_model.get_topic_info()
    rep_cols = (
        topic_info["Representative_Docs"]
        .apply(pd.Series)
        .rename(columns=lambda i: f"rep{i+1}")
    )

    output['rep_docs'] = pd.concat(
        [topic_info.drop(columns=["Representative_Docs"]), rep_cols], axis=1
    )

    return output


  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


### Cosine Similarity

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.cluster import KMeans
from tqdm import tqdm
tqdm.pandas()

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from string import punctuation
import unicodedata
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity

# Load the bert-base-uncased tokenizer and model once at module level;
# get_bert_embedding reads both as globals. Hidden states are requested
# because the embedding is taken from hidden_states[-2].
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
    return_dict=True
)

###############################
# 2. Preprocessing function
###############################
def preprocess_text(text):
    """Normalise a raw response string.

    Returns None for null or whitespace-only input. Otherwise the text is
    NFKD-normalised, every non-ASCII character is dropped (which removes
    accents and special characters), and runs of whitespace are collapsed to
    single spaces.
    """
    import unicodedata

    if pd.isnull(text):
        return None
    if text.strip() == "":
        return None

    # Decompose accented characters, then keep only the ASCII part.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # split() + join collapses redundant whitespace and trims both ends.
    return " ".join(ascii_only.split())

###############################
# 3. BERT embedding function
###############################
def get_bert_embedding(text):
    """Embed `text` with the module-level BERT `tokenizer` and `model`.

    The model is moved to CUDA when available (CPU otherwise). The embedding
    is the mean over the sequence dimension of the second-to-last hidden
    layer, returned as a flattened NumPy array.
    """
    target = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(target)

    # Relies on the globally loaded tokenizer & model (hidden states enabled).
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(target)

    with torch.no_grad():
        result = model(**encoded, output_hidden_states=True, return_dict=True)
        penultimate = result.hidden_states[-2]  # [batch_size, seq_len, hidden_size]
        pooled = penultimate.mean(dim=1)        # mean pooling over tokens

    return pooled.cpu().numpy().flatten()

###############################
# 4. Cosine similarity function
###############################
def compute_similarity(emb1, emb2):
    """Pairwise cosine similarity between the rows of two 2-D tensors.

    Each row is L2-normalised along dim 1; the result is the matrix product
    of the first normalised tensor with the transpose of the second, i.e. a
    tensor of shape (rows of emb1, rows of emb2).
    """
    left, right = (
        torch.nn.functional.normalize(tensor, p=2, dim=1) for tensor in (emb1, emb2)
    )
    return torch.mm(left, right.t())

#### Get base embeddings

def get_base_embs(phrase, adjective_list):
    """Embed `phrase` once per adjective.

    For every adjective, the "..." placeholder in `phrase` is replaced by the
    adjective and the resulting sentence is embedded with
    `get_bert_embedding`. Returns a dict mapping adjective -> embedding.
    """
    return {
        adj: get_bert_embedding(phrase.replace("...", f"{adj}"))
        for adj in adjective_list
    }


# NOTE(review): this repeats, with identical arguments, the tokenizer/model
# load performed earlier in this cell -- looks redundant; confirm before removing.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
    return_dict=True
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model whose `.encode(...)` is called by cos_sim(model='qwen').
qwen = SentenceTransformer("Qwen/Qwen3-Embedding-4B")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [11]:
# NOTE(review): this rebinds `qwen` to the BERT `model` loaded above, replacing
# the SentenceTransformer. cos_sim(model='qwen') calls
# `qwen.encode(..., convert_to_numpy=True)`, which presumably only the
# SentenceTransformer provides -- confirm this cell is intended to be run.
qwen = model

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

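# NOTE: cosine_similarity works on 2-D arrays of shape (n_samples, n_features),
# so each 1-D embedding is reshaped to (1, -1) before it is compared.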

def find_best_match(text_embedding, phrase_embeddings):
    """
    Return the label whose phrase embedding is most similar to `text_embedding`.

    Parameters
    ----------
    text_embedding : array-like
        Embedding vector of one document.
    phrase_embeddings : dict
        Maps a label (adjective) to its embedding vector.

    Returns
    -------
    The key of `phrase_embeddings` with the highest cosine similarity. On
    ties, the key that appears first in the dict wins. A match is returned
    even when every similarity is zero or negative (previously that case
    raised UnboundLocalError).

    Raises
    ------
    ValueError
        If `phrase_embeddings` is empty.
    """
    if not phrase_embeddings:
        raise ValueError("phrase_embeddings is empty; nothing to match against.")

    # cosine_similarity expects 2-D inputs (one row per sample), hence the
    # reshape(1, -1) of every 1-D embedding.
    query = np.asarray(text_embedding).reshape(1, -1)
    labels = list(phrase_embeddings)
    candidates = np.vstack(
        [np.asarray(phrase_embeddings[label]).reshape(1, -1) for label in labels]
    )

    # One vectorised call instead of one call per candidate phrase.
    similarities = cosine_similarity(query, candidates)[0]
    return labels[int(np.argmax(similarities))]


def cos_sim(df, model = 'qwen'):
    """
    Assign every document to the adjective whose prompt embedding it is closest to.

    For each unique value of df['adj'] the sentence "My ideal life is {adj}"
    is embedded with the chosen model; every row's precomputed embedding
    (read from the column named like the model, i.e. df['qwen'] or
    df['bert']) is then matched to the most similar prompt with
    `find_best_match`.

    Parameters
    ----------
    df : pd.DataFrame
        Needs an 'adj' column and an embedding column named after `model`.
    model : {'qwen', 'bert'}
        'qwen' embeds prompts with the global `qwen.encode`;
        'bert' embeds prompts with `get_bert_embedding`.

    Returns
    -------
    pd.Series with the matched adjective for every row of `df`.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names (previously this ended
        in an UnboundLocalError on `clusters`).
    """
    if model not in ('qwen', 'bert'):
        raise ValueError(f"Unknown model {model!r}; expected 'qwen' or 'bert'.")

    adjs = df['adj'].unique()

    if model == 'qwen':
        phrase_embeddings = {
            adj: qwen.encode(f"My ideal life is {adj}", convert_to_numpy=True)
            for adj in adjs
        }
    else:
        phrase_embeddings = {
            adj: get_bert_embedding(f"My ideal life is {adj}")
            for adj in adjs
        }

    # The embedding column carries the same name as the model.
    clusters = df[model].progress_apply(lambda x: find_best_match(x, phrase_embeddings))

    return clusters

### Monte Carlo Wrapper

In [13]:
# Model grid for monte_carlo: KMeans with fixed cluster counts plus the
# special values 'true' and 'auto' (those two are routed to run_kmeans),
# evaluated on the 'bert' embedding column.
grid = {
    'kmeans' : {
        'n_clusters' : [5, 10, 15, 20, 50, 'true', 'auto'],
    },
    'embedding' : {
        'type': ['bert']
    }
}


In [14]:
# TODO: Better way of storing BERTopic results

def monte_carlo(df, dgp, grid, results, method = 'embedding'):
    """
    Run every model configuration in `grid` on every generated subset.

    Parameters
    ----------
    df : pd.DataFrame
        Not used by this function; kept for interface compatibility.
    dgp : dict
        Maps a key built by `key_from_params` to the subset DataFrame produced
        for those data-generating parameters.
    grid : dict
        Model grid accepted by `expand_grid`. Recognised sections:
          - 'lda'       : every row is run with `run_lda`
          - 'kmeans'    : 'n_clusters' of 'auto'/'true' goes to `run_kmeans`,
                          any other value to `run_bertopic`
          - 'umap', 'hdbscan' : forwarded to `run_bertopic`
                          ('hdbscan' takes precedence over 'kmeans')
          - 'embedding' : 'type' names the embedding column of the subset
                          (required for every non-LDA run)
    results : list
        Extended in place with one dict per (subset, configuration):
        {'name', 'params', 'output', 'df'}. Nothing is returned.
    method : str
        Not used by this function; kept for interface compatibility.
    """
    # Plain dict records (as in `dgp`) keep each value's own type; iterrows()
    # would coerce a whole row to one common dtype.
    grid_rows = expand_grid(grid).to_dict(orient='records')

    total = len(dgp) * len(grid_rows)
    with tqdm(total=total, desc="Total combos") as pbar:
        for dgp_key, subset in dgp.items():
            dgp_params = params_from_key(dgp_key)
            # key_from_params drops None values, so 'n_ideal' may be absent.
            n_true = dgp_params.get('n_ideal')

            for row in grid_rows:
                run_params = dgp_params | row
                model_grid = {}

                # TOPIC MODELLING
                if 'lda' in grid:
                    model_grid['lda'] = {name: row[name] for name in grid['lda']}

                    results.append(
                        {
                            'name': 'lda',
                            'params': run_params,
                            'output': run_lda(subset, model_grid['lda'], n_true=n_true),
                            'df': subset
                        }
                    )

                    # Move on to the next configuration. A `break` here used to
                    # skip all remaining LDA rows and left the progress bar at 0.
                    pbar.update(1)
                    continue

                embedding_type_str = row['type']
                # One embedding vector per document, as a plain list.
                embedding_data = subset[embedding_type_str].tolist()

                if 'kmeans' in grid and row['n_clusters'] in ('auto', 'true'):

                    results.append(
                        {
                            'name': 'kmeans',
                            'params': run_params,
                            'output': run_kmeans(subset, embedding_type_str, n_true, row['n_clusters']),
                            'df': subset
                        }
                    )

                else:

                    # EMBEDDING METHOD
                    name = None

                    if 'umap' in grid:
                        model_grid['umap'] = {key: row[key] for key in grid['umap']}

                    if 'hdbscan' in grid:
                        model_grid['hdbscan'] = {key: row[key] for key in grid['hdbscan']}
                        name = 'hdbscan'

                    elif 'kmeans' in grid:
                        model_grid['kmeans'] = {key: row[key] for key in grid['kmeans']}
                        name = 'kmeans'

                    results.append(
                        {
                            'name': name,
                            'params': run_params,
                            'output': run_bertopic(list(subset['response']), embedding_data, model_grid, representation_model=None, bertopic_kwargs={}),
                            'df': subset
                        }
                    )

                pbar.update(1)

In [31]:
import numpy as np
import pandas as pd
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics.cluster import contingency_matrix

def clustering_report(y_true, y_pred):
    """
    Compute clustering metrics between ground-truth labels and cluster labels.

    Parameters
    ----------
    y_true : array-like (pd.Series, list, np.ndarray)
        Ground-truth class labels (can be strings or ints).
    y_pred : array-like
        Cluster assignments (can be strings or ints).

    Both inputs are re-indexed positionally; pairs in which either label is
    NA are dropped before scoring.

    Returns
    -------
    dict with keys: 'NMI', 'ACC', 'Purity', 'ARI' (all floats)

    Raises
    ------
    ValueError
        If no non-NA label pair remains.
    """
    # Convert to pandas Series (positional index) for easy NA handling.
    y_true = pd.Series(y_true).reset_index(drop=True)
    y_pred = pd.Series(y_pred).reset_index(drop=True)

    # Drop pairs with NA
    mask = (~y_true.isna()) & (~y_pred.isna())
    y_true = y_true[mask].to_numpy()
    y_pred = y_pred[mask].to_numpy()

    if y_true.size == 0:
        raise ValueError("No valid (non-NA) label pairs to compare.")

    # Contingency table: rows=true classes, cols=predicted clusters
    C = contingency_matrix(y_true, y_pred)  # shape: [n_true, n_pred]
    N = C.sum()

    # Purity: sum over clusters of majority true class proportion
    purity = np.sum(C.max(axis=0)) / N

    # ACC: best one-to-one matching of clusters to classes. Only a missing
    # SciPy triggers the greedy fallback; any other error from the Hungarian
    # step propagates instead of silently producing an approximate ACC.
    try:
        from scipy.optimize import linear_sum_assignment
    except ImportError:
        linear_sum_assignment = None

    if linear_sum_assignment is not None:
        row_ind, col_ind = linear_sum_assignment(-C)  # negate to maximize
        acc = C[row_ind, col_ind].sum() / N
    else:
        # Greedy fallback: repeatedly take the largest remaining cell and
        # retire its row and column.
        C_work = C.copy().astype(float)
        acc_sum = 0.0
        used_rows, used_cols = set(), set()
        while len(used_rows) < C.shape[0] and len(used_cols) < C.shape[1]:
            i, j = np.unravel_index(np.argmax(C_work), C_work.shape)
            if C_work[i, j] <= -1:  # exhausted
                break
            acc_sum += C[i, j]
            used_rows.add(i)
            used_cols.add(j)
            C_work[i, :] = -1
            C_work[:, j] = -1
        acc = acc_sum / N

    # NMI & ARI
    nmi = normalized_mutual_info_score(y_true, y_pred, average_method="arithmetic")
    ari = adjusted_rand_score(y_true, y_pred)

    return {"NMI": float(nmi), "ACC": float(acc), "Purity": float(purity), "ARI": float(ari)}


def evaluate(results):
    """
    Score every simulation result and collect the scores in one DataFrame.

    Parameters
    ----------
    results : list of dict
        Entries as appended by `monte_carlo`: each needs 'params' (dict),
        'name', 'df' (with the true labels in column 'adj') and 'output'
        (with the predicted labels under 'topics').

    Returns
    -------
    pd.DataFrame with one row per result: the run parameters, 'name', and the
    metrics 'NMI', 'ACC', 'Purity', 'ARI' from `clustering_report`. Every row
    keeps index 0. An empty DataFrame is returned for empty `results`.
    """
    rows = []
    for result in results:
        row = pd.DataFrame(result['params'], index=[0])
        row['name'] = result['name']
        metrics = clustering_report(result['df']['adj'], result['output']['topics'])
        for metric in ('NMI', 'ACC', 'Purity', 'ARI'):
            row[metric] = metrics[metric]
        rows.append(row)

    if not rows:
        return pd.DataFrame()

    # Concatenate once at the end; growing the report inside the loop copies
    # all previously collected rows on every iteration.
    return pd.concat(rows)

In [32]:
def search(results, params):
    """
    Filter `results` by parameter values.

    A result is kept when, for every (key, value) in `params`, the key is
    either absent from result['params'] (then it is ignored) or present with
    an equal value. Results without a 'params' entry are always kept.
    """
    def _is_match(result_params):
        for key, wanted in params.items():
            if key in result_params and not (result_params[key] == wanted):
                return False
        return True

    return [result for result in results if _is_match(result.get('params', {}))]


# Run Simulation

In [18]:
import pickle as pkl

# Load the encoded responses. `here` is not defined in the visible part of
# this file -- presumably a base-directory string ending in a path separator
# set elsewhere; TODO confirm.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles from a trusted source.
with open(here + 'data/IdealLifeResponse_Encoded.pkl', 'rb') as f:
    df = pkl.load(f)

In [19]:
# Wrap the unpickled object in a DataFrame for the functions below.
df = pd.DataFrame(df)

### DGP

In [20]:
# Data-generating grid: every combination of these values is passed as keyword
# arguments to generate_responses (via dgp).
grid = {
    'dgp': {
        'n_ideal': [10, 50],
        'noise_rate': [0],
        'n_per_group': [30],
        'distribution': ['z', 'uniform']
    },
}

# Filled in place by dgp(): key_from_params(...) -> generated subset.
subsets = {}

dgp(df = df, grid = grid, subsets = subsets)

### Matching

In [None]:
# Match every response of the first generated subset to its closest adjective
# prompt, using cos_sim's default model ('qwen').
matches = cos_sim(subsets[list(subsets.keys())[0]])

100%|██████████| 2470/2470 [00:10<00:00, 227.69it/s]


In [None]:
# Same matching on the first generated subset, but with the 'bert' embeddings.
matches_bert = cos_sim(subsets[list(subsets.keys())[0]], model = 'bert')

100%|██████████| 2470/2470 [00:10<00:00, 242.73it/s]


### LDA

In [None]:
# LDA grid: a single configuration that uses the true number of groups
# ('true' makes run_lda take num_topics from n_true).
grid = {
    'lda': {
        'num_topics': ['true']
    }
}

# monte_carlo appends one result dict per (subset, configuration) to this list.
results_lda = []
monte_carlo(df = df, dgp = subsets, grid = grid, results = results_lda)

Total combos:   0%|          | 0/8 [00:00<?, ?it/s]
100%|██████████| 2470/2470 [00:00<00:00, 41121.12it/s]

100%|██████████| 2470/2470 [00:00<00:00, 48435.34it/s]

100%|██████████| 2320/2320 [00:00<00:00, 48542.28it/s]

100%|██████████| 300/300 [00:00<00:00, 45329.13it/s]

100%|██████████| 2640/2640 [00:00<00:00, 46274.43it/s]

100%|██████████| 2640/2640 [00:00<00:00, 48119.46it/s]

100%|██████████| 2490/2490 [00:00<00:00, 49798.62it/s]

100%|██████████| 1500/1500 [00:00<00:00, 45611.41it/s]
Total combos:   0%|          | 0/8 [00:21<?, ?it/s]
