# Embedding Models: Load dataset and extract embeddings

In [5]:
# Standard imports
import json
import os
import numpy as np
from pprint import pprint

# For nicer display
np.set_printoptions(precision=4, suppress=True)

# Path to dataset: update if your filename is different
DATASET_PATH = './Text_Dataset.json'  # <-- replace with your JSON filename if different

# Path to embeddings module
EMBEDDINGS_MODULE = './Embeddings.py'  # embeddings.py should be in the same folder


In [6]:
# Load JSON (robust to different structures)
from collections.abc import Mapping

with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

print('Top-level JSON type:', type(data))

# Heuristic to collect candidate text entries from JSON
texts = []

# If the JSON is a list of strings, use directly
if isinstance(data, list) and all(isinstance(x, str) for x in data):
    texts = data

# If list of dicts, try common keys
elif isinstance(data, list) and all(isinstance(x, dict) for x in data):
    # try keys like 'text', 'sentence', 'caption', 'content', 'title'
    candidate_keys = ['text', 'sentence', 'caption', 'content', 'title', 'body']
    for item in data:
        found = False
        for k in candidate_keys:
            if k in item and isinstance(item[k], str):
                texts.append(item[k])
                found = True
                break
        if not found:
            # fallback: flatten dict values that are strings
            for v in item.values():
                if isinstance(v, str):
                    texts.append(v)
                    break

# If top-level dict, try to find lists inside
elif isinstance(data, dict):
    # If values contain a list of strings or dicts
    for k, v in data.items():
        if isinstance(v, list):
            if all(isinstance(x, str) for x in v):
                texts.extend(v)
            elif all(isinstance(x, dict) for x in v):
                # try to extract 'text'-like fields
                for item in v:
                    for cand in ['text','sentence','caption','content','title','body']:
                        if cand in item and isinstance(item[cand], str):
                            texts.append(item[cand])
                            break
        elif isinstance(v, str):
            texts.append(v)

# Deduplicate and filter empties
texts = [t.strip() for t in texts if isinstance(t, str) and t.strip()]
texts = list(dict.fromkeys(texts))  # preserve order but unique

print(f'Extracted {len(texts)} text entries (after heuristics).')
if len(texts) > 10:
    print('Sample texts:')
    for s in texts[:5]:
        print('-', s[:200])
else:
    pprint(texts)

if len(texts) == 0:
    raise ValueError('No text data found by heuristics. Please inspect the JSON structure and update the notebook code to extract the text field(s).')


Top-level JSON type: <class 'list'>
Extracted 18029 text entries (after heuristics).
Sample texts:
- beside the road here is a polygon pond and four square ponds in a bareland with clusters of trees
- there is a piece of green forest near industrial zone
- dark green trees surround the large green center
- several ripples are in a piece of khaki desert
- in lots of green forest


In [8]:
# Import embedding models from embeddings.py
print('Importing embedding module...')
try:
    import importlib
    embeddings_mod = importlib.import_module(EMBEDDINGS_MODULE)
    embedding_models = getattr(embeddings_mod, 'embedding_models', None)
    if embedding_models is None:
        raise AttributeError('embedding_models dictionary not found in embeddings.py')
    print('Found embedding_models:', list(embedding_models.keys()))
except Exception as e:
    raise ImportError(f'Failed to import embeddings.py or find embedding_models: {e}')

# We'll process a small subset to keep runtime reasonable
MAX_SAMPLES_FOR_TRAINING = 200  # change if you want to run on full dataset
sample_texts = texts[:MAX_SAMPLES_FOR_TRAINING]
print(f'Using {len(sample_texts)} samples for embedding extraction (change MAX_SAMPLES_FOR_TRAINING if needed).')

# Ensure reproducibility
import random
random.seed(77777)

results = {}

for model_name, ModelClass in embedding_models.items():
    print('' + '='*80)
    print('Model:', model_name)
    try:
        # Instantiate model with small embedding_size where possible to speed up
        # For classes that accept embedding_size in constructor, pass a smaller value
        kwargs = {}
        try:
            # Try to create with embedding_size=128 if signature supports it
            instance = ModelClass(embedding_size=128)
        except TypeError:
            try:
                instance = ModelClass()
            except Exception as e:
                # fallback: instantiate without args
                instance = ModelClass
        
        # If the retrieved ModelClass is actually a class type, ensure it's callable
        if callable(instance):
            embedding_matrix = instance(sample_texts)
        else:
            # if instance is the class object itself
            embedding_matrix = ModelClass(sample_texts)

        embedding_matrix = np.array(embedding_matrix)
        print('Embedding shape:', embedding_matrix.shape)
        # store
        results[model_name] = {
            'shape': embedding_matrix.shape,
            'sample': embedding_matrix[:min(5, embedding_matrix.shape[0]), :min(8, embedding_matrix.shape[1])].tolist()
        }
        print('Sample (first rows x first cols):')
        print(embedding_matrix[:min(5, embedding_matrix.shape[0]), :min(8, embedding_matrix.shape[1])])
    except Exception as e:
        print(f'ERROR processing model "{model_name}": {e}')
        import traceback
        traceback.print_exc()
        results[model_name] = {'error': str(e)}

# Summarize results
print('' + '='*80)
print('Summary of results:')
for mn, info in results.items():
    if 'error' in info:
        print(f'- {mn}: FAILED -> {info["error"]}')
    else:
        print(f'- {mn}: shape={info["shape"]}')

# Save summary to a JSON file
with open('embedding_results_summary.json', 'w', encoding='utf-8') as fout:
    json.dump(results, fout, indent=2)
print('Saved summary to embedding_results_summary.json')


Importing embedding module...


ImportError: Failed to import embeddings.py or find embedding_models: the 'package' argument is required to perform a relative import for './Embeddings.py'