#### Create environment with Geometricus 0.5.0
```
conda create -n geometricus python=3.9 tqdm
conda activate geometricus
pip install git+https://github.com/TurtleTools/geometricus.git
pip install papermill
```

#### Notes:
- Due to the large output, we recommend running this notebook using:  
`papermill embeddings.ipynb embeddings_out.ipynb`  
and deleting `embeddings_out.ipynb` afterward
- There are two examples: small dataset (ProtGPT2) with 10,000 structures and large dataset (BFVD) with 351,242 structures
- The second example processes the structures in batches to keep RAM usage manageable
- Please adjust the code to generate embeddings for your own dataset (only the path to a directory with PDB structures is required)

In [None]:
import geometricus
import numpy as np
import tqdm

from pathlib import Path
from geometricus import get_invariants_for_structures, Geometricus, ShapemerLearn

In [None]:
# Fail fast if the installed geometricus is not 0.5.0, the version named in the setup notes.
assert geometricus.__version__ == '0.5.0'

In [None]:
# Placeholder locations: replace them with real paths before running the notebook.
# Directory of ProtGPT2 structures, passed as a whole to get_invariants_for_structures.
PROTGPT2_PATH = Path('/path/to/protgpt2/structures')
# Directory of BFVD structures; its entries are globbed and processed in batches.
BFVD_PATH = Path('/path/to/bfvd/structures')
# Directory that receives every output file written by this notebook.
# NOTE(review): the ProtGPT2 cells open files in it without creating it first,
# so it presumably has to exist already; confirm before running.
OUT_PATH = Path('/path/to/outputs')

# Also scales the BFVD batch size, which is N_THREADS * 100 paths per batch.
N_THREADS = 32  # number of CPU threads

# ProtGPT2

## Compute moments and invariants

In [None]:
# Compute invariants for the structures under PROTGPT2_PATH, using N_THREADS threads.
# The call returns a pair; only learned_invariants is consumed by the cells that follow.
# NOTE(review): learned_errors presumably lists structures that could not be processed;
# confirm against the geometricus documentation and inspect it before trusting the output.
learned_invariants, learned_errors = get_invariants_for_structures(PROTGPT2_PATH, n_threads=N_THREADS)

In [None]:
# Load the ShapemerLearn model with its default arguments; it is handed to
# Geometricus.from_invariants when the embeddings are built.
model = ShapemerLearn.load()

## Get count matrix (embeddings)

In [None]:
# Build the Geometricus object from the ProtGPT2 invariants and the loaded model.
# Its protein_keys, shapemer_keys and count matrix are what gets saved afterwards.
shapemer_class = Geometricus.from_invariants(learned_invariants, model=model)

## Objects to save

In [None]:
# This is the first write into OUT_PATH, so make sure the directory exists;
# without this, open() fails with FileNotFoundError on a fresh output location.
OUT_PATH.mkdir(parents=True, exist_ok=True)

# Save the protein keys, one per line, to protgpt2_indices.txt.
# NOTE(review): presumably these label the rows of the count matrix; confirm.
with open(OUT_PATH / 'protgpt2_indices.txt', 'w') as f:
    f.writelines(f"{el}\n" for el in shapemer_class.protein_keys)

In [None]:
# Save the shapemer keys, one per line, to protgpt2_keys.txt.
# NOTE(review): presumably these label the columns of the count matrix; confirm.
protgpt2_keys_file = OUT_PATH / 'protgpt2_keys.txt'
with open(protgpt2_keys_file, 'w') as f:
    f.writelines(f"{el}\n" for el in shapemer_class.shapemer_keys)

In [None]:
# Fetch the count matrix (the embeddings) and store it compactly as uint16.
count_matrix = shapemer_class.get_count_matrix()

# astype() wraps around silently on overflow, so refuse to save counts that
# do not fit into uint16 instead of writing corrupted values.
if count_matrix.max() > np.iinfo(np.uint16).max:
    raise ValueError(
        f"count matrix maximum {count_matrix.max()} does not fit into uint16"
    )

basis = count_matrix.astype(np.uint16)
np.savez_compressed(OUT_PATH / 'protgpt2_shapemers.npz', matrix=basis)

# BFVD

#### Note: we use batching, since the number of structures is large

## Compute embeddings & save results

In [None]:
# Load the ShapemerLearn model once, before the batch loop, so that every
# batch is embedded with the same model instance.
model = ShapemerLearn.load()

In [None]:
def batch_files(path, batch_size):
    """Yield the entries of a directory in lists of at most ``batch_size``.

    Args:
        path: A ``pathlib.Path`` directory; every entry matched by
            ``path.glob("*")`` is included, in the order glob returns it.
        batch_size: Number of entries per batch. Every batch except possibly
            the last one contains exactly this many entries.

    Yields:
        Non-empty lists of ``Path`` objects. Nothing is yielded for an empty
        directory, and no empty trailing batch is produced when the number of
        entries is an exact multiple of ``batch_size``.
    """
    batch = []
    for entry in path.glob("*"):
        batch.append(entry)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # Emit the remainder only if there is one; an empty batch would send an
    # empty structure list downstream.
    if batch:
        yield batch

In [None]:
# All per-batch BFVD outputs go into a sub-directory of OUT_PATH named after
# the input directory.
bfvd_out_dir = OUT_PATH / BFVD_PATH.name
bfvd_out_dir.mkdir(parents=True, exist_ok=True)

# Largest count that survives the uint16 cast below without wrapping around.
uint16_max = np.iinfo(np.uint16).max

# The batches are materialised into a list so that tqdm can report progress
# against a known total.
for enum, batch in enumerate(tqdm.tqdm(list(batch_files(BFVD_PATH, N_THREADS * 100)))):
    if not batch:
        # Guard against an empty batch (e.g. an empty input directory):
        # there is nothing to embed or save for it.
        continue

    learned_invariants, learned_errors = get_invariants_for_structures(batch, n_threads=N_THREADS, verbose=False)
    # Get count matrix (embeddings)
    shapemer_class = Geometricus.from_invariants(learned_invariants, model=model)

    # Save the keys of this batch, one per line; files are prefixed with the batch number.
    with open(bfvd_out_dir / f'{enum}_indices.txt', 'w') as f:
        f.writelines(f"{el}\n" for el in shapemer_class.protein_keys)
    with open(bfvd_out_dir / f'{enum}_keys.txt', 'w') as f:
        f.writelines(f"{el}\n" for el in shapemer_class.shapemer_keys)

    count_matrix = shapemer_class.get_count_matrix()
    # astype() wraps around silently on overflow. Use an explicit exception
    # rather than an assert so the check still runs under `python -O`.
    if count_matrix.max() > uint16_max:
        raise ValueError(
            f"batch {enum}: count matrix maximum {count_matrix.max()} does not fit into uint16"
        )
    basis = count_matrix.astype(np.uint16)
    np.savez_compressed(bfvd_out_dir / f'{enum}_shapemers.npz', matrix=basis)