# Test GeoPoc docker image

GeoPoc uses ESM-2

### Compute PDB structures

In [8]:
%%bash

# Create the feature sub-directories under tests/features, then open up the
# permissions on the whole tree (presumably so the Docker containers can write
# into it — TODO confirm).
FEATURES_DIR="/home/ec2-user/SageMaker/GeoPoc/tests/features"

for subdir in pdb embedding/temp embedding/pH embedding/salt DSSP; do
  mkdir -p "${FEATURES_DIR}/${subdir}"
done

sudo chmod -R 777 "${FEATURES_DIR}"

In [3]:
# Quietly install the requirements listed in the sibling ../Proteus checkout.
!cd ../Proteus && pip install -qr requirements.txt
# Remove any installed proteus, rebuild the package with Poetry, and install the fresh wheel.
# The steps are chained with `&&`, so each one runs only if the previous one succeeded.
# NOTE(review): the wheel filename hard-codes version 0.0.0 — update it if the package version changes.
!cd ../Proteus && pip uninstall -y proteus && poetry build && pip install dist/proteus-0.0.0-py3-none-any.whl

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sphinx 8.1.3 requires docutils<0.22,>=0.20, but you have docutils 0.16 which is incompatible.[0m[31m
[0mBuilding [36mproteus[39m ([39;1m0.0.0[39;22m)
  - Building [34msdist[39m
  - Built [32mproteus-0.0.0.tar.gz[39m
  - Building [34mwheel[39m
  - Built [32mproteus-0.0.0-py3-none-any.whl[39m
Processing ./dist/proteus-0.0.0-py3-none-any.whl
Installing collected packages: proteus
Successfully installed proteus-0.0.0


## Fold sequences

In [18]:
from proteus.fitness_helpers import run_esmfold

# Fold the example FASTA through proteus' run_esmfold helper, using the ESMFold Docker image.
GEOPOC_ROOT = "/home/ec2-user/SageMaker/GeoPoc"

# NOTE(review): this is the features directory itself, not the features/pdb
# sub-directory created earlier — confirm where run_esmfold writes its PDB files.
pdb_dir = f"{GEOPOC_ROOT}/tests/features"

esmfold_options = dict(
    fasta_path=f"{GEOPOC_ROOT}/GeoPoc/example/test.fasta",
    pdb_output_path=pdb_dir,
    cpu_only=False,
    docker_image="ghcr.io/new-atlantis-labs/esmfold:latest",
)
run_esmfold(**esmfold_options)

## Run GeoPoc: optimal temperature

In [11]:
%%bash

# Run the GeoPoc image on tests/test.fasta with --task temp.
# Host directories are bind-mounted into the container's /app/GeoPoc tree.
HOST_TESTS="/home/ec2-user/SageMaker/GeoPoc/tests"
APP_ROOT="/app/GeoPoc"

mounts=(
  -v "${HOST_TESTS}/:${APP_ROOT}/input"
  -v "${HOST_TESTS}/outputs:${APP_ROOT}/output"
  -v "${HOST_TESTS}/features:${APP_ROOT}/features"
)
geopoc_args=(
  -i "${APP_ROOT}/input/test.fasta"
  --feature_path "${APP_ROOT}/features/"
  -o "${APP_ROOT}/output/"
  --task temp
  --gpu 0
)

# 2>&1 merges the container's stderr into stdout.
docker run --rm --gpus all "${mounts[@]}" ghcr.io/new-atlantis-labs/geopoc:latest "${geopoc_args[@]}" 2>&1

100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


## Optimal pH

In [9]:
%%bash

# Run the GeoPoc image on tests/test.fasta with --task pH.
# Host directories are bind-mounted into the container's /app/GeoPoc tree.
HOST_TESTS="/home/ec2-user/SageMaker/GeoPoc/tests"
APP_ROOT="/app/GeoPoc"

mounts=(
  -v "${HOST_TESTS}/:${APP_ROOT}/input"
  -v "${HOST_TESTS}/outputs:${APP_ROOT}/output"
  -v "${HOST_TESTS}/features:${APP_ROOT}/features"
)
geopoc_args=(
  -i "${APP_ROOT}/input/test.fasta"
  --feature_path "${APP_ROOT}/features/"
  -o "${APP_ROOT}/output/"
  --task pH
  --gpu 0
)

# 2>&1 merges the container's stderr into stdout.
docker run --rm --gpus all "${mounts[@]}" ghcr.io/new-atlantis-labs/geopoc:latest "${geopoc_args[@]}" 2>&1

100%|██████████| 1/1 [00:00<00:00, 33.34it/s]
100%|██████████| 1/1 [00:01<00:00,  1.95s/it]


## Optimal Salt Concentration

In [10]:
%%bash

# Run the GeoPoc image on tests/test.fasta with --task salt.
# Host directories are bind-mounted into the container's /app/GeoPoc tree.
HOST_TESTS="/home/ec2-user/SageMaker/GeoPoc/tests"
APP_ROOT="/app/GeoPoc"

mounts=(
  -v "${HOST_TESTS}/:${APP_ROOT}/input"
  -v "${HOST_TESTS}/outputs:${APP_ROOT}/output"
  -v "${HOST_TESTS}/features:${APP_ROOT}/features"
)
geopoc_args=(
  -i "${APP_ROOT}/input/test.fasta"
  --feature_path "${APP_ROOT}/features/"
  -o "${APP_ROOT}/output/"
  --task salt
  --gpu 0
)

# 2>&1 merges the container's stderr into stdout.
docker run --rm --gpus all "${mounts[@]}" ghcr.io/new-atlantis-labs/geopoc:latest "${geopoc_args[@]}" 2>&1

100%|██████████| 1/1 [00:00<00:00, 97.28it/s]
100%|██████████| 1/1 [00:00<00:00,  1.14it/s]


## Check ESM-2 embeddings

Default settings: model `esm2_t36_3B_UR50D`, `repr_layers` 36, per-token (`per_tok`) representations.

In [2]:
import torch

# Load the ESM-2 embedding file stored directly under features/embedding and display it.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
master_emb_path = "tests/features/embedding/A0A2P5KBM8.pt"
with open(master_emb_path, "rb") as emb_file:
    master_emb = torch.load(emb_file)
master_emb

{'label': 'A0A2P5KBM8',
 'representations': {36: tensor([[-0.0092, -0.2994, -0.0446,  ..., -0.1057, -0.0994,  0.0472],
          [ 0.0281, -0.2679,  0.0986,  ..., -0.0202, -0.0519,  0.1983],
          [-0.0282, -0.2964, -0.0660,  ..., -0.0505, -0.1806,  0.0681],
          ...,
          [-0.1459, -0.0200,  0.0344,  ...,  0.0357, -0.1755,  0.0262],
          [-0.1893,  0.0585, -0.0425,  ...,  0.0749, -0.1993,  0.0551],
          [-0.1980,  0.0546, -0.0686,  ..., -0.1147, -0.2503, -0.0083]])}}

In [7]:
# Shape of the tensor stored under layer key 36 of the loaded embedding dict.
master_emb["representations"][36].shape

torch.Size([330, 2560])

In [1]:
import torch

# Load the tensor saved for the same protein under the task-specific embedding/temp
# directory and display it.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
emb_path = "tests/features/embedding/temp/A0A2P5KBM8.tensor"
embeddings = torch.load(f=emb_path)
embeddings

tensor([[0.5213, 0.3295, 0.4792,  ..., 0.4221, 0.4717, 0.5725],
        [0.5408, 0.3473, 0.5513,  ..., 0.4725, 0.4949, 0.6559],
        [0.5115, 0.3313, 0.4685,  ..., 0.4546, 0.4320, 0.5840],
        ...,
        [0.4502, 0.4869, 0.5190,  ..., 0.5055, 0.4345, 0.5609],
        [0.4276, 0.5310, 0.4803,  ..., 0.5286, 0.4228, 0.5769],
        [0.4231, 0.5288, 0.4672,  ..., 0.4168, 0.3979, 0.5419]])

In [3]:
# Shape of the task-specific (temp) tensor loaded in the previous cell.
embeddings.shape

torch.Size([330, 2560])

In [8]:
import torch
from pathlib import Path

# Location of the task-specific tensors and the protein to inspect.
base_path = "tests/features/embedding"
protein_id = "A0A2P5KBM8"
dirs = ['temp', 'pH', 'salt']

# Load one tensor per task directory and report its shape and summary statistics.
tensors = {}
for task in dirs:
    tensor_file = Path(base_path, task, f"{protein_id}.tensor")
    loaded = torch.load(tensor_file)
    tensors[task] = loaded
    print(f"\n{task} tensor shape:", loaded.shape)
    print(f"{task} stats - mean: {loaded.mean():.6f}, std: {loaded.std():.6f}")

# Compare every unordered pair of tasks exactly once.
for first_idx, first in enumerate(dirs):
    for second in dirs[first_idx + 1:]:
        identical = torch.equal(tensors[first], tensors[second])
        print(f"\n{first} vs {second} equal:", identical)
        if identical:
            continue
        # Only report the size of the difference when the tensors are not bit-identical.
        mean_abs_diff = (tensors[first] - tensors[second]).abs().mean()
        print(f"Mean absolute difference: {mean_abs_diff:.6f}")


temp tensor shape: torch.Size([330, 2560])
temp stats - mean: 0.498779, std: 0.072624

pH tensor shape: torch.Size([330, 2560])
pH stats - mean: 0.499010, std: 0.073043

salt tensor shape: torch.Size([330, 2560])
salt stats - mean: 0.498744, std: 0.073552

temp vs pH equal: False
Mean absolute difference: 0.011842

temp vs salt equal: False
Mean absolute difference: 0.013285

pH vs salt equal: False
Mean absolute difference: 0.012708


## Each task has its own min/max values

The raw embeddings are normalized with task-specific min/max values, so that they fall in the range best suited to training the predictive model, based on the natural ranges of each property/task.

In [9]:
import pickle

# Load the dict of per-task min/max arrays shipped with GeoPoc's feature extraction
# code and display it.
# The file is opened in a `with` block so the handle is closed as soon as loading
# finishes, instead of leaking until the kernel garbage-collects it.
# NOTE: pickle.load can execute arbitrary code; only unpickle files from a trusted source.
ESM_MIN_MAX_PATH = "/home/ec2-user/SageMaker/GeoPoc/GeoPoc/feature_extraction/ESM_Min_Max.pkl"
with open(ESM_MIN_MAX_PATH, "rb") as min_max_file:
    ESM_MIN_MAX = pickle.load(min_max_file)
ESM_MIN_MAX

{'temp_Min': array([-1.0107065, -0.8848684, -0.9967965, ..., -0.8216355, -1.0642859,
        -0.9903651], dtype=float32),
 'temp_Max': array([0.9102785 , 0.8916208 , 0.99005544, ..., 0.87448555, 0.9812699 ,
        0.8220415 ], dtype=float32),
 'salt_Min': array([-1.0437392 , -0.8454495 , -0.95653987, ..., -0.8225851 ,
        -1.0623534 , -0.9820347 ], dtype=float32),
 'salt_Max': array([0.87281096, 0.8617154 , 0.8983912 , ..., 0.8534491 , 0.9217443 ,
        0.7613497 ], dtype=float32),
 'pH_Min': array([-1.0453855 , -0.9161436 , -0.96290076, ..., -0.8216355 ,
        -1.0618458 , -0.9613065 ], dtype=float32),
 'pH_Max': array([0.8911413 , 0.8535615 , 0.99005544, ..., 0.8374581 , 0.9812699 ,
        0.7995495 ], dtype=float32)}

## Run GeoPoc with input structures and embeddings

In [3]:
%%bash

# Stop at the first failing command (and on unset variables). Without this, %%bash
# only reports the status of the last command, so a failed copy earlier in the
# cell would go unnoticed and the later GeoPoc runs would see incomplete inputs.
set -euo pipefail

TEST_DIR="/home/ec2-user/SageMaker/GeoPoc/test_emb"
FEATURES_DIR="${TEST_DIR}/features"
DATA_ROOT="/home/ec2-user/SageMaker/mangrove-plastic-degrading"

# Feature directory layout for the run.
mkdir -p "${FEATURES_DIR}/pdb"
mkdir -p "${FEATURES_DIR}/embedding/temp"
mkdir -p "${FEATURES_DIR}/embedding/pH"
mkdir -p "${FEATURES_DIR}/embedding/salt"
mkdir -p "${FEATURES_DIR}/DSSP"
# Presumably opened up so the container can write into the tree — TODO confirm.
sudo chmod -R 777 "${FEATURES_DIR}"

# Input sequences.
cp "${DATA_ROOT}/data/protein_sequences_plastic_degrading_clean/Impranil.faa" "${TEST_DIR}/"

# Pre-computed embeddings and structures for the same sequence set.
cp "${DATA_ROOT}/outputs/embeddings/Impranil/"* "${FEATURES_DIR}/embedding/"

cp "${DATA_ROOT}/outputs/structures/Impranil/"* "${FEATURES_DIR}/pdb/"

In [1]:
%%bash

# Run the GeoPoc image on test_emb/Impranil.faa with --task temp, using the
# features staged under test_emb/features and the model directory from the GeoPoc checkout.
HOST_DIR="/home/ec2-user/SageMaker/GeoPoc/test_emb"
APP_ROOT="/app/GeoPoc"

mounts=(
  -v "${HOST_DIR}/:${APP_ROOT}/input"
  -v "${HOST_DIR}/outputs_emb:${APP_ROOT}/output"
  -v "${HOST_DIR}/features:${APP_ROOT}/features"
  -v "/home/ec2-user/SageMaker/GeoPoc/GeoPoc/model:${APP_ROOT}/models"
)
geopoc_args=(
  -i "${APP_ROOT}/input/Impranil.faa"
  --feature_path "${APP_ROOT}/features/"
  -o "${APP_ROOT}/output/"
  --task temp
  --model_path "${APP_ROOT}/models/"
  --gpu 0
)

# 2>&1 merges the container's stderr into stdout.
docker run --rm --gpus all "${mounts[@]}" ghcr.io/new-atlantis-labs/geopoc:latest "${geopoc_args[@]}" 2>&1

100%|██████████| 11/11 [00:55<00:00,  5.06s/it]


In [7]:
%%bash

# Run the GeoPoc image on test_emb/Impranil.faa with --task salt, using the
# features staged under test_emb/features and the model directory from the GeoPoc checkout.
HOST_DIR="/home/ec2-user/SageMaker/GeoPoc/test_emb"
APP_ROOT="/app/GeoPoc"

mounts=(
  -v "${HOST_DIR}/:${APP_ROOT}/input"
  -v "${HOST_DIR}/outputs_emb:${APP_ROOT}/output"
  -v "${HOST_DIR}/features:${APP_ROOT}/features"
  -v "/home/ec2-user/SageMaker/GeoPoc/GeoPoc/model:${APP_ROOT}/models"
)
geopoc_args=(
  -i "${APP_ROOT}/input/Impranil.faa"
  --feature_path "${APP_ROOT}/features/"
  -o "${APP_ROOT}/output/"
  --task salt
  --model_path "${APP_ROOT}/models/"
  --gpu 0
)

# 2>&1 merges the container's stderr into stdout.
docker run --rm --gpus all "${mounts[@]}" ghcr.io/new-atlantis-labs/geopoc:latest "${geopoc_args[@]}" 2>&1

100%|██████████| 11/11 [00:00<00:00, 155.16it/s]
100%|██████████| 11/11 [00:02<00:00,  5.42it/s]


In [3]:
%%bash

# Run the GeoPoc image on test_emb/Impranil.faa with --task pH, using the
# features staged under test_emb/features and the model directory from the GeoPoc checkout.
HOST_DIR="/home/ec2-user/SageMaker/GeoPoc/test_emb"
APP_ROOT="/app/GeoPoc"

mounts=(
  -v "${HOST_DIR}/:${APP_ROOT}/input"
  -v "${HOST_DIR}/outputs_emb:${APP_ROOT}/output"
  -v "${HOST_DIR}/features:${APP_ROOT}/features"
  -v "/home/ec2-user/SageMaker/GeoPoc/GeoPoc/model:${APP_ROOT}/models"
)
geopoc_args=(
  -i "${APP_ROOT}/input/Impranil.faa"
  --feature_path "${APP_ROOT}/features/"
  -o "${APP_ROOT}/output/"
  --task pH
  --model_path "${APP_ROOT}/models/"
  --gpu 0
)

# 2>&1 merges the container's stderr into stdout.
docker run --rm --gpus all "${mounts[@]}" ghcr.io/new-atlantis-labs/geopoc:latest "${geopoc_args[@]}" 2>&1

100%|██████████| 11/11 [00:01<00:00,  8.54it/s]
100%|██████████| 11/11 [00:01<00:00,  6.00it/s]


In [6]:
%%bash

# Clone test_emb into test_emb_mini. Copying the directory's contents (`test_emb/.`)
# into an explicitly created target keeps the cell re-runnable: a plain
# `cp -r test_emb test_emb_mini` nests a second copy at test_emb_mini/test_emb
# once the target already exists.
mkdir -p test_emb_mini
cp -r test_emb/. test_emb_mini/

In [8]:
%%bash

# Same --task temp run against the test_emb_mini copy, with the host model
# directory additionally mounted over the container's torch hub checkpoint cache.
MODELDIR="/home/ec2-user/SageMaker/models"
HOST_DIR="/home/ec2-user/SageMaker/GeoPoc/test_emb_mini"
APP_ROOT="/app/GeoPoc"

mounts=(
  -v "${HOST_DIR}/:${APP_ROOT}/input"
  -v "${HOST_DIR}/outputs_emb:${APP_ROOT}/output"
  -v "${HOST_DIR}/features:${APP_ROOT}/features"
  -v "/home/ec2-user/SageMaker/GeoPoc/GeoPoc/model:${APP_ROOT}/models"
  -v "${MODELDIR}:/root/.cache/torch/hub/checkpoints"
)
geopoc_args=(
  -i "${APP_ROOT}/input/Impranil.faa"
  --feature_path "${APP_ROOT}/features/"
  -o "${APP_ROOT}/output/"
  --task temp
  --model_path "${APP_ROOT}/models/"
  --gpu 0
)

# 2>&1 merges the container's stderr into stdout.
docker run --rm --gpus all "${mounts[@]}" ghcr.io/new-atlantis-labs/geopoc:latest "${geopoc_args[@]}" 2>&1

25/03/05 17:42:02 | INFO | root | Reading sequences from /app/GeoPoc/input/Impranil.faa
25/03/05 17:42:02 | INFO | root | Loaded 11 sequences from /app/GeoPoc/input/Impranil.faa
25/03/05 17:42:02 | INFO | root | Loading model
Error while terminating subprocess (pid=8447): 


TypeError: %d format: a real number is required, not NoneType

In [3]:
%%bash

# Base directories
WORKDIR="/home/ec2-user/SageMaker/mangrove-plastic-degrading"
BASE_DIR="/home/ec2-user/SageMaker/GeoPoc/test_new_image"
SEQUENCES_DIR="${WORKDIR}/tests/data"
STRUCTURES_DIR="${WORKDIR}/tests/outputs/structures"
EMBEDDINGS_DIR="${WORKDIR}/tests/outputs/embeddings"
MODELDIR="/home/ec2-user/SageMaker/models"
GEOPOC_MODEL_PATH="${MODELDIR}/geopoc"

# Input specifics
BASE_NAME="PET"
TASK="temp"
FILENAME="${BASE_NAME}.faa"
FASTA_STRUCTURES_DIR="${STRUCTURES_DIR}/${BASE_NAME}"
FASTA_EMBEDDINGS_DIR="${EMBEDDINGS_DIR}/${BASE_NAME}"

# Create test directory structure
mkdir -p "${BASE_DIR}/input"
mkdir -p "${BASE_DIR}/output"
mkdir -p "${BASE_DIR}/features/pdb"
mkdir -p "${BASE_DIR}/features/embedding"
mkdir -p "${BASE_DIR}/features/embedding/${TASK}"
mkdir -p "${BASE_DIR}/features/DSSP"

# Copy FASTA file
cp "${SEQUENCES_DIR}/${FILENAME}" "${BASE_DIR}/input/"

# Copy structure files to the features/pdb directory.
# The directory variable is quoted and only the glob is left unquoted, so a path
# containing spaces is not word-split. The copies stay best-effort: a missing
# file type only prints a notice.
echo "Copying PDB structures..."
cp "${FASTA_STRUCTURES_DIR}"/*.pdb "${BASE_DIR}/features/pdb/" 2>/dev/null || echo "No PDB files found"
cp "${FASTA_STRUCTURES_DIR}"/*.tensor "${BASE_DIR}/features/pdb/" 2>/dev/null || echo "No tensor files found in PDB dir"

# Copy embedding files to the features/embedding directory
# (.pt files at the top level, .tensor files into the task sub-directory).
echo "Copying embedding files..."
cp "${FASTA_EMBEDDINGS_DIR}"/*.pt "${BASE_DIR}/features/embedding/" 2>/dev/null || echo "No PT files found"
cp "${FASTA_EMBEDDINGS_DIR}"/*.tensor "${BASE_DIR}/features/embedding/${TASK}/" 2>/dev/null || echo "No tensor files found in embedding dir"

# Run the old Docker image
echo "Running old GeoPoc Docker image..."
docker run --rm --gpus "all" \
  -v "${BASE_DIR}/input:/app/GeoPoc/input" \
  -v "${BASE_DIR}/output:/app/GeoPoc/output" \
  -v "${BASE_DIR}/features:/app/GeoPoc/features" \
  -v "${GEOPOC_MODEL_PATH}:/app/GeoPoc/models" \
  -v "${MODELDIR}:/root/.cache/torch/hub/checkpoints" \
  ghcr.io/new-atlantis-labs/geopoc:latest \
  -i "/app/GeoPoc/input/${FILENAME}" \
  --feature_path "/app/GeoPoc/features/" \
  -o "/app/GeoPoc/output/" \
  --task "${TASK}" \
  --model_path "/app/GeoPoc/models/" \
  --gpu 0

Copying PDB structures...
No tensor files found in PDB dir
Copying embedding files...
No tensor files found in embedding dir
Running old GeoPoc Docker image...




25/03/06 18:14:57 | INFO | root | Reading sequences from /app/GeoPoc/input/PET.faa
25/03/06 18:14:57 | INFO | root | Loaded 2 sequences from /app/GeoPoc/input/PET.faa
25/03/06 18:14:57 | INFO | root | Loading model
Error while terminating subprocess (pid=8684): 


TypeError: %d format: a real number is required, not NoneType

## Test new image

The new image doesn't require a features input directory.

In [11]:
import torch

# Load one of the .tensor files stored next to the PDB structures and show its shape.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
with open("test_emb/features/pdb/ERR12814712__k127_2301791_6.tensor", "rb") as tensor_file:
    T = torch.load(tensor_file)
T.shape

torch.Size([365, 5, 3])

In [12]:
# NOTE(review): this cell ran (In [12]) before the cell below that assigns `raw_esm`
# (In [13]), and its saved output shows layer key 33 while that later load shows
# layer key 36 — so `raw_esm` here came from kernel state that this notebook does
# not show. As far as the visible cells go, a fresh Restart & Run All raises
# NameError on this line.
raw_esm['representations']

{33: tensor([[-0.0386,  0.1496, -0.4162,  ...,  0.1260,  0.0268,  0.3619],
         [-0.0277,  0.1537, -0.4543,  ..., -0.1465,  0.2371,  0.0474],
         [ 0.0891,  0.1944, -0.2847,  ...,  0.0199, -0.0115, -0.2145],
         ...,
         [ 0.1561,  0.3619,  0.0113,  ...,  0.0762, -0.0765,  0.0232],
         [ 0.1476,  0.3353,  0.2289,  ...,  0.0300,  0.2180,  0.2163],
         [ 0.1009,  0.2275,  0.0245,  ..., -0.0682,  0.0600,  0.0505]])}

In [13]:
# Load one .pt embedding file from test_emb and display its `representations`
# dict (keyed by layer index).
# NOTE: torch.load unpickles the file; only load files from a trusted source.
esm_pt_path = "test_emb/features/embedding/ERR12736120__k127_3120039_3.pt"
raw_esm = torch.load(esm_pt_path)
raw_esm["representations"]

{36: tensor([[ 0.0169, -0.2708, -0.2502,  ...,  0.1520,  0.0411, -0.1204],
         [ 0.2070, -0.2094, -0.0072,  ..., -0.0141, -0.0665,  0.0525],
         [ 0.1188, -0.1721, -0.1340,  ..., -0.0222, -0.0210, -0.0084],
         ...,
         [ 0.0873,  0.0756, -0.0562,  ..., -0.1091, -0.0509,  0.0939],
         [ 0.0281,  0.0084,  0.0119,  ..., -0.0540, -0.1103,  0.0715],
         [-0.0409,  0.0933, -0.1379,  ..., -0.1378, -0.1839, -0.0161]])}

In [1]:
import torch

# Load one PET embedding file and keep only the layer-36 representation as a NumPy array.
# The loaded dict gets its own name so that `raw_esm` is only ever bound to the
# array, rather than being a dict first and an ndarray afterwards.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
esm_record = torch.load("../mangrove-plastic-degrading/tests/outputs/embeddings/PET/ERR12771227__k127_1462196_1.pt")
raw_esm = esm_record['representations'][36].numpy()

In [12]:
%%bash

# Host-side locations of the inputs, pre-computed features, models and outputs.
WORKDIR="/home/ec2-user/SageMaker/mangrove-plastic-degrading"
SEQUENCES_DIR="${WORKDIR}/tests/data"
OUTPUT_DIR="${WORKDIR}/tests/outputs"
MODELDIR="/home/ec2-user/SageMaker/models"
STRUCTURES_DIR="${OUTPUT_DIR}/structures"
EMBEDDINGS_DIR="${OUTPUT_DIR}/embeddings"
GEOPOC_MODEL_PATH="${MODELDIR}/geopoc"
GEOPOC_OUTPUT="${OUTPUT_DIR}/geopoc"

# Which FASTA file and which prediction task to run.
BASE_NAME="PET"
TASK="temp"
FILENAME="${BASE_NAME}.faa"

FASTA_STRUCTURES_DIR="${STRUCTURES_DIR}/${BASE_NAME}"
FASTA_EMBEDDINGS_DIR="${EMBEDDINGS_DIR}/${BASE_NAME}"

# Run the locally tagged geopoc:new image. The structure and embedding directories
# are passed directly via --pdb_dir / --embedding_dir instead of a single feature
# directory. Every expansion is quoted so a path containing spaces is not word-split.
# 2>&1 merges the container's stderr into stdout.
docker run --rm --gpus "all" \
  -v "${SEQUENCES_DIR}:/input" \
  -v "${GEOPOC_OUTPUT}:/output" \
  -v "${FASTA_STRUCTURES_DIR}:/pdb_base" \
  -v "${FASTA_EMBEDDINGS_DIR}:/embeddings_base" \
  -v "${GEOPOC_MODEL_PATH}:/models" \
  -v "${MODELDIR}:/root/.cache/torch/hub/checkpoints" \
  geopoc:new \
  -i "/input/${FILENAME}" \
  -o /output/ \
  --model_path /models/ \
  --pdb_dir /pdb_base/ \
  --embedding_dir /embeddings_base/ \
  --task "${TASK}" \
  --gpu 0 2>&1

Reading FASTA file: /input/PET.faa
Found 2 protein sequences
Created temporary directory: /tmp/geopoc_features_khsdp76f
Using PDB files from: /pdb_base/
  - Linked/copied 2 PDB files and 0 tensor files
Converting 2 PDB files to tensor format...
Extracted coordinates for 356 residues with shape (356, 5, 3)
Saved tensor with shape torch.Size([356, 5, 3]) for ERR12772908__k127_1041917_29.pdb
Extracted coordinates for 359 residues with shape (359, 5, 3)
Saved tensor with shape torch.Size([359, 5, 3]) for ERR12771227__k127_1462196_1.pdb
Successfully converted 2 PDB files to tensor format.
Preprocessed 2 PDB files to tensors
  - 0 out of 2 proteins will need structure prediction
Using ESM embeddings from: /embeddings_base/
  - Linked/copied 2 embedding files
  - 0 out of 2 proteins will need embedding generation
No normalized embedding directory provided - temp-specific embeddings will be generated
No DSSP directory provided - secondary structure features will be calculated
Running: python /