# Test GeoPoc docker image

GeoPoc uses ESM-2

### Compute pdb structures

In [8]:
%%bash

# Create every feature sub-directory under one root, then open up permissions
# on the whole tree (presumably so the containers that mount it can write
# there -- confirm).
features_root=/home/ec2-user/SageMaker/GeoPoc/tests/features

for subdir in pdb embedding/temp embedding/pH embedding/salt DSSP; do
  mkdir -p "${features_root}/${subdir}"
done

sudo chmod -R 777 "${features_root}"

In [3]:
!cd ../Proteus && pip install -qr requirements.txt
!cd ../Proteus && pip uninstall -y proteus && poetry build && pip install dist/proteus-0.0.0-py3-none-any.whl

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sphinx 8.1.3 requires docutils<0.22,>=0.20, but you have docutils 0.16 which is incompatible.[0m[31m
[0mBuilding [36mproteus[39m ([39;1m0.0.0[39;22m)
  - Building [34msdist[39m
  - Built [32mproteus-0.0.0.tar.gz[39m
  - Building [34mwheel[39m
  - Built [32mproteus-0.0.0-py3-none-any.whl[39m
Processing ./dist/proteus-0.0.0-py3-none-any.whl
Installing collected packages: proteus
Successfully installed proteus-0.0.0


## Fold sequences

In [18]:
from proteus.fitness_helpers import run_esmfold

# Directory handed to ESMFold as its PDB output location.
# NOTE(review): the setup cell creates tests/features/pdb, but this passes
# tests/features -- confirm whether run_esmfold appends the pdb/ sub-folder itself.
pdb_dir = "/home/ec2-user/SageMaker/GeoPoc/tests/features"

# Collect the call arguments in one place so they are easy to tweak and inspect.
esmfold_kwargs = dict(
    fasta_path="/home/ec2-user/SageMaker/GeoPoc/GeoPoc/example/test.fasta",
    pdb_output_path=pdb_dir,
    cpu_only=False,  # run on GPU
    docker_image="ghcr.io/new-atlantis-labs/esmfold:latest",
)

run_esmfold(**esmfold_kwargs)

## Run GeoPoc: optimal temperature

In [11]:
%%bash

# Run the GeoPoc image on the test FASTA for the optimal-temperature task.
host_root=/home/ec2-user/SageMaker/GeoPoc/tests
container_root=/app/GeoPoc

# Bind mounts: input FASTA folder, prediction output folder, feature folder.
mounts=(
  -v "${host_root}/:${container_root}/input"
  -v "${host_root}/outputs:${container_root}/output"
  -v "${host_root}/features:${container_root}/features"
)

# stderr is merged into stdout so the progress bar shows up in the cell output.
docker run --rm --gpus "all" "${mounts[@]}" \
  ghcr.io/new-atlantis-labs/geopoc:latest \
  -i "${container_root}/input/test.fasta" \
  --feature_path "${container_root}/features/" \
  -o "${container_root}/output/" \
  --task temp \
  --gpu 0 2>&1

100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


## Optimal pH

In [9]:
%%bash

# Run the GeoPoc image on the test FASTA for the optimal-pH task.
host_root=/home/ec2-user/SageMaker/GeoPoc/tests
container_root=/app/GeoPoc

# Bind mounts: input FASTA folder, prediction output folder, feature folder.
mounts=(
  -v "${host_root}/:${container_root}/input"
  -v "${host_root}/outputs:${container_root}/output"
  -v "${host_root}/features:${container_root}/features"
)

# stderr is merged into stdout so the progress bars show up in the cell output.
docker run --rm --gpus "all" "${mounts[@]}" \
  ghcr.io/new-atlantis-labs/geopoc:latest \
  -i "${container_root}/input/test.fasta" \
  --feature_path "${container_root}/features/" \
  -o "${container_root}/output/" \
  --task pH \
  --gpu 0 2>&1

100%|██████████| 1/1 [00:00<00:00, 33.34it/s]
100%|██████████| 1/1 [00:01<00:00,  1.95s/it]


## Optimal Salt Concentration

In [10]:
%%bash

# Run the GeoPoc image on the test FASTA for the optimal-salt-concentration task.
host_root=/home/ec2-user/SageMaker/GeoPoc/tests
container_root=/app/GeoPoc

# Bind mounts: input FASTA folder, prediction output folder, feature folder.
mounts=(
  -v "${host_root}/:${container_root}/input"
  -v "${host_root}/outputs:${container_root}/output"
  -v "${host_root}/features:${container_root}/features"
)

# stderr is merged into stdout so the progress bars show up in the cell output.
docker run --rm --gpus "all" "${mounts[@]}" \
  ghcr.io/new-atlantis-labs/geopoc:latest \
  -i "${container_root}/input/test.fasta" \
  --feature_path "${container_root}/features/" \
  -o "${container_root}/output/" \
  --task salt \
  --gpu 0 2>&1

100%|██████████| 1/1 [00:00<00:00, 97.28it/s]
100%|██████████| 1/1 [00:00<00:00,  1.14it/s]


## Check ESM-2 embeddings

Default: esm2_t36_3B_UR50D, repr_layers 36, per_tok

In [2]:
import torch

# ESM-2 output file for the test protein, stored directly under embedding/.
master_emb_path = "tests/features/embedding/A0A2P5KBM8.pt"

# Load through an explicit file handle; the result is displayed as the cell output.
with open(master_emb_path, "rb") as master_emb_file:
    master_emb = torch.load(master_emb_file)
master_emb

{'label': 'A0A2P5KBM8',
 'representations': {36: tensor([[-0.0092, -0.2994, -0.0446,  ..., -0.1057, -0.0994,  0.0472],
          [ 0.0281, -0.2679,  0.0986,  ..., -0.0202, -0.0519,  0.1983],
          [-0.0282, -0.2964, -0.0660,  ..., -0.0505, -0.1806,  0.0681],
          ...,
          [-0.1459, -0.0200,  0.0344,  ...,  0.0357, -0.1755,  0.0262],
          [-0.1893,  0.0585, -0.0425,  ...,  0.0749, -0.1993,  0.0551],
          [-0.1980,  0.0546, -0.0686,  ..., -0.1147, -0.2503, -0.0083]])}}

In [7]:
# Dimensions of the layer-36 representation tensor.
master_emb["representations"][36].size()

torch.Size([330, 2560])

In [1]:
import torch

# Tensor stored for the same protein under the task-specific embedding/temp folder
# (presumably the normalised form of the raw ESM-2 representation -- confirm).
emb_path = "tests/features/embedding/temp/A0A2P5KBM8.tensor"

# Load through an explicit file handle; the result is displayed as the cell output.
with open(emb_path, "rb") as emb_file:
    embeddings = torch.load(emb_file)
embeddings

tensor([[0.5213, 0.3295, 0.4792,  ..., 0.4221, 0.4717, 0.5725],
        [0.5408, 0.3473, 0.5513,  ..., 0.4725, 0.4949, 0.6559],
        [0.5115, 0.3313, 0.4685,  ..., 0.4546, 0.4320, 0.5840],
        ...,
        [0.4502, 0.4869, 0.5190,  ..., 0.5055, 0.4345, 0.5609],
        [0.4276, 0.5310, 0.4803,  ..., 0.5286, 0.4228, 0.5769],
        [0.4231, 0.5288, 0.4672,  ..., 0.4168, 0.3979, 0.5419]])

In [3]:
# Dimensions of the task-specific embedding tensor.
embeddings.size()

torch.Size([330, 2560])

In [8]:
import torch
from pathlib import Path

# Where the per-task tensors live and which protein / task folders to compare.
base_path = "tests/features/embedding"
protein_id = "A0A2P5KBM8"
dirs = ['temp', 'pH', 'salt']

# Load one tensor per task folder and report its shape and summary statistics.
tensors = {}
embedding_root = Path(base_path)
for task in dirs:
    loaded = torch.load(embedding_root / task / f"{protein_id}.tensor")
    tensors[task] = loaded
    print(f"\n{task} tensor shape:", loaded.shape)
    print(f"{task} stats - mean: {loaded.mean():.6f}, std: {loaded.std():.6f}")

# Every unordered pair of tasks, in the same order a nested loop would visit them.
task_pairs = [
    (first, second)
    for position, first in enumerate(dirs)
    for second in dirs[position + 1:]
]

# Exact element-wise comparison; for differing pairs also report how far apart they are.
for first, second in task_pairs:
    if torch.equal(tensors[first], tensors[second]):
        print(f"\n{first} vs {second} equal:", True)
        continue
    print(f"\n{first} vs {second} equal:", False)
    mean_abs_diff = (tensors[first] - tensors[second]).abs().mean()
    print(f"Mean absolute difference: {mean_abs_diff:.6f}")


temp tensor shape: torch.Size([330, 2560])
temp stats - mean: 0.498779, std: 0.072624

pH tensor shape: torch.Size([330, 2560])
pH stats - mean: 0.499010, std: 0.073043

salt tensor shape: torch.Size([330, 2560])
salt stats - mean: 0.498744, std: 0.073552

temp vs pH equal: False
Mean absolute difference: 0.011842

temp vs salt equal: False
Mean absolute difference: 0.013285

pH vs salt equal: False
Mean absolute difference: 0.012708


## There are different min max values for each task

Each task has its own min/max values, which are used to normalize the raw embeddings to the ranges best suited to training the predictive model, based on the natural ranges of each property/task.

In [9]:
import pickle

# Min/max arrays shipped with GeoPoc's feature-extraction code; the saved output
# shows one `<task>_Min` / `<task>_Max` pair for each of temp, salt and pH.
# NOTE: pickle.load can execute arbitrary code -- only load this file from a
# trusted GeoPoc checkout.
# The context manager closes the file handle, which the bare open() call leaked.
with open("/home/ec2-user/SageMaker/GeoPoc/GeoPoc/feature_extraction/ESM_Min_Max.pkl", 'rb') as min_max_file:
    ESM_MIN_MAX = pickle.load(min_max_file)
ESM_MIN_MAX

{'temp_Min': array([-1.0107065, -0.8848684, -0.9967965, ..., -0.8216355, -1.0642859,
        -0.9903651], dtype=float32),
 'temp_Max': array([0.9102785 , 0.8916208 , 0.99005544, ..., 0.87448555, 0.9812699 ,
        0.8220415 ], dtype=float32),
 'salt_Min': array([-1.0437392 , -0.8454495 , -0.95653987, ..., -0.8225851 ,
        -1.0623534 , -0.9820347 ], dtype=float32),
 'salt_Max': array([0.87281096, 0.8617154 , 0.8983912 , ..., 0.8534491 , 0.9217443 ,
        0.7613497 ], dtype=float32),
 'pH_Min': array([-1.0453855 , -0.9161436 , -0.96290076, ..., -0.8216355 ,
        -1.0618458 , -0.9613065 ], dtype=float32),
 'pH_Max': array([0.8911413 , 0.8535615 , 0.99005544, ..., 0.8374581 , 0.9812699 ,
        0.7995495 ], dtype=float32)}