In [1]:
from transformers import AutoTokenizer, EsmForMaskedLM

model_name = "facebook/esm2_t33_650M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EsmForMaskedLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at facebook/esm2_t33_650M_UR50D were not used when initializing EsmForMaskedLM: ['esm.embeddings.position_embeddings.weight']
- This IS expected if you are initializing EsmForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import torch
import csv
import pandas as pd

In [4]:

##### DOUBLE CHECKS THIS IS OK AND THAT YOU UNDERSTAND IT
def get_mean_embedding(sequence):
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]
        attention_mask = inputs["attention_mask"]
        embedding = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1).unsqueeze(-1)
        return embedding.squeeze(0).numpy()

In [None]:


# USER DEFINED PARAMS (0 INDEXED)
iterations = 11 # Should be >= 1
startChunk = 189 # This will be total_rows/chunksize
chunkSize = 10000

# PROGRAM
# Read out the csv in chunks
reader = pd.read_csv(
        "GENOMESDB/bacteriaDB/bacteriaRandSub.csv",
        usecols=[2],
        names=["seq"],
        chunksize=chunkSize, 
        iterator=True)

# Skip until start chunk
for _ in range(startChunk):
    try:
        next(reader)
    except StopIteration:
        raise RuntimeError(f"File has fewer than {startChunk} chunks.")

# Unless first run open in append mode to not remove old data
mode = "a" if startChunk > 0 else "w"
with open('embeddings.csv', mode, newline='') as f:
    writer = csv.writer(f)

    processed = 0

    # Process each batch using the reader
    for batch in reader:
        count = 0
        
        if processed >= iterations:
            print(f"Reached limit of {iterations} chunks; stopping.")
            break

        seqs = batch["seq"].astype(str).tolist()
        embeddings = []

        for seq in seqs:
            count+=1
            #print(seq[:150])
            print(count)
            emb = get_mean_embedding(seq[:150])
            embeddings.append(emb)

        writer.writerows(embeddings)
        print(f"Finished chunk #{startChunk + processed + 1}")

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        processed += 1


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79


In [None]:
pd.read_csv("embeddings.csv", header = None)

In [None]:
!awk '/^>/ { if (seq) print seq; seq=""; next } { seq = seq $0 } END { if (seq) print seq }' drive/MyDrive/RESEARCH/RAW_PROTEINS/Prot_05.faa > input.txt
!head -n 50000 input.txt > test.txt