In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Base name (without extension) of the training CSV to read.
csv_name = "protein_train_data"
train_csv_path = f"../../train_data/{csv_name}.csv"
df = pd.read_csv(train_csv_path)

# Load the sentence-embedding model.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")



In [7]:
# Compute embeddings only for the distinct values of the 'Target Name' column.
target_names = df["Target Name"]
unique_target_names = target_names.unique()
embeddings = model.encode(unique_target_names)

In [8]:
# Build a lookup table pairing the i-th unique target name with the i-th embedding.
embedding_dict = dict(zip(unique_target_names, embeddings))

In [9]:
# Alternative (disabled): store each embedding as a comma-separated string instead.
# df['Target Embedding'] = df['Target Name'].map(lambda x: ', '.join(map(str, embedding_dict[x])))


def lookup_embedding(target_name):
    """Return the embedding stored in ``embedding_dict`` for one target name.

    Raises ``KeyError`` if the name has no entry in the dictionary.
    """
    return embedding_dict[target_name]


# Attach the raw (unformatted) embedding object to every row via its target name.
# NOTE(review): each cell holds whatever object embedding_dict stores (presumably
# a NumPy array returned by model.encode -- confirm); a later CSV export would
# write its string form, so verify downstream readers can parse that.
df["Target Embedding"] = df["Target Name"].map(lookup_embedding)

# Inspect the first rows of the result.
print(df.head())

                                              Smiles  \
0  CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...   
1  CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...   
2  CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...   
3  CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...   
4  COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...   

                                  Target Name  Standard Value  \
0  Interleukin-1 receptor-associated kinase 4           0.022   
1  Interleukin-1 receptor-associated kinase 4           0.026   
2  Interleukin-1 receptor-associated kinase 4           0.078   
3  Interleukin-1 receptor-associated kinase 4           0.081   
4  Interleukin-1 receptor-associated kinase 4           0.099   

                                    Target Embedding  
0  [-0.03705732, 0.032045893, 0.022981437, 0.0012...  
1  [-0.03705732, 0.032045893, 0.022981437, 0.0012...  
2  [-0.03705732, 0.032045893, 0.022981437, 0.0012...  
3  [-0.03705732, 0.032045893, 0.022981437, 0.0012...

In [10]:
# Write the frame (without the index column) to '<csv_name>_embeddings.csv'
# in the current working directory.
output_csv_path = f"{csv_name}_embeddings.csv"
df.to_csv(output_csv_path, index=False)