In [7]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Base name of the training CSV (no directory, no extension); it is reused
# later in the notebook to name the output file.
csv_name = "dacon_train"
# Read the training table from the shared train_data directory.
df = pd.read_csv(f'../../train_data/{csv_name}.csv')

# Load the sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [8]:
# Compute embeddings only for the distinct 'Target Name' values, so each
# name is passed to the model once instead of once per row.
target_name_column = df['Target Name']
unique_target_names = target_name_column.unique()
embeddings = model.encode(unique_target_names)

In [9]:
# Lookup table from each Target Name to its embedding; zip pairs the two
# sequences by position.
embedding_dict = dict(zip(unique_target_names, embeddings))

In [10]:
# Attach the embedding of each row's Target Name as a new column, keeping the
# vector objects as they are (no string formatting).
# A previously tried alternative, now disabled, stored each vector as a
# ', '-joined string of its components instead.


def lookup_embedding(target_name):
    """Return the stored embedding for ``target_name``; raises KeyError if absent."""
    return embedding_dict[target_name]


df['Target Embedding'] = df['Target Name'].map(lookup_embedding)


# Inspect the first rows of the result.
preview = df.head()
print(preview)

                                              Smiles  \
0      Oc1ccc(-c2ccc3ncnc(Nc4cccc5[nH]ncc45)c3c2)cc1   
1  Cc1ccc(C(=O)Nc2cc(C(C)(C)C)n[nH]2)cc1Nc1ncnc2c...   
2  N#Cc1c(-c2ccccc2)cc(-c2ccccc2)nc1/N=c1\sc(-c2c...   
3  C#CCN1CCN(c2ccc(-c3ccc4[nH]c5nccc(-c6ccc(Cc7cc...   
4  O=C(N1CCN(c2ncnc3[nH]ccc23)CC1)C1(c2ccc(Br)cc2...   

                             Target Name  Standard Value  \
0  Serine/threonine-protein kinase B-raf           547.3   
1  Serine/threonine-protein kinase B-raf             5.0   
2   Serine/threonine-protein kinase Chk1             9.9   
3           ALK tyrosine kinase receptor          3600.0   
4    Serine/threonine-protein kinase AKT            27.1   

                                    Target Embedding  
0  [-0.07307798, -0.02817397, -0.15112427, -0.015...  
1  [-0.07307798, -0.02817397, -0.15112427, -0.015...  
2  [-0.10474009, 0.03651712, -0.07544042, 0.02934...  
3  [-0.06357608, 0.022508983, -0.1155538, 0.02284...  
4  [-0.1120778, -0.01233815

In [11]:
# Save the frame as '<csv_name>_embeddings.csv' in the current working
# directory, without the index column.
# NOTE(review): the 'Target Embedding' cells hold vector objects, so to_csv
# writes their string form — confirm that whatever reads this file back can
# parse that representation.
output_path = f'{csv_name}_embeddings.csv'
df.to_csv(output_path, index=False)