In [43]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [44]:
# Read the training set from the repository-relative CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../../train_data/train.csv')

In [45]:
# Show the 'Target Name' column (the notebook truncates long Series on display).
df.loc[:, 'Target Name']

0       Interleukin-1 receptor-associated kinase 4
1       Interleukin-1 receptor-associated kinase 4
2       Interleukin-1 receptor-associated kinase 4
3       Interleukin-1 receptor-associated kinase 4
4       Interleukin-1 receptor-associated kinase 4
                           ...                    
1947    Interleukin-1 receptor-associated kinase 4
1948    Interleukin-1 receptor-associated kinase 4
1949    Interleukin-1 receptor-associated kinase 4
1950    Interleukin-1 receptor-associated kinase 4
1951    Interleukin-1 receptor-associated kinase 4
Name: Target Name, Length: 1952, dtype: object

In [46]:
# Tally how many rows carry each distinct 'Target Name' value.
target_name_counts = df.loc[:, 'Target Name'].value_counts()

# Print the tally.
print(target_name_counts)

Target Name
Interleukin-1 receptor-associated kinase 4    1952
Name: count, dtype: int64


In [47]:
# Collect only the distinct values of the 'Target Name' column,
# in order of first appearance.
unique_values = pd.unique(df['Target Name'])

# Print the result.
print(unique_values)

['Interleukin-1 receptor-associated kinase 4']


In [48]:
# Same distinct target names, converted to a plain Python list so they can
# be handed to the sentence encoder.
unique_values_list = pd.unique(df['Target Name']).tolist()
print(unique_values_list)

['Interleukin-1 receptor-associated kinase 4']


In [49]:
# Load the pretrained MiniLM sentence encoder by its hub identifier.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2',
)

# Encode every distinct target name; one embedding row per name.
embeddings = model.encode(sentences=unique_values_list)

# Report (number of names, embedding width).
print(embeddings.shape)

(1, 384)


In [50]:
# Build a lookup table with one row per target name and its embedding.
#
# `embeddings.tolist()` turns every embedding row into a plain Python list,
# so each CSV cell is written as "[v0, v1, ...]" (comma-separated, on a
# single line). Storing the NumPy rows directly (as `list(embeddings)` did)
# makes `to_csv` write numpy's str() form instead: space-separated and
# wrapped across several lines, which is awkward to parse back.
df_embeddings = pd.DataFrame({
    'Target Name': unique_values_list,
    'Vector': embeddings.tolist(),
})

# Preview the table.
print(df_embeddings.head())

# Persist to CSV; a 'Vector' cell can be restored with e.g. json.loads
# (assuming all components are finite numbers).
df_embeddings.to_csv('target_name_embeddings.csv', index=False)

                                  Target Name  \
0  Interleukin-1 receptor-associated kinase 4   

                                              Vector  
0  [-0.037057307, 0.032045923, 0.022981424, 0.001...  


In [51]:
# Serialise each embedding as a single "v0, v1, v2, ..." string:
# comma-separated components with no surrounding brackets.
formatted_embeddings = []
for vector in embeddings:
    formatted_embeddings.append(', '.join(str(component) for component in vector))

# Assemble the table: target name alongside its flattened vector string.
df_embeddings = pd.DataFrame(
    data={
        'Target Name': unique_values_list,
        'Vector': formatted_embeddings,  # one string per embedding
    }
)

# Preview the table.
print(df_embeddings.head())

# Write the table to CSV without the index column.
df_embeddings.to_csv(path_or_buf='target_name_formatted_embeddings.csv', index=False)

                                  Target Name  \
0  Interleukin-1 receptor-associated kinase 4   

                                              Vector  
0  -0.037057307, 0.032045923, 0.022981424, 0.0012...  


In [52]:
# Read the exported file back in to eyeball what was written.
check_df = pd.read_csv(filepath_or_buffer='target_name_formatted_embeddings.csv')

# Print the first five rows.
print(check_df.head(5))

                                  Target Name  \
0  Interleukin-1 receptor-associated kinase 4   

                                              Vector  
0  -0.037057307, 0.032045923, 0.022981424, 0.0012...  
