In [13]:
import pandas as pd

df = pd.read_csv('updated_keyword_Emb.csv')

In [31]:
# 나머지 절반
# remaining_half_df = df.iloc[len(df) // 2:]
half_df = df.iloc[:len(df) // 2]

In [14]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import ast  # 문자열을 리스트로 변환할 때 사용

# 1. koBERT 모델과 토크나이저 로드
model_name = 'monologg/kobert'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# koBERT는 GPU 사용을 권장하지만, CPU로도 실행 가능
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 2. 단어 임베딩 생성 함수
def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors='pt', padding=True, truncation=True, max_length=32)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # 임베딩 추출
    return embedding

# 3. 각 단어별 임베딩 생성 (병렬 처리)
def generate_word_embeddings_parallel(keywords):
    embeddings = {}
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(get_word_embedding, keywords))  # 병렬로 임베딩 생성
    for word, embedding in zip(keywords, results):
        embeddings[word] = embedding
    return embeddings

# 4. 배치별로 임베딩 추가하는 함수
# def add_embeddings_to_df_batch(df, start_idx, end_idx):
#     all_embeddings = []
    
#     # 지정된 범위에 대해 처리
#     for _, row in tqdm.tqdm(df.iloc[start_idx:end_idx].iterrows(), total=end_idx-start_idx):
#         # 문자열 형태의 'top_keywords'를 리스트로 변환
#         keywords = ast.literal_eval(row['top_keywords']) if isinstance(row['top_keywords'], str) else row['top_keywords']
#         word_embeddings = generate_word_embeddings_parallel(keywords)  # 병렬로 임베딩 생성
#         all_embeddings.append(word_embeddings)
    
#     # 'word_embeddings' 새 칼럼에 추가
#     df.loc[start_idx:end_idx-1, 'word_embeddings'] = all_embeddings
#     return df

# 4. 배치별로 임베딩 추가하는 함수
def add_embeddings_to_df_batch(df, start_idx, end_idx):
    all_embeddings = []
    
    # 지정된 범위에 대해 처리
    for _, row in tqdm.tqdm(df.iloc[start_idx:end_idx].iterrows(), total=end_idx-start_idx):
        # 문자열 형태의 'top_keywords'를 리스트로 변환
        keywords = ast.literal_eval(row['top_keywords']) if isinstance(row['top_keywords'], str) else row['top_keywords']
        word_embeddings = generate_word_embeddings_parallel(keywords)  # 병렬로 임베딩 생성
        all_embeddings.append(word_embeddings)
    
    # 슬라이스 길이와 all_embeddings 길이 확인
    slice_length = len(df.iloc[start_idx:end_idx])
    embedding_length = len(all_embeddings)

    if slice_length == embedding_length:
        df.iloc[start_idx:end_idx, df.columns.get_loc('word_embeddings')] = all_embeddings
    else:
        raise ValueError(f"Mismatch: Slice length={slice_length}, Embedding length={embedding_length}")

    return df


# 5. 데이터프레임을 배치로 처리하는 함수
def process_in_batches(df, batch_size=300):
    total_rows = len(df)
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        df = add_embeddings_to_df_batch(df, start_idx, end_idx)
    return df

# # 6. 데이터프레임에 단어별 임베딩 추가 (배치 처리)
# # 예시 데이터프레임
# data = {
#     'id': [1, 2],
#     'top_keywords': ["['apple', 'banana', 'cherry']", "['dog', 'cat', 'bird']"]  # 문자열로 저장된 리스트
# }
# df = pd.DataFrame(data)

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [None]:
df['word_embeddings'] = None

# 'top_keywords'가 문자열인 경우 리스트로 변환 후 처리
local_embedding = process_in_batches(df)

# 결과 출력
print(local_embedding)

100%|██████████| 300/300 [02:41<00:00,  1.86it/s]
100%|██████████| 300/300 [02:02<00:00,  2.44it/s]
 51%|█████     | 153/300 [01:46<01:22,  1.78it/s]

In [37]:
df1 = pd.read_csv('df_with_embeddings2.csv', encoding=False, index_col=0)
df1

Unnamed: 0,_id,user_id,time,url,text,text_result,processed_text,top_keywords,keyword_Emb,word_embeddings
2710,67430fbc0c898a183d4fbf26,ti_a61,2023-08-21T23:08:07.000Z,https://www.threads.net/@ti_a61/post/CwOXna9BmoX,뭐하는 애죠시험 평균이 10~30이라길래그럼 자퇴하면 검정고시 합격 못한다했더니그럼...,뭐 하는 애죠? 시험 평균이 10~30이라길래 그럼 자퇴하면 검정고시 합격 못 한다...,뭐 애 시험 평균 자퇴 검정 고시 합격 학교 공부 선택지 어디,"['검정', '고시', '선택지', '평균', '합격', '자퇴', '시험', '학...","[-0.5419741, -0.7466092, -0.6101014, 1.4957911...","{'검정': array([[ 2.08431482e-01, -8.82843733e-0..."
2711,67430fbc0c898a183d4fbf30,ti_a61,2023-08-21T04:39:17.000Z,https://www.threads.net/@ti_a61/post/CwMYuDZJYV_,누가 뭐래도 사람 대하는게 제일 어려운듯..모든 사람이 날 좋아할 순 없다지만모든 ...,누가 뭐래도 사람 대하는 게 제일 어려운 듯.. 모든 사람이 날 좋아할 순 없다지만...,누구 뭐 사람 사람 나 사람 나,"['사람', '누구']","[-0.3865591, -0.817194, -0.9155909, 1.7148402,...","{'사람': array([[-3.41696925e-02, -9.46605206e-0..."
2712,67430fbc0c898a183d4fbf31,ti_a61,2023-08-20T17:45:40.000Z,https://www.threads.net/@ti_a61/post/CwLN658puC2,진짜 눈물만 난다내가 뭘그리 잘못했는지 모르겠어내가 뭘 그리 잘못해서 이렇게 살고있...,진짜 눈물만 난다. 내가 뭘 그리 잘못했는지 모르겠어. 내가 뭘 그리 잘못해서 이렇...,눈물 나 뭐 나 뭐 나 뭐 취급 나 행복 사람 사람 사람 모두 나 오늘 여기,"['사람', '취급', '모두', '여기', '행복', '눈물', '오늘']","[-0.61295563, -0.784042, -0.9003625, 1.5614357...","{'사람': array([[-3.41696925e-02, -9.46605206e-0..."
2713,67430fbc0c898a183d4fbf32,ti_a61,2023-08-20T16:17:40.000Z,https://www.threads.net/@ti_a61/post/CwLD2WHp4ey,자퇴해버릴까 고민중,문장: 자퇴해 버릴까 고민 중,문장 자퇴 고민,"['자퇴', '고민']","[-0.42230695, -0.87979144, -0.44737652, 1.6923...","{'자퇴': array([[ 2.08431482e-01, -8.82843733e-0..."
2714,67430fbc0c898a183d4fbf33,ti_a61,2023-08-20T05:37:25.000Z,https://www.threads.net/@ti_a61/post/CwJ6k9BJjij,현준님이랑 체리들은 항상 날 웃게하는 것 같아방근까지만해도 죽을 듯 우울했는데 보고...,현준 님이랑 체리들은 항상 날 웃게 하는 것 같아 방금까지만 해도 죽을 듯 우울했는...,준 체리 나 방금 나,"['방금', '체리']","[-0.52754587, -0.47142637, -1.0802295, 1.37854...","{'방금': array([[ 2.08431482e-01, -8.82843733e-0..."
...,...,...,...,...,...,...,...,...,...,...
5416,67430fbc0c898a183d4fec1b,ibfci,2024-02-13T06:00:07.000Z,https://www.threads.net/@ibfci/post/C3Rt4LuRZ7D,걱정하지 말아달라는 거다.다들 그래봤자 어차피 나를 떠날 거니까.나는 사람을 잘 안...,걱정하지 말라는 거다. 다들 그래 봤자 어차피 나를 떠날 거니까. 나는 사람을 잘 ...,걱정 나 나 사람 나 나 속 얘기 사람 우발,"['우발', '사람', '걱정', '얘기']","[-0.65913206, -1.2135304, -0.5713355, 1.570326...","{'우발': array([[ 2.08431482e-01, -8.82843733e-0..."
5417,67430fbc0c898a183d4fec28,ibfci,2024-02-13T01:27:38.000Z,https://www.threads.net/@ibfci/post/C3ROsXJPp9w,요즘 들어 꿈에서 자꾸 제가 스스로 목숨을 끊는데.. 이거 어어.. 안 좋은 거죠?,요즘 들어 꿈에서 자꾸 제가 스스로 목숨을 끊는데.. 이거 안 좋은 거죠?,요즘 꿈 저 목숨 이거,"['목숨', '요즘', '이거']","[-0.25130317, -0.21303415, -0.51493216, 1.6068...","{'목숨': array([[ 2.08431482e-01, -8.82843733e-0..."
5418,67430fbc0c898a183d4fec29,ibfci,2024-02-13T01:28:41.000Z,https://www.threads.net/@ibfci/post/C3RO0GKP_fZ,"몇 주 안에 두 번 꿨어요. 한 번은 바다에 투신, 한 번은 옥상에서..","몇 주 안에 두 번 꿨어요. 한 번은 바다에 투신, 한 번은 옥상에서....",안 바다 투신 옥상,"['옥상', '바다', '투신']","[-0.3210084, -0.7765417, -0.35808966, 1.484147...","{'옥상': array([[ 2.08431482e-01, -8.82843733e-0..."
5419,67430fbc0c898a183d4fec2a,ibfci,2024-02-13T01:24:16.000Z,https://www.threads.net/@ibfci/post/C3ROTz1PCGj,아.. 울다..콘텐츠를 이용할 수 없음,아.. 울다.. 콘텐츠를 이용할 수 없음,,[],"[-0.31545943, -0.48268098, -0.75341344, 1.7447...",{}


In [38]:
df_with_embeddings2.to_csv('df_with_embeddings.csv', encoding='utf-8', index=False)

In [6]:
df_with_embeddings2 = pd.read_csv('df_with_embeddings.csv', encoding='utf-8')
df_with_embeddings2['word_embeddings'].head(5)

0    {'일러스트': array([[ 2.08431482e-01, -8.82843733e...
1    {'이해': array([[ 2.08431482e-01, -8.82843733e-0...
2    {'오늘': array([[ 3.96649651e-02, -3.41148376e-0...
3    {'죄인': array([[ 2.08431482e-01, -8.82843733e-0...
4    {'해방감': array([[ 2.08431482e-01, -8.82843733e-...
Name: word_embeddings, dtype: object

In [43]:
df1['word_embeddings'].head(5)

2710    {'검정': array([[ 2.08431482e-01, -8.82843733e-0...
2711    {'사람': array([[-3.41696925e-02, -9.46605206e-0...
2712    {'사람': array([[-3.41696925e-02, -9.46605206e-0...
2713    {'자퇴': array([[ 2.08431482e-01, -8.82843733e-0...
2714    {'방금': array([[ 2.08431482e-01, -8.82843733e-0...
Name: word_embeddings, dtype: object

In [52]:
df_with_embeddings2['word_embeddings']

0       {'일러스트': [[0.20843148, -0.08828437, 0.13065429...
1       {'이해': [[0.20843148, -0.08828437, 0.13065429, ...
2       {'오늘': [[0.039664965, -0.34114838, 0.43761957,...
3       {'죄인': [[0.20843148, -0.08828437, 0.13065429, ...
4       {'해방감': [[0.20843148, -0.08828437, 0.13065429,...
                              ...                        
2705    {'상처': [[0.20843148, -0.08828437, 0.13065429, ...
2706    {'진심': [[0.20843148, -0.08828437, 0.13065429, ...
2707                                                   {}
2708    {'스트레스': [[0.20843148, -0.08828437, 0.13065429...
2709    {'개학': [[0.20843148, -0.08828437, 0.13065429, ...
Name: word_embeddings, Length: 2710, dtype: object

In [53]:
# word_embeddings를 NumPy 배열로 변환
def ensure_array_format(embedding):
    if isinstance(embedding, dict):  # 딕셔너리인 경우
        return {key: np.array(value) for key, value in embedding.items()}
    return embedding

# 두 데이터프레임의 word_embeddings 칼럼 통일
# df1['word_embeddings'] = df1['word_embeddings'].apply(ensure_array_format)
df_with_embeddings2['word_embeddings'] = df_with_embeddings2['word_embeddings'].apply(ensure_array_format)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_embeddings2['word_embeddings'] = df_with_embeddings2['word_embeddings'].apply(ensure_array_format)


In [None]:
import pandas as pd

df = pd.read_csv('ujung.csv', encoding='utf-8-sig', index_col=0)

In [3]:
df['word_embeddings'].head(5)

0    {'일러스트': array([[ 2.08431482e-01, -8.82843733e...
1    {'이해': array([[ 2.08431482e-01, -8.82843733e-0...
2    {'오늘': array([[ 3.96649651e-02, -3.41148376e-0...
3    {'죄인': array([[ 2.08431482e-01, -8.82843733e-0...
4    {'해방감': array([[ 2.08431482e-01, -8.82843733e-...
Name: word_embeddings, dtype: object

In [4]:
import re

def clean_array_format(x):
    if isinstance(x, str):
        # `array(...)`를 제거하고 내부 값만 남김
        x = re.sub(r'array\((.*?)\)', r'\1', x)
    return x

df['word_embeddings'] = df['word_embeddings'].apply(clean_array_format)

In [6]:
print(df['word_embeddings'].apply(type).value_counts())

word_embeddings
<class 'str'>    5421
Name: count, dtype: int64


In [9]:
import numpy as np
import json

# numpy 배열을 리스트로 변환
def serialize_embeddings(embeddings):
    if isinstance(embeddings, dict):
        return {k: v.tolist() if isinstance(v, np.ndarray) else v for k, v in embeddings.items()}
    return embeddings

df['word_embeddings'] = df['word_embeddings'].apply(serialize_embeddings)
df['word_embeddings'] = df['word_embeddings'].apply(json.dumps)  # JSON 직렬화
df.to_csv('word_embeddings.csv', index=False)

In [5]:
df['word_embeddings'].head(5)

0    {'일러스트': array([[ 2.08431482e-01, -8.82843733e...
1    {'이해': array([[ 2.08431482e-01, -8.82843733e-0...
2    {'오늘': array([[ 3.96649651e-02, -3.41148376e-0...
3    {'죄인': array([[ 2.08431482e-01, -8.82843733e-0...
4    {'해방감': array([[ 2.08431482e-01, -8.82843733e-...
Name: word_embeddings, dtype: object

In [10]:
import ast
import json

# 문자열을 올바른 JSON으로 변환
def convert_to_json(s):
    try:
        # 문자열이 JSON 형식이 아닌 경우 Python 객체로 평가
        python_dict = ast.literal_eval(s)
        # Python 객체를 JSON 문자열로 변환
        return json.dumps(python_dict)
    except (ValueError, SyntaxError):
        # 변환 실패 시 원래 값을 반환
        return s

df['word_embeddings'] = df['word_embeddings'].apply(convert_to_json)

In [11]:
# Check the data type of word_embeddings
print(df['word_embeddings'].apply(type).value_counts())

word_embeddings
<class 'str'>    5421
Name: count, dtype: int64


In [12]:
df['word_embeddings'].head(5)

0    "{'\uc77c\ub7ec\uc2a4\ud2b8': array([[ 2.08431...
1    "{'\uc774\ud574': array([[ 2.08431482e-01, -8....
2    "{'\uc624\ub298': array([[ 3.96649651e-02, -3....
3    "{'\uc8c4\uc778': array([[ 2.08431482e-01, -8....
4    "{'\ud574\ubc29\uac10': array([[ 2.08431482e-0...
Name: word_embeddings, dtype: object

In [7]:
# 유사도 계산 후 결과 표시 함수
def calculate_similarity(df):
    results = []

    # 유저별로 데이터를 순차적으로 처리
    for user_id in tqdm.tqdm(df['user_id'].unique()):
        user_data = df[df['user_id'] == user_id].sort_values(by='time')

        # i번째와 i+1번째 문장 간 유사도 비교
        for i in range(len(user_data) - 1):
            word_embeddings_i = user_data.iloc[i]['word_embeddings']
            word_embeddings_next = user_data.iloc[i + 1]['word_embeddings']

            # 유사도 찾기
            similarity_found = False
            # i번째 문장의 모든 단어에 대해 비교
            for word_i, embedding_i in word_embeddings_i.items():
                # i+1번째 문장의 모든 단어에 대해 비교
                for word_next, embedding_next in word_embeddings_next.items():
                    similarity = cosine_similarity_fn(embedding_i, embedding_next)
                    if similarity > 0.9:  # 유사도 기준(0.9 이상이면 1로 설정)
                        similarity_found = True
                        break
                if similarity_found:
                    break

            results.append({
                'user_id': user_id,
                'time_1': user_data.iloc[i]['time'],
                'time_2': user_data.iloc[i + 1]['time'],
                'similarity': 1 if similarity_found else 0
            })

    return pd.DataFrame(results)

In [9]:
import ast

# Convert string representations to dictionaries
df['word_embeddings'] = df['word_embeddings'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

ValueError: malformed node or string: <ast.Call object at 0x7f8b74cda250>

In [None]:
# Identify problematic rows
for idx, value in enumerate(df['word_embeddings']):
    try:
        ast.literal_eval(value)
    except Exception as e:
        print(f"Row {idx} caused an error: {e}")

Row 0 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ee02880>
Row 1 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ef88640>
Row 2 caused an error: malformed node or string: <ast.Call object at 0x7f8b6f070160>
Row 3 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ef87820>
Row 4 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ecc77c0>
Row 5 caused an error: malformed node or string: <ast.Call object at 0x7f8b6eb8c280>
Row 6 caused an error: malformed node or string: <ast.Call object at 0x7f8b6f0f9d60>
Row 7 caused an error: malformed node or string: <ast.Call object at 0x7f8b6f0f7220>
Row 9 caused an error: malformed node or string: <ast.Call object at 0x7f8b6efe2820>
Row 10 caused an error: malformed node or string: <ast.Call object at 0x7f8b6f056b50>
Row 11 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ed2b880>
Row 12 caused an error: malformed node or string: <ast.Call obj

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8bb412b190>>
Traceback (most recent call last):
  File "/home/kyuseok00/.pyenv/versions/dartb/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 788, in _clean_thread_parent_frames
    if phase != "start":
KeyboardInterrupt: 


Row 36 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ee50bb0>
Row 37 caused an error: malformed node or string: <ast.Call object at 0x7f8b6eff3280>
Row 38 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ed2a310>
Row 39 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ed8f3d0>
Row 40 caused an error: malformed node or string: <ast.Call object at 0x7f8b6eda5970>
Row 41 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ebee6d0>
Row 42 caused an error: malformed node or string: <ast.Call object at 0x7f8b6f1725e0>
Row 43 caused an error: malformed node or string: <ast.Call object at 0x7f8b6efec4c0>
Row 44 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ec8b400>
Row 45 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ed07550>
Row 46 caused an error: malformed node or string: <ast.Call object at 0x7f8b6ed1e910>
Row 47 caused an error: malformed node or string: <ast

In [8]:
result = calculate_similarity(df)

  0%|          | 0/93 [00:00<?, ?it/s]


AttributeError: 'str' object has no attribute 'items'

In [3]:
import tqdm
import numpy as np
from numpy.linalg import norm  # 추가: 코사인 유사도 계산에 필요한 라이브러리
import pandas as pd

# 코사인 유사도 계산 함수 정의
def cosine_similarity_fn(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

# 유사도 계산 함수
def calculate_similarity(df):
    results = []

    for user_id in tqdm.tqdm(df['user_id'].unique()):
        user_data = df[df['user_id'] == user_id].sort_values(by='time')

        for i in range(len(user_data) - 1):
            word_embeddings_i = user_data.iloc[i]['word_embeddings']
            word_embeddings_next = user_data.iloc[i + 1]['word_embeddings']

            # Ensure both word_embeddings are dictionaries
            if not isinstance(word_embeddings_i, dict) or not isinstance(word_embeddings_next, dict):
                # print(f"Skipping invalid word_embeddings: {word_embeddings_i}, {word_embeddings_next}")
                print("skip")
                continue

            similarity_found = False
            for word_i, embedding_i in word_embeddings_i.items():
                for word_next, embedding_next in word_embeddings_next.items():
                    similarity = cosine_similarity_fn(
                        np.array(embedding_i).flatten(),
                        np.array(embedding_next).flatten()
                    )
                    if similarity > 0.9:
                        similarity_found = True
                        break
                if similarity_found:
                    break

            results.append({
                'user_id': user_id,
                'time_1': user_data.iloc[i]['time'],
                'time_2': user_data.iloc[i + 1]['time'],
                'similarity': 1 if similarity_found else 0
            })

    return pd.DataFrame(results)

# Perform similarity calculation
similarity_results = calculate_similarity(df)

# Output the results
# print(similarity_results)

  0%|          | 0/93 [00:00<?, ?it/s]

  1%|          | 1/93 [00:00<00:11,  7.72it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


  6%|▋         | 6/93 [00:00<00:04, 18.43it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


  9%|▊         | 8/93 [00:00<00:06, 13.88it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


 16%|█▌        | 15/93 [00:00<00:03, 23.41it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


 19%|█▉        | 18/93 [00:01<00:03, 19.83it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


 26%|██▌       | 24/93 [00:01<00:02, 27.89it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


 30%|███       | 28/93 [00:01<00:03, 20.31it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


 51%|█████     | 47/93 [00:01<00:01, 39.18it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


 58%|█████▊    | 54/93 [00:01<00:00, 46.21it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


 67%|██████▋   | 62/93 [00:02<00:00, 42.25it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


 72%|███████▏  | 67/93 [00:02<00:00, 35.77it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip


100%|██████████| 93/93 [00:02<00:00, 35.61it/s]

skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip
skip





In [28]:
df_with_embeddings.to_csv('df_with_embeddings2.csv', encoding='utf-8-sig')

In [29]:
df_with_embeddings

Unnamed: 0,_id,user_id,time,url,text,text_result,processed_text,top_keywords,keyword_Emb,word_embeddings
2710,67430fbc0c898a183d4fbf26,ti_a61,2023-08-21T23:08:07.000Z,https://www.threads.net/@ti_a61/post/CwOXna9BmoX,뭐하는 애죠시험 평균이 10~30이라길래그럼 자퇴하면 검정고시 합격 못한다했더니그럼...,뭐 하는 애죠? 시험 평균이 10~30이라길래 그럼 자퇴하면 검정고시 합격 못 한다...,뭐 애 시험 평균 자퇴 검정 고시 합격 학교 공부 선택지 어디,"['검정', '고시', '선택지', '평균', '합격', '자퇴', '시험', '학...","[-0.5419741, -0.7466092, -0.6101014, 1.4957911...","{'검정': [[0.20843148, -0.08828437, 0.13065429, ..."
2711,67430fbc0c898a183d4fbf30,ti_a61,2023-08-21T04:39:17.000Z,https://www.threads.net/@ti_a61/post/CwMYuDZJYV_,누가 뭐래도 사람 대하는게 제일 어려운듯..모든 사람이 날 좋아할 순 없다지만모든 ...,누가 뭐래도 사람 대하는 게 제일 어려운 듯.. 모든 사람이 날 좋아할 순 없다지만...,누구 뭐 사람 사람 나 사람 나,"['사람', '누구']","[-0.3865591, -0.817194, -0.9155909, 1.7148402,...","{'사람': [[-0.034169693, -0.09466052, -0.0040782..."
2712,67430fbc0c898a183d4fbf31,ti_a61,2023-08-20T17:45:40.000Z,https://www.threads.net/@ti_a61/post/CwLN658puC2,진짜 눈물만 난다내가 뭘그리 잘못했는지 모르겠어내가 뭘 그리 잘못해서 이렇게 살고있...,진짜 눈물만 난다. 내가 뭘 그리 잘못했는지 모르겠어. 내가 뭘 그리 잘못해서 이렇...,눈물 나 뭐 나 뭐 나 뭐 취급 나 행복 사람 사람 사람 모두 나 오늘 여기,"['사람', '취급', '모두', '여기', '행복', '눈물', '오늘']","[-0.61295563, -0.784042, -0.9003625, 1.5614357...","{'사람': [[-0.034169693, -0.09466052, -0.0040782..."
2713,67430fbc0c898a183d4fbf32,ti_a61,2023-08-20T16:17:40.000Z,https://www.threads.net/@ti_a61/post/CwLD2WHp4ey,자퇴해버릴까 고민중,문장: 자퇴해 버릴까 고민 중,문장 자퇴 고민,"['자퇴', '고민']","[-0.42230695, -0.87979144, -0.44737652, 1.6923...","{'자퇴': [[0.20843148, -0.08828437, 0.13065429, ..."
2714,67430fbc0c898a183d4fbf33,ti_a61,2023-08-20T05:37:25.000Z,https://www.threads.net/@ti_a61/post/CwJ6k9BJjij,현준님이랑 체리들은 항상 날 웃게하는 것 같아방근까지만해도 죽을 듯 우울했는데 보고...,현준 님이랑 체리들은 항상 날 웃게 하는 것 같아 방금까지만 해도 죽을 듯 우울했는...,준 체리 나 방금 나,"['방금', '체리']","[-0.52754587, -0.47142637, -1.0802295, 1.37854...","{'방금': [[0.20843148, -0.08828437, 0.13065429, ..."
...,...,...,...,...,...,...,...,...,...,...
5416,67430fbc0c898a183d4fec1b,ibfci,2024-02-13T06:00:07.000Z,https://www.threads.net/@ibfci/post/C3Rt4LuRZ7D,걱정하지 말아달라는 거다.다들 그래봤자 어차피 나를 떠날 거니까.나는 사람을 잘 안...,걱정하지 말라는 거다. 다들 그래 봤자 어차피 나를 떠날 거니까. 나는 사람을 잘 ...,걱정 나 나 사람 나 나 속 얘기 사람 우발,"['우발', '사람', '걱정', '얘기']","[-0.65913206, -1.2135304, -0.5713355, 1.570326...","{'우발': [[0.20843148, -0.08828437, 0.13065429, ..."
5417,67430fbc0c898a183d4fec28,ibfci,2024-02-13T01:27:38.000Z,https://www.threads.net/@ibfci/post/C3ROsXJPp9w,요즘 들어 꿈에서 자꾸 제가 스스로 목숨을 끊는데.. 이거 어어.. 안 좋은 거죠?,요즘 들어 꿈에서 자꾸 제가 스스로 목숨을 끊는데.. 이거 안 좋은 거죠?,요즘 꿈 저 목숨 이거,"['목숨', '요즘', '이거']","[-0.25130317, -0.21303415, -0.51493216, 1.6068...","{'목숨': [[0.20843148, -0.08828437, 0.13065429, ..."
5418,67430fbc0c898a183d4fec29,ibfci,2024-02-13T01:28:41.000Z,https://www.threads.net/@ibfci/post/C3RO0GKP_fZ,"몇 주 안에 두 번 꿨어요. 한 번은 바다에 투신, 한 번은 옥상에서..","몇 주 안에 두 번 꿨어요. 한 번은 바다에 투신, 한 번은 옥상에서....",안 바다 투신 옥상,"['옥상', '바다', '투신']","[-0.3210084, -0.7765417, -0.35808966, 1.484147...","{'옥상': [[0.20843148, -0.08828437, 0.13065429, ..."
5419,67430fbc0c898a183d4fec2a,ibfci,2024-02-13T01:24:16.000Z,https://www.threads.net/@ibfci/post/C3ROTz1PCGj,아.. 울다..콘텐츠를 이용할 수 없음,아.. 울다.. 콘텐츠를 이용할 수 없음,,[],"[-0.31545943, -0.48268098, -0.75341344, 1.7447...",{}


In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import ast  # 문자열을 리스트로 변환할 때 사용

# 1. koBERT 모델과 토크나이저 로드
model_name = 'monologg/kobert'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# koBERT는 GPU 사용을 권장하지만, CPU로도 실행 가능
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 2. 단어 임베딩 생성 함수
def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors='pt', padding=True, truncation=True, max_length=32)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # 임베딩 추출
    return embedding

# 3. 각 단어별 임베딩 생성 (병렬 처리)
def generate_word_embeddings_parallel(keywords):
    embeddings = {}
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(get_word_embedding, keywords))  # 병렬로 임베딩 생성
    for word, embedding in zip(keywords, results):
        embeddings[word] = embedding
    return embeddings

# 4. 배치별로 임베딩 추가하는 함수
def add_embeddings_to_df_batch(df, start_idx, end_idx):
    all_embeddings = []
    
    # 지정된 범위에 대해 처리
    for _, row in tqdm.tqdm(df.iloc[start_idx:end_idx].iterrows(), total=end_idx-start_idx):
        # 문자열 형태의 'top_keywords'를 리스트로 변환
        keywords = ast.literal_eval(row['top_keywords']) if isinstance(row['top_keywords'], str) else row['top_keywords']
        word_embeddings = generate_word_embeddings_parallel(keywords)  # 병렬로 임베딩 생성
        all_embeddings.append(word_embeddings)
    
    # 'word_embeddings' 새 칼럼에 추가
    df.loc[start_idx:end_idx-1, 'word_embeddings'] = all_embeddings
    return df

# 5. 데이터프레임을 배치로 처리하는 함수
def process_in_batches(df, batch_size=200):
    total_rows = len(df)
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        df = add_embeddings_to_df_batch(df, start_idx, end_idx)
    return df

# # 6. 데이터프레임에 단어별 임베딩 추가 (배치 처리)
# # 예시 데이터프레임
# data = {
#     'id': [1, 2],
#     'top_keywords': ["['apple', 'banana', 'cherry']", "['dog', 'cat', 'bird']"]  # 문자열로 저장된 리스트
# }
# df = pd.DataFrame(data)

# 'top_keywords'가 문자열인 경우 리스트로 변환 후 처리
df_with_embeddings = process_in_batches(half_df)

# 결과 출력
print(df_with_embeddings)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [19]:
# Process the DataFrame in batches
df_with_embeddings = process_in_batches(df)

# Display results
print(df_with_embeddings)

100%|██████████| 200/200 [02:36<00:00,  1.28it/s]


Processed rows 0 to 200 successfully.


100%|██████████| 200/200 [01:55<00:00,  1.73it/s]


Processed rows 200 to 400 successfully.


  1%|          | 2/200 [00:02<03:24,  1.03s/it]


KeyboardInterrupt: 

In [14]:
# 각 키워드와 임베딩 벡터를 개별 행으로 변환
parsed_data = []
for _, row in df_with_embeddings2.iterrows():
    embeddings = row['word_embeddings']  # 딕셔너리 형태
    if embeddings:
        for keyword, embedding_list in embeddings.items():
            for embedding in embedding_list:  # 다차원 임베딩 벡터를 분리
                parsed_data.append({'keyword': keyword, 'embedding': embedding})

# 새로운 데이터프레임 생성
parsed_df = pd.DataFrame(parsed_data)

In [15]:
parsed_df

Unnamed: 0,keyword,embedding
0,검정,"[0.20843148, -0.08828437, 0.13065429, -3.01225..."
1,고시,"[0.20843148, -0.08828437, 0.13065429, -3.01225..."
2,선택지,"[0.20843148, -0.08828437, 0.13065429, -3.01225..."
3,평균,"[0.037131798, -0.11239781, 0.022574052, -4.098..."
4,합격,"[0.20843148, -0.08828437, 0.13065429, -3.01225..."
...,...,...
14515,카페인,"[0.20843148, -0.08828437, 0.13065429, -3.01225..."
14516,기본,"[0.20843148, -0.08828437, 0.13065429, -3.01225..."
14517,일주일,"[0.20843148, -0.08828437, 0.13065429, -3.01225..."
14518,의사,"[0.20843148, -0.08828437, 0.13065429, -3.01225..."


In [18]:
half_keyword_Emb = pd.read_csv('half_keyword_Emb.csv', encoding='utf-8-sig')

In [20]:
keyword_Emb = pd.concat([half_keyword_Emb, df_with_embeddings2])
keyword_Emb

Unnamed: 0,_id,user_id,time,url,text,text_result,processed_text,top_keywords,keyword_Emb,word_embeddings
0,67430fbc0c898a183d4f9641,gh0422,2024-05-26T07:25:18.000Z,https://www.threads.net/@gh0422/post/C7bFgLsyDqq,"읽씹,안읽씹내가 눈치가 없는걸까? 바로 전 카톡까지만 해도 나를 걱정하는 말.혹은 ...","읽씹, 안읽씹 내가 눈치가 없는 걸까? 바로 전 카톡까지만 해도 나를 걱정하는 말....",읽씹 안읽씹 나 눈치 카톡 나 걱정 말 질문 나 그 답 답 경우 봉사 사람 태도 때...,"['일러스트', '안읽씹', '읽씹', '재단', '이분', '배척', '네이버',...","[-0.30443642, -0.35974327, -0.53078324, 1.6614...","{'일러스트': array([[ 2.08431482e-01, -8.82843733e..."
1,67430fbc0c898a183d4f9643,gh0422,2024-05-25T16:55:54.000Z,https://www.threads.net/@gh0422/post/C7ZiAb0y7QS,남 탓으로 돌리는게 아니야.난 이해받지 못하는 인간이지만간절히 이해받고 싶던 인간이...,남 탓으로 돌리는 게 아니야. 난 이해받지 못하는 인간이지만 간절히 이해받고 싶던 ...,남 탓 나 이해 인간 이해 인간 설명 설명 사랑 우정 이해 포기 조금 위로 타협 그...,"['이해', '인간', '특별', '설명', '우정', '일러스트', '타협', '...","[-0.8265727, -0.68740433, -0.791313, 1.4141463...","{'이해': array([[ 2.08431482e-01, -8.82843733e-0..."
2,67430fbc0c898a183d4f9649,gh0422,2024-05-24T20:14:00.000Z,https://www.threads.net/@gh0422/post/C7XT4h1ScCA,오늘은 나쁜꿈 슬픈꿈 안꾸게 해주세요🙏한번 크게 힘든 꿈 꾸고나면 안그래도 잘 못자...,오늘은 나쁜 꿈 슬픈 꿈 안 꾸게 해주세요 🙏 한 번 크게 힘든 꿈 꾸고 나면 안 ...,오늘 꿈 꿈 꿈 꿈,['오늘'],"[-0.48158136, -0.64766693, -0.70270747, 1.8364...","{'오늘': array([[ 3.96649651e-02, -3.41148376e-0..."
3,67430fbc0c898a183d4f964c,gh0422,2024-05-24T13:31:24.000Z,https://www.threads.net/@gh0422/post/C7Wlzzcykh0,과연 나는 누군가가 감싸줄만한 사람일까?누구나 절대 선으로만 인생을 살아오지는 않았...,과연 나는 누군가가 감싸줄 만한 사람일까? 누구나 절대 선으로만 인생을 살아오지는 ...,나 누구 사람 누구 절대 선 인생 나 나 과거 나 인생 벌 생각 때 상담 쌤 주치의...,"['죄인', '절대', '공감', '칼날', '인생', '처절', '누구', '광희...","[-0.15184923, -0.46468312, -0.81459737, 1.4558...","{'죄인': array([[ 2.08431482e-01, -8.82843733e-0..."
4,67430fbc0c898a183d4f964d,gh0422,2024-05-24T12:20:17.000Z,https://www.threads.net/@gh0422/post/C7Wdq7SyBrv,온 몸으로 우울함이 쏟아지는 오늘.학원에서도 말 하기가 싫어서 이어폰 끼고계속 내 ...,온몸으로 우울함이 쏟아지는 오늘. 학원에서도 말하기가 싫어서 이어폰 끼고 계속 내 ...,온몸 우울 오늘 학원 말 이어폰 나 일 집 길 나 집 나 집 생각 어디 나 진정 곳...,"['해방감', '토로', '제정신', '이어폰', '거부', '감당', '온몸', ...","[-0.34009513, -0.5111167, -0.72624904, 1.49870...","{'해방감': array([[ 2.08431482e-01, -8.82843733e-..."
...,...,...,...,...,...,...,...,...,...,...
5416,67430fbc0c898a183d4fec1b,ibfci,2024-02-13T06:00:07.000Z,https://www.threads.net/@ibfci/post/C3Rt4LuRZ7D,걱정하지 말아달라는 거다.다들 그래봤자 어차피 나를 떠날 거니까.나는 사람을 잘 안...,걱정하지 말라는 거다. 다들 그래 봤자 어차피 나를 떠날 거니까. 나는 사람을 잘 ...,걱정 나 나 사람 나 나 속 얘기 사람 우발,"['우발', '사람', '걱정', '얘기']","[-0.65913206, -1.2135304, -0.5713355, 1.570326...","{'우발': [[0.20843148, -0.08828437, 0.13065429, ..."
5417,67430fbc0c898a183d4fec28,ibfci,2024-02-13T01:27:38.000Z,https://www.threads.net/@ibfci/post/C3ROsXJPp9w,요즘 들어 꿈에서 자꾸 제가 스스로 목숨을 끊는데.. 이거 어어.. 안 좋은 거죠?,요즘 들어 꿈에서 자꾸 제가 스스로 목숨을 끊는데.. 이거 안 좋은 거죠?,요즘 꿈 저 목숨 이거,"['목숨', '요즘', '이거']","[-0.25130317, -0.21303415, -0.51493216, 1.6068...","{'목숨': [[0.20843148, -0.08828437, 0.13065429, ..."
5418,67430fbc0c898a183d4fec29,ibfci,2024-02-13T01:28:41.000Z,https://www.threads.net/@ibfci/post/C3RO0GKP_fZ,"몇 주 안에 두 번 꿨어요. 한 번은 바다에 투신, 한 번은 옥상에서..","몇 주 안에 두 번 꿨어요. 한 번은 바다에 투신, 한 번은 옥상에서....",안 바다 투신 옥상,"['옥상', '바다', '투신']","[-0.3210084, -0.7765417, -0.35808966, 1.484147...","{'옥상': [[0.20843148, -0.08828437, 0.13065429, ..."
5419,67430fbc0c898a183d4fec2a,ibfci,2024-02-13T01:24:16.000Z,https://www.threads.net/@ibfci/post/C3ROTz1PCGj,아.. 울다..콘텐츠를 이용할 수 없음,아.. 울다.. 콘텐츠를 이용할 수 없음,,[],"[-0.31545943, -0.48268098, -0.75341344, 1.7447...",{}


In [22]:
df = keyword_Emb

In [23]:
import ast
import tqdm  # 안전하게 문자열을 딕셔너리로 변환하기 위해 사용

def calculate_similarity(df):
    results = []

    # 유저별로 데이터를 순차적으로 처리
    for user_id in tqdm.tqdm(df['user_id'].unique()):
        user_data = df[df['user_id'] == user_id].sort_values(by='time')

        # i번째와 i+1번째 문장 간 유사도 비교
        for i in range(len(user_data) - 1):
            # word_embeddings 컬럼 값이 문자열일 경우 변환
            word_embeddings_i = user_data.iloc[i]['word_embeddings']
            word_embeddings_next = user_data.iloc[i + 1]['word_embeddings']

            if isinstance(word_embeddings_i, str):
                word_embeddings_i = ast.literal_eval(word_embeddings_i)
            if isinstance(word_embeddings_next, str):
                word_embeddings_next = ast.literal_eval(word_embeddings_next)

            # 유사도 찾기
            similarity_found = False
            for word_i, embedding_i in word_embeddings_i.items():
                for word_next, embedding_next in word_embeddings_next.items():
                    similarity = cosine_similarity_fn(np.array(embedding_i), np.array(embedding_next))  # np.array로 변환
                    if similarity > 0.9:  # 유사도 기준(0.9 이상이면 1로 설정)
                        similarity_found = True
                        break
                if similarity_found:
                    break

            results.append({
                'user_id': user_id,
                'time_1': user_data.iloc[i]['time'],
                'time_2': user_data.iloc[i + 1]['time'],
                'similarity': 1 if similarity_found else 0
            })

    return pd.DataFrame(results)

# 유사도 계산 수행
similarity_results = calculate_similarity(df)

# 결과 출력
print(similarity_results)

  0%|          | 0/93 [00:00<?, ?it/s]

  0%|          | 0/93 [00:00<?, ?it/s]


ValueError: malformed node or string: <ast.Call object at 0x7ff61b3a0310>

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5421 entries, 0 to 5420
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   _id              5421 non-null   object
 1   user_id          5421 non-null   object
 2   time             5421 non-null   object
 3   url              5421 non-null   object
 4   text             5421 non-null   object
 5   text_result      5421 non-null   object
 6   processed_text   5302 non-null   object
 7   top_keywords     5421 non-null   object
 8   keyword_Emb      5421 non-null   object
 9   word_embeddings  5421 non-null   object
dtypes: object(10)
memory usage: 423.6+ KB


In [29]:
import ast
import tqdm  # 안전하게 문자열을 딕셔너리로 변환하기 위해 사용

def calculate_similarity(df):
    results = []

    # 유저별로 데이터를 순차적으로 처리
    for user_id in tqdm.tqdm(df['user_id'].unique()):
        user_data = df[df['user_id'] == user_id].sort_values(by='time')

        # i번째와 i+1번째 문장 간 유사도 비교
        for i in range(len(user_data) - 1):
            # word_embeddings 컬럼 값이 문자열일 경우 변환
            word_embeddings_i = user_data.iloc[i]['word_embeddings']
            word_embeddings_next = user_data.iloc[i + 1]['word_embeddings']

            if isinstance(word_embeddings_i, str):
                word_embeddings_i = ast.literal_eval(word_embeddings_i)
            if isinstance(word_embeddings_next, str):
                word_embeddings_next = ast.literal_eval(word_embeddings_next)

            # 유사도 찾기
            similarity_found = False
            for word_i, embedding_i in word_embeddings_i.items():
                for word_next, embedding_next in word_embeddings_next.items():
                    similarity = cosine_similarity_fn(np.array(embedding_i), np.array(embedding_next))  # np.array로 변환
                    if similarity > 0.9:  # 유사도 기준(0.9 이상이면 1로 설정)
                        similarity_found = True
                        break
                if similarity_found:
                    break

            results.append({
                'user_id': user_id,
                'time_1': user_data.iloc[i]['time'],
                'time_2': user_data.iloc[i + 1]['time'],
                'similarity': 1 if similarity_found else 0
            })

    return pd.DataFrame(results)

# 유사도 계산 수행
similarity_results = calculate_similarity(df)

# 결과 출력
print(similarity_results)

  0%|          | 0/93 [00:00<?, ?it/s]


ValueError: malformed node or string: <ast.Call object at 0x7ff61a3a30a0>

In [30]:
# 데이터 타입과 샘플 확인
print(df_with_embeddings['word_embeddings'].apply(type).value_counts())
print(df_with_embeddings['word_embeddings'].head())

word_embeddings
<class 'dict'>    2711
Name: count, dtype: int64
2710    {'검정': [[0.20843148, -0.08828437, 0.13065429, ...
2711    {'사람': [[-0.034169693, -0.09466052, -0.0040782...
2712    {'사람': [[-0.034169693, -0.09466052, -0.0040782...
2713    {'자퇴': [[0.20843148, -0.08828437, 0.13065429, ...
2714    {'방금': [[0.20843148, -0.08828437, 0.13065429, ...
Name: word_embeddings, dtype: object


In [None]:
keyword_Emb = pd.concat([half_keyword_Emb, df_with_embeddings2])