In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Checkpoint identifier shared by the tokenizer and the question-answering model below.
model_name = "monologg/koelectra-base-v3-finetuned-korquad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Candidate context passages; one of them is later chosen as the context for the question.
documents = [
    "우리 회사는 인공지능 연구를 하고 있습니다.",
    "우리 제품은 2025년에 출시되었습니다."
]
# Question to be answered from the selected document.
question = "우리 회사는 무슨 연구를 하나요?"


In [28]:
def select_best_context(documents, question):
    """Return the document that shares the most words with the question.

    The question is split on whitespace and every token is looked up in each
    document with a plain substring check (``token in doc``). A document's
    score is the number of question tokens found in it.

    Args:
        documents: Non-empty sequence of candidate context strings.
        question: Question string; split on whitespace into tokens.

    Returns:
        The highest-scoring document. When several documents share the top
        score, the one that appears first in ``documents`` is returned.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    if not documents:
        raise ValueError("documents must contain at least one candidate context")

    # Tokenize once instead of once per document.
    question_words = question.split()

    def overlap_score(doc):
        # Booleans sum as 0/1, giving the number of matching tokens.
        return sum(word in doc for word in question_words)

    # max() with a key returns the first maximal element, so ties are resolved
    # by document order rather than by comparing the document strings.
    return max(documents, key=overlap_score)



In [29]:
# Sample document and question used to walk through the scoring steps by hand.
doc = "우리 회사는 인공지능 연구를 하고 있습니다."
question = "우리 회사는 무슨 연구를 하나요?"
# Whitespace tokenization of the question; the last token keeps its trailing "?".
question.split()


['우리', '회사는', '무슨', '연구를', '하나요?']

In [30]:
# Print each whitespace-separated question token on its own line.
for word in question.split():
    print(word)

우리
회사는
무슨
연구를
하나요?


In [31]:
# One flag per question token: True when the token occurs in `doc` as a substring.
doc_bool = [token in doc for token in question.split()]
print(doc_bool)

# The score is the number of tokens that were found.
doc_score = doc_bool.count(True)
print(doc_score)

[True, True, False, True, False]
3


In [32]:
# Variant document: "회사에서는" does not contain the question token "회사는",
# so only two of the question tokens are found here.
doc2 = "우리 회사에서는 인공지능 연구를 하고 있지 않습니다."
question = "우리 회사는 무슨 연구를 하나요?"
doc_bool2 = [token in doc2 for token in question.split()]
print(doc_bool2)

# The score is the number of tokens that were found.
doc_score2 = doc_bool2.count(True)
print(doc_score2)


[True, False, False, True, False]
2


In [33]:
# Choose the context by keyword overlap with the question.
best_context = select_best_context(documents, question)
print("Best context:", best_context)

# Encode the (question, context) pair as PyTorch tensors. Calling the tokenizer
# directly replaces the deprecated `encode_plus` and yields the same fields
# (input_ids, token_type_ids, attention_mask).
inputs = tokenizer(question, best_context, return_tensors="pt")
print("Inputs:", inputs)

Best context: 우리 회사는 인공지능 연구를 하고 있습니다.
Inputs: {'input_ids': tensor([[    2,  6233,  6387,  4034,  7008,  6303,  4110,  6272,  4150,    35,
             3,  6233,  6387,  4034, 11881,  6303,  4110, 14227,  3249,  4576,
          6216,    18,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [35]:
# Token ids of the encoded (question, context) pair.
inputs.input_ids

tensor([[    2,  6233,  6387,  4034,  7008,  6303,  4110,  6272,  4150,    35,
             3,  6233,  6387,  4034, 11881,  6303,  4110, 14227,  3249,  4576,
          6216,    18,     3]])

In [36]:
# Switch the model to evaluation mode and run the forward pass without tracking gradients.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
# The printed output carries start_logits and end_logits for the encoded tokens.
print(outputs)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ -4.9601, -11.9210, -11.9915, -12.1289, -11.9584, -12.1299, -12.5216,
         -12.0291, -12.1545, -12.2190,  -4.9601,  -7.5452,  -7.9683,  -7.9850,
           9.0662,  -5.0904,  -7.8677,  -8.6923,  -9.8993,  -9.8247, -10.9598,
          -9.4014,  -4.9601]]), end_logits=tensor([[ -7.9729, -12.8283, -12.7886, -12.8603, -12.9101, -12.5868, -12.5420,
         -12.9244, -12.8554, -12.6807,  -7.9729,  -9.5976,  -9.7077, -10.7093,
           2.0292,   7.7995,  -3.1036,  -7.4832,  -9.2761, -10.3589,  -8.8760,
          -5.3346,  -7.9729]]), hidden_states=None, attentions=None)


In [38]:
# Logits of the single encoded (question, context) pair in the batch.
start_logits = outputs.start_logits[0]
end_logits = outputs.end_logits[0]

start_index = int(torch.argmax(start_logits))
# Search for the end only at or after the start, so an end that would fall
# before the start can no longer produce an inverted (empty) span. The +1 makes
# the slice below include the end token.
end_index = start_index + int(torch.argmax(end_logits[start_index:])) + 1
answer = tokenizer.decode(inputs.input_ids[0][start_index:end_index], skip_special_tokens=True)

print(f"선택된 문서: {best_context}")

print(f"질문: {question}")
print(f"답변: {answer}")

선택된 문서: 우리 회사는 인공지능 연구를 하고 있습니다.
질문: 우리 회사는 무슨 연구를 하나요?
답변: 인공지능 연구


In [39]:
from sentence_transformers import SentenceTransformer
# NOTE(review): this rebinds `model`, which until now referred to the
# question-answering model; re-running the QA cells after this one would call
# the sentence encoder instead — consider a distinct name.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Encode one sentence and convert the embedding to a plain Python list for printing.
text = "안녕하세요, 반갑습니다."
embedding = model.encode(text).tolist()
print(embedding)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[0.1739785373210907, 0.1782427430152893, 0.45043250918388367, 0.21449102461338043, -0.14879804849624634, -0.13885906338691711, 0.44790297746658325, 0.12859216332435608, -0.32391777634620667, 0.21514305472373962, 0.10711116343736649, -0.5577337145805359, -0.15766416490077972, 0.04360466077923775, 0.29028233885765076, 0.32177627086639404, 0.3335725665092468, -0.10994072258472443, -0.440006285905838, -0.12614165246486664, -0.34778866171836853, 0.0859408751130104, -0.13949787616729736, 0.11013740301132202, 0.06987199187278748, -0.2633180022239685, 0.3010371923446655, 0.5414543151855469, -0.08924804627895355, -0.3403833508491516, -0.29455673694610596, -0.18356309831142426, 0.35005971789360046, -0.03385234251618385, -0.08254000544548035, 0.46301525831222534, -0.11786183714866638, -0.09847896546125412, -0.13621345162391663, 0.09960996359586716, 0.17333611845970154, -0.25445640087127686, 0.20710447430610657, 0.23632389307022095, 0.018303243443369865, -0.30280712246894836, -0.07270225882530212,

In [52]:
# Read sentences from a file and generate embeddings for them.
import pandas   as pd
from sentence_transformers import SentenceTransformer

# NOTE(review): absolute, machine-specific path; the notebook will not run on another machine as-is.
df = pd.read_csv('C:\\Users\\Leo\\code\\data\\test_file.csv', encoding='utf8')
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the CSV
# was saved together with its index — passing index_col=0 would likely remove it; confirm.
df

Unnamed: 0,sentence
0,안녕하세요.
1,누구세요?
2,안녕히 가세요.


In [50]:
# Pull the 'sentence' column out of the frame as a plain Python list.
sc_list= df['sentence'].tolist()
sc_list

['안녕하세요.', '누구세요?', '안녕히 가세요.']

In [54]:
# NOTE(review): re-instantiates the same SentenceTransformer checkpoint that an earlier cell already created.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# Encode each sentence separately; `res` collects one embedding (as a Python list) per sentence.
res = []
for sc in sc_list:
    embedding = model.encode(sc).tolist()
    res.append(embedding)
# NOTE(review): prints every full vector, so the cell output is very long.
print(res)

[[0.12071584910154343, 0.16374550759792328, 0.27759042382240295, 0.28737086057662964, -0.16390344500541687, -0.19601304829120636, 0.5038758516311646, 0.15490840375423431, -0.3444209098815918, 0.1318168193101883, 0.06661806255578995, -0.5285354256629944, -0.162029430270195, 0.09770146012306213, 0.1298447549343109, 0.1576467752456665, 0.28771552443504333, -0.2034401297569275, -0.22404614090919495, -0.09415482729673386, -0.32635778188705444, 0.06041639298200607, -0.11048044264316559, 0.07415991276502609, 0.044156335294246674, -0.2010980248451233, 0.39069122076034546, 0.41583117842674255, -0.09814126044511795, -0.47868067026138306, -0.30981162190437317, -0.1146661639213562, 0.3893645107746124, -0.04963717237114906, -0.1770198941230774, 0.36784180998802185, 0.04083467274904251, -0.16361196339130402, -0.06218527629971504, 0.0021354742348194122, 0.25746771693229675, -0.15965187549591064, 0.09251745045185089, 0.2879413366317749, -0.05430970340967178, -0.12396062910556793, -0.07110217213630676,