##%% md
## !pip install sentence_transformers

In [None]:
# Load the sentence-embedding model; every model.encode(...) call in the later cells uses it.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("Huffon/sentence-klue-roberta-base")

In [None]:
# Display the loaded model as the cell output.
model

In [None]:
import pandas as pd
# Load the graded answers. Later cells read the columns 'problem_id', 'problem',
# 'user_answer', 'keyword_criterion' and 'correct_keyword_criterion' from this frame.
df = pd.read_csv('user_answer.csv')

In [None]:
# Display the DataFrame as the cell output.
df

In [None]:
# Display the first row of df as the cell output.
df.iloc[0]

In [None]:
import numpy as np

# Normalise the answer text and drop rows whose answer is empty afterwards.
print(f"pre data size : {len(df)}")
# Literal (non-regex) replacements: remove newlines and non-breaking spaces,
# then replace each double space with a single one (one pass only).
df['user_answer'] = (
    df['user_answer']
    .str.strip()
    .str.replace("\n", "", regex=False)
    .str.replace("\xa0", "", regex=False)
    .str.replace("  ", " ", regex=False)
)
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that form is chained in-place mutation, which pandas
# deprecates and which has no effect on `df` under copy-on-write.
df['user_answer'] = df['user_answer'].replace('', np.nan)
df.dropna(axis=0, subset=['user_answer'], inplace=True)  # remove empty answers
print(f"after data size : {len(df)}")

In [None]:
# Print every answer whose character count equals the shortest or the longest length.
answer_lengths = [len(text) for text in df['user_answer']]
min_len, max_len = min(answer_lengths), max(answer_lengths)

for text, n_chars in zip(df['user_answer'], answer_lengths):
    if n_chars == min_len:
        print(f"가장 짧은 답변 {min_len}ch:\n\t {text}")
        continue
    if n_chars == max_len:
        print(f"가장 긴 답변 {max_len}ch:\n\t {text}")

### pip install konlpy

In [None]:
from konlpy.tag import Okt
# Tagger instance; later cells call okt.pos() and keep tokens by their POS tag.
okt = Okt()

In [None]:
from dataclasses import dataclass
from typing import List

@dataclass
class Problem:
    """One question with its grading keywords and the answers collected for it.

    The first four fields describe the problem itself. The last three are
    parallel lists that receive one entry per user answer.
    """

    subject: str  # problem statement text
    keywords: List[str]  # grading keywords, in criterion order
    keywords_score: List[float]  # points per keyword (parsed with float())
    keywords_embedding: np.ndarray  # result of model.encode(keywords)
    user_answers: List[str]  # raw answer text, one per user
    user_correct_keywords: List[List[str]]  # keywords credited to each answer
    ground_truths: List[List[int]]  # 0/1 flag per keyword, aligned with `keywords`


In [None]:
# Display the first row of df as the cell output.
df.iloc[0]

In [None]:
import ast

dataset = {}
# Parse the grading criteria once per problem, from the first row seen for each problem_id.
for _, data in df.iterrows():
    problem_id = data['problem_id']
    if problem_id in dataset:
        continue

    keywords = []
    keywords_score = []
    # The column is expected to hold a Python list literal of "<keyword> - <score>점"
    # strings. ast.literal_eval parses literals only, so text from the CSV is never
    # executed as code (this previously used eval()).
    for criterion in ast.literal_eval(data['keyword_criterion']):
        keyword, score = map(str.strip, criterion.split('-'))
        keywords.append(keyword)
        keywords_score.append(float(score.split("점")[0]))
    keywords_embedding = model.encode(keywords)

    dataset[problem_id] = Problem(
        subject=data['problem'],
        keywords=keywords,
        keywords_score=keywords_score,
        keywords_embedding=keywords_embedding,
        user_answers=[],
        user_correct_keywords=[],
        ground_truths=[],
    )

In [None]:
import random

# Spot-check the parsed criteria of one randomly chosen problem.
problem_id = random.choice(list(dataset))
picked = dataset[problem_id]
print(f"problem : {picked.subject}")
print(f"keywords : {picked.keywords}")
print(f"keywords_score : {picked.keywords_score}")
print(f"keyword embedding shape :{picked.keywords_embedding.shape}")

In [None]:
import ast

# Start from empty per-answer lists so that re-running this cell does not append
# every answer a second time.
for problem in dataset.values():
    problem.user_answers.clear()
    problem.user_correct_keywords.clear()
    problem.ground_truths.clear()

# Attach each answer, the keywords it was credited with, and a 0/1 label per keyword.
for _, data in df.iterrows():
    problem = dataset[data['problem_id']]
    user_answer = data['user_answer']
    # literal_eval instead of eval: parse the list literal without executing CSV text.
    # strip() rather than rstrip() so these keywords are normalised the same way as
    # the keywords parsed from 'keyword_criterion'; otherwise a leading space would
    # make the membership test below miss a credited keyword.
    user_correct_keyword = [
        criterion.split('-')[0].strip()
        for criterion in ast.literal_eval(data['correct_keyword_criterion'])
    ]
    ground_truth = [1 if keyword in user_correct_keyword else 0 for keyword in problem.keywords]
    problem.user_correct_keywords.append(user_correct_keyword)
    problem.user_answers.append(user_answer)
    problem.ground_truths.append(ground_truth)

In [None]:
# Show the first labelled answer of one problem. `problem` is not assigned in this
# cell: it is whichever Problem an earlier cell bound last, so the output depends on
# notebook execution order.
print(f"문제       : {problem.subject}")
print(f"유저 답변   : {problem.user_answers[0]}")
print(f"정답 키워드 : {problem.user_correct_keywords[0]}")
print(f"후보 키워드 : {problem.keywords}")
print(f"정답 라벨  : {problem.ground_truths[0]}")

In [None]:
# Sanity check: rebuild each answer's credited keywords from its 0/1 labels and compare
# with the parsed list. Any printed line means the labels were parsed incorrectly.
for problem in dataset.values():
    for labels, correct_keyword in zip(problem.ground_truths, problem.user_correct_keywords):
        ground_truth = [keyword for keyword, flag in zip(problem.keywords, labels) if flag]
        if ground_truth != correct_keyword:
            print(ground_truth, correct_keyword)

In [None]:
# Per problem: print its statement and how many answers and keyword-label lists it holds.
for entry in dataset.values():
    n_answers = len(entry.user_answers)
    n_labelled = len(entry.user_correct_keywords)
    print(f"{entry.subject}")
    print(f"{n_answers}개의 유저 답변")
    print(f"{n_labelled}개의 유저의 답변별 키워드 라벨링")

In [None]:
# Preview the noun tokens extracted from one random answer.
problem_id = random.choice(list(dataset))
user_answer = random.choice(dataset[problem_id].user_answers)
tokenized_answer = [token for token, tag in okt.pos(user_answer) if tag == 'Noun']
print(tokenized_answer[:10])

In [None]:
# Similarity evaluation: nouns extracted from each answer vs. the keyword embeddings.
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

threshold = 0.7
total_acc, total_f1 = 0, 0
for i, key in enumerate(dataset):
    acc, f1 = 0, 0
    problem = dataset[key]
    answer_pairs = zip(problem.user_answers, problem.ground_truths)
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for user_answer, ground_truth in tqdm(answer_pairs, total=len(problem.user_answers)):
        tokenized_answer = [token for token, tag in okt.pos(user_answer) if tag == 'Noun']  # nouns only
        if tokenized_answer:
            tokenized_answer_embedding = model.encode(tokenized_answer)
            similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
            # A keyword is predicted when its best-matching token exceeds `threshold`
            # (the comparison previously hard-coded 0.7 and ignored the variable).
            predicts = [1 if row.max() > threshold else 0 for row in similarity_scores]
        else:
            # No noun tokens: there is nothing to compare, so predict no keyword
            # instead of passing an empty matrix to cosine_similarity.
            predicts = [0] * len(problem.keywords)
        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=0)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

In [None]:
# Preview the noun and alphabetic (English) tokens extracted from one random answer.
problem_id = random.choice(list(dataset))
user_answer = random.choice(dataset[problem_id].user_answers)
tokenized_answer = [token for token, tag in okt.pos(user_answer) if tag in ('Noun', 'Alpha')]
print(tokenized_answer[:10])

In [None]:
# Similarity evaluation, now keeping alphabetic (English) tokens as well as nouns.
threshold = 0.7
total_acc, total_f1 = 0, 0
for i, key in enumerate(dataset):
    acc, f1 = 0, 0
    problem = dataset[key]
    answer_pairs = zip(problem.user_answers, problem.ground_truths)
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for user_answer, ground_truth in tqdm(answer_pairs, total=len(problem.user_answers)):
        # Keep nouns and alphabetic tokens.
        tokenized_answer = [token for token, tag in okt.pos(user_answer) if tag in ('Noun', 'Alpha')]
        if tokenized_answer:
            tokenized_answer_embedding = model.encode(tokenized_answer)
            similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
            # A keyword is predicted when its best-matching token exceeds `threshold`
            # (the comparison previously hard-coded 0.7 and ignored the variable).
            predicts = [1 if row.max() > threshold else 0 for row in similarity_scores]
        else:
            # No usable tokens: predict no keyword instead of passing an empty
            # matrix to cosine_similarity.
            predicts = [0] * len(problem.keywords)
        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=0)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

---
### 2번이랑 3번이랑 아니 그냥 전체적으로 엉망진창이네!!
### 성능 향상을 위한 방법들은 아래와 같다.
- Tokenizing을 하지 않고 n-gram 방식으로 대조
- 키워드를 조금 더 적합한 단어로 변경
- 여러개의 키워드 후보군을 비교
- 불용어 제거
---

In [None]:
# List every problem with its position, record ID, statement and keywords.
for i, (problem_id, problem) in enumerate(dataset.items()):
    print(f"{i}번째 문제 ID : {problem_id}")
    print(f"{i}번째 문제 : {problem.subject}")
    print(f"keyword : {problem.keywords}")

In [None]:
# Replace selected keywords with Korean-only variants and re-encode the affected problems.
# Each entry: (record ID, {keyword index: replacement}).
keyword_overrides = [
    ('recXfKthnQLwWETgb', {0: "만료"}),  # problem 1: Lifecycle -> "만료"
    ('rec1QvAvB4CMami3p', {2: '스택'}),  # problem 7: Stack -> "스택"
    ('recUcGjT9Xkb7N5pu', {2: "삭제", 3: "삽입"}),  # problem 10: drop the (POP)/(PUSH) suffixes
]
for record_id, replacements in keyword_overrides:
    target = dataset[record_id]
    print(target.keywords)
    for position, new_keyword in replacements.items():
        target.keywords[position] = new_keyword
    print(target.keywords)
    # The embeddings are derived from the keywords, so they must be refreshed.
    target.keywords_embedding = model.encode(target.keywords)

In [None]:
# Preview the 2-word sliding windows built from one random answer.
problem_id = random.choice(list(dataset.keys()))
split_answer = random.choice(dataset[problem_id].user_answers).split(' ')
word_concat_size = 2
# "+ 1" so the final window (the one ending on the last word) is included;
# without it the last words of every answer were never compared.
tokenized_answer = [
    ' '.join(split_answer[k : k + word_concat_size])
    for k in range(len(split_answer) - word_concat_size + 1)
]
if not tokenized_answer:
    # Answer shorter than one window: use the whole answer as a single chunk.
    tokenized_answer.append(' '.join(split_answer))
print(tokenized_answer[:10])

In [None]:
# Similarity evaluation: instead of the POS tokenizer, slide a 2-word window over the
# space-separated answer and compare each window with the (modified) keywords.
threshold = 0.7
total_acc, total_f1 = 0, 0
word_concat_size = 2

for i, key in enumerate(dataset):
    acc, f1 = 0, 0
    problem = dataset[key]
    answer_pairs = zip(problem.user_answers, problem.ground_truths)
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for user_answer, ground_truth in tqdm(answer_pairs, total=len(problem.user_answers)):
        split_answer = user_answer.split(' ')
        # "+ 1" so the window ending on the last word is included.
        tokenized_answer = [
            ' '.join(split_answer[k : k + word_concat_size])
            for k in range(len(split_answer) - word_concat_size + 1)
        ]
        if not tokenized_answer:
            # Answer shorter than one window: score the whole answer as a single chunk
            # rather than passing an empty list on to the encoder / cosine_similarity.
            tokenized_answer.append(' '.join(split_answer))
        tokenized_answer_embedding = model.encode(tokenized_answer)
        similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
        # A keyword is predicted when its best-matching window exceeds the threshold.
        predicts = [1 if row.max() > threshold else 0 for row in similarity_scores]
        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=0)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

In [None]:
# Similarity evaluation with 2-word sliding windows and modified keywords;
# threshold lowered to 0.5.
threshold = 0.5
total_acc, total_f1 = 0, 0
word_concat_size = 2

for i, key in enumerate(dataset):
    acc, f1 = 0, 0
    problem = dataset[key]
    answer_pairs = zip(problem.user_answers, problem.ground_truths)
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for user_answer, ground_truth in tqdm(answer_pairs, total=len(problem.user_answers)):
        split_answer = user_answer.split(' ')
        # "+ 1" so the window ending on the last word is included.
        tokenized_answer = [
            ' '.join(split_answer[k : k + word_concat_size])
            for k in range(len(split_answer) - word_concat_size + 1)
        ]
        if not tokenized_answer:
            # Answer shorter than one window: score the whole answer as a single chunk
            # rather than passing an empty list on to the encoder / cosine_similarity.
            tokenized_answer.append(' '.join(split_answer))
        tokenized_answer_embedding = model.encode(tokenized_answer)
        similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
        # A keyword is predicted when its best-matching window exceeds the threshold.
        predicts = [1 if row.max() > threshold else 0 for row in similarity_scores]
        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=0)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

In [None]:
# Preview the 3-word sliding windows built from one random answer.
problem_id = random.choice(list(dataset.keys()))
split_answer = random.choice(dataset[problem_id].user_answers).split(' ')
word_concat_size = 3
# "+ 1" so the final window (the one ending on the last word) is included,
# matching the windowing used by the evaluation cells.
tokenized_answer = [
    ' '.join(split_answer[k : k + word_concat_size])
    for k in range(len(split_answer) - word_concat_size + 1)
]
if not tokenized_answer:
    # Answer shorter than one window: use the whole answer as a single chunk.
    tokenized_answer.append(' '.join(split_answer))
print(tokenized_answer[:10])

In [None]:
# Similarity evaluation with modified keywords: window widened to 3 words and
# threshold lowered to 0.35.
threshold = 0.35
total_acc, total_f1 = 0, 0
word_concat_size = 3

for i, key in enumerate(dataset):
    problem = dataset[key]
    acc, f1 = 0, 0
    for user_answer, ground_truth in tqdm(zip(problem.user_answers, problem.ground_truths)):
        split_answer = user_answer.split(' ')
        if len(split_answer) < word_concat_size:
            # Answer shorter than one window: score the whole answer as a single chunk.
            tokenized_answer = [' '.join(split_answer)]
        else:
            tokenized_answer = [
                ' '.join(split_answer[start : start + word_concat_size])
                for start in range(len(split_answer) - word_concat_size + 1)
            ]
        tokenized_answer_embedding = model.encode(tokenized_answer)
        similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
        # A keyword is predicted when its best-matching window exceeds the threshold.
        predicts = [int(row.max() > threshold) for row in similarity_scores]

        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=1)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

---
### 왜 5번문제만 f1 score가 저 모양일까?
---

In [None]:
problem = dataset['recio3s0A77i0kkEn']

print(problem.subject)
print(f"정답 키워드 : {problem.keywords}")
# Count how many answers were credited with each keyword, then express each count
# as a whole-number percentage of all answers to this problem.
keyword_count = {}
for _answer, credited in zip(problem.user_answers, problem.user_correct_keywords):
    for keyword in credited:
        keyword_count[keyword] = keyword_count.get(keyword, 0) + 1
n_answers = len(problem.user_answers)
keyword_count = {
    keyword: f"{int(hits / n_answers * 100)}%"
    for keyword, hits in keyword_count.items()
}
print(keyword_count)

---
#### 자세히 보면 키워드가 너무 어려웠는지 정답률이 엉망이다.
#### 실제 정답에 1이 없다면 모델이 키워드를 하나라도 예측하는 순간 f1 score는 0이 된다. (정답과 예측이 모두 0뿐이면 `zero_division`으로 지정한 값이 반환된다.)
#### 이는 키워드가 적절치 않았는지를 고려해봐야 할 듯 하다.
---

---
### 이제 실제로 모델이 어떻게 예측하고 있는지 눈으로 확인해보자!!
---

In [None]:
# Inspect predictions: for the first `max_shown` answers of each problem, print which
# 3-word window triggered each detected keyword, next to the labelled keywords.
threshold = 0.35
total_acc, total_f1 = 0, 0
word_concat_size = 3
max_shown = 10  # number of answers inspected per problem

for i, key in enumerate(dataset):
    acc, f1 = 0, 0
    evaluated = 0
    problem = dataset[key]
    for j, (user_answer, ground_truth) in enumerate(tqdm(zip(problem.user_answers, problem.ground_truths))):
        if j == max_shown:
            break
        split_answer = user_answer.split(' ')
        tokenized_answer = [
            ' '.join(split_answer[k : k + word_concat_size])
            for k in range(len(split_answer) - word_concat_size + 1)
        ]
        if len(split_answer) < word_concat_size:
            # Answer shorter than one window: score the whole answer as a single chunk.
            tokenized_answer.append(' '.join(split_answer))
        tokenized_answer_embedding = model.encode(tokenized_answer)
        similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
        predicts = []
        # For each keyword (row), look at the window with the highest similarity.
        for z, idx in enumerate(similarity_scores.argmax(axis=1)):
            if threshold < similarity_scores[z][idx]:
                print(f"keyword          : {problem.keywords[z]}")
                print(f"detected keyword : {tokenized_answer[idx]}")
                predicts.append(1)
            else:
                predicts.append(0)
        print(f"실제 정답 : {problem.user_correct_keywords[j]}")
        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=1)
        evaluated += 1
    # Average over the answers actually scored. Dividing by len(problem.user_answers)
    # under-reported both metrics whenever the loop stopped after `max_shown` answers.
    denominator = max(evaluated, 1)
    print(f"{i}번 문제 점수 : accuracy : {acc / denominator}, f1-score : {f1 / denominator}")
    total_acc += acc / denominator
    total_f1 += f1 / denominator
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

---
### 처음에 tokenized 되면서 영어들이 무시되었기 때문에 영어로 구성된 키워드들을 한글로 변형 시켰지만 스윽 보니 이제 영어도 잘 찾네요!!
### 그럼 다시 이전 키워드들로 변경해서 성능을 비교해봅시다
---

In [None]:
# Put the original (English / annotated) keywords back and re-encode each affected
# problem so the embeddings match the keyword lists again.
print(dataset['recXfKthnQLwWETgb'].keywords)
dataset['recXfKthnQLwWETgb'].keywords[0] = "Lifecycle"  # "만료" -> "Lifecycle"
print(dataset['recXfKthnQLwWETgb'].keywords)
dataset['recXfKthnQLwWETgb'].keywords_embedding = model.encode(dataset['recXfKthnQLwWETgb'].keywords)  # problem 1
print(dataset['recUcGjT9Xkb7N5pu'].keywords)
dataset['recUcGjT9Xkb7N5pu'].keywords[2] = "삭제(POP)"  # "삭제" -> "삭제(POP)"
dataset['recUcGjT9Xkb7N5pu'].keywords[3] = "삽입(PUSH)"  # "삽입" -> "삽입(PUSH)"
print(dataset['recUcGjT9Xkb7N5pu'].keywords)
dataset['recUcGjT9Xkb7N5pu'].keywords_embedding = model.encode(dataset['recUcGjT9Xkb7N5pu'].keywords)  # problem 10
print(dataset['rec1QvAvB4CMami3p'].keywords)
dataset['rec1QvAvB4CMami3p'].keywords[2] = 'Stack'  # "스택" -> "Stack"
dataset['rec1QvAvB4CMami3p'].keywords_embedding = model.encode(dataset['rec1QvAvB4CMami3p'].keywords)  # problem 7
print(dataset['rec1QvAvB4CMami3p'].keywords)

In [None]:
# Similarity evaluation with 3-word sliding windows (threshold 0.35), run against
# whatever keywords are currently stored in `dataset`.
threshold = 0.35
total_acc, total_f1 = 0, 0
word_concat_size = 3

for i, key in enumerate(dataset):
    problem = dataset[key]
    acc, f1 = 0, 0
    for user_answer, ground_truth in tqdm(zip(problem.user_answers, problem.ground_truths)):
        split_answer = user_answer.split(' ')
        if len(split_answer) < word_concat_size:
            # Answer shorter than one window: score the whole answer as a single chunk.
            tokenized_answer = [' '.join(split_answer)]
        else:
            tokenized_answer = [
                ' '.join(split_answer[start : start + word_concat_size])
                for start in range(len(split_answer) - word_concat_size + 1)
            ]
        tokenized_answer_embedding = model.encode(tokenized_answer)
        similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
        # A keyword is predicted when its best-matching window exceeds the threshold.
        predicts = [int(row.max() > threshold) for row in similarity_scores]

        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=1)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

---
### 1번 문제 : 만료 -> Lifecycle [성능 소폭 하락]
### 7번 문제 : Stack -> 스택 -> Stack [성능 상승]
### 10번 문제 : 삭제 -> 삭제(POP), 삽입 -> 삽입(PUSH) [성능 하락]
---

---
### 이번에는 여러개의 키워드라면 쉼표로 구분해서 넣어줘보자
---

In [None]:
# Store several candidate wordings for one keyword as a single comma-separated string,
# then re-encode each affected problem.
print(dataset['recXfKthnQLwWETgb'].keywords)
dataset['recXfKthnQLwWETgb'].keywords[0] = "만료, Lifecycle"
print(dataset['recXfKthnQLwWETgb'].keywords)
dataset['recXfKthnQLwWETgb'].keywords_embedding = model.encode(dataset['recXfKthnQLwWETgb'].keywords)  # problem 1
print(dataset['recUcGjT9Xkb7N5pu'].keywords)
dataset['recUcGjT9Xkb7N5pu'].keywords[2] = "삭제, POP"  # comma-joined candidates: "삭제" and "POP"
dataset['recUcGjT9Xkb7N5pu'].keywords[3] = "삽입, PUSH"  # comma-joined candidates: "삽입" and "PUSH"
print(dataset['recUcGjT9Xkb7N5pu'].keywords)
dataset['recUcGjT9Xkb7N5pu'].keywords_embedding = model.encode(dataset['recUcGjT9Xkb7N5pu'].keywords)  # problem 10
print(dataset['rec1QvAvB4CMami3p'].keywords)
dataset['rec1QvAvB4CMami3p'].keywords[2] = 'Stack'  # single candidate only; no comma-joined alternative is set here
dataset['rec1QvAvB4CMami3p'].keywords_embedding = model.encode(dataset['rec1QvAvB4CMami3p'].keywords)  # problem 7
print(dataset['rec1QvAvB4CMami3p'].keywords)

In [None]:
# Similarity evaluation with 3-word sliding windows (threshold 0.35), run against
# whatever keywords are currently stored in `dataset`.
threshold = 0.35
total_acc, total_f1 = 0, 0
word_concat_size = 3

for i, key in enumerate(dataset):
    problem = dataset[key]
    acc, f1 = 0, 0
    for user_answer, ground_truth in tqdm(zip(problem.user_answers, problem.ground_truths)):
        split_answer = user_answer.split(' ')
        if len(split_answer) < word_concat_size:
            # Answer shorter than one window: score the whole answer as a single chunk.
            tokenized_answer = [' '.join(split_answer)]
        else:
            tokenized_answer = [
                ' '.join(split_answer[start : start + word_concat_size])
                for start in range(len(split_answer) - word_concat_size + 1)
            ]
        tokenized_answer_embedding = model.encode(tokenized_answer)
        similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
        # A keyword is predicted when its best-matching window exceeds the threshold.
        predicts = [int(row.max() > threshold) for row in similarity_scores]

        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=1)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

---
### 꽤 유의미한 결과다!!
### 마지막으로 word concat size를 2로 줄여서 두개의 단어씩만 비교해보자
---

In [None]:
# Preview the first 2-word sliding windows built from one random answer.
problem_id = random.choice(list(dataset.keys()))

split_answer = random.choice(dataset[problem_id].user_answers).split(' ')
word_concat_size = 2
# "+ 1" so the final window (the one ending on the last word) is included,
# matching the windowing used by the evaluation cells.
tokenized_answer = [
    ' '.join(split_answer[k : k + word_concat_size])
    for k in range(len(split_answer) - word_concat_size + 1)
]
if not tokenized_answer:
    # Answer shorter than one window: use the whole answer as a single chunk.
    tokenized_answer.append(' '.join(split_answer))
print(tokenized_answer[:3])

In [None]:
# Similarity evaluation with the window narrowed back to 2 words (threshold 0.35),
# run against whatever keywords are currently stored in `dataset`.
threshold = 0.35
total_acc, total_f1 = 0, 0
word_concat_size = 2

for i, key in enumerate(dataset):
    problem = dataset[key]
    acc, f1 = 0, 0
    for user_answer, ground_truth in tqdm(zip(problem.user_answers, problem.ground_truths)):
        split_answer = user_answer.split(' ')
        if len(split_answer) < word_concat_size:
            # Answer shorter than one window: score the whole answer as a single chunk.
            tokenized_answer = [' '.join(split_answer)]
        else:
            tokenized_answer = [
                ' '.join(split_answer[start : start + word_concat_size])
                for start in range(len(split_answer) - word_concat_size + 1)
            ]
        tokenized_answer_embedding = model.encode(tokenized_answer)
        similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
        # A keyword is predicted when its best-matching window exceeds the threshold.
        predicts = [int(row.max() > threshold) for row in similarity_scores]

        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=1)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

---
### 전체 정확도는 조금 떨어졌지만 분명 2개씩 봤을 때 성능이 더 좋은 문제들이 있다.
### 다음에는 윈도우를 2개씩 보는 방식과 3개씩 보는 방식을 함께 쓰고, 여러 키워드 후보를 쉼표로 묶어 한 번에 임베딩하는 대신 후보별로 따로 비교하면 성능이 꽤 좋아질 것 같다.
---

### 마지막으로 윈도우 사이즈를 2개와 3개를 합쳐서 해보자!

In [None]:
# Preview the combined 2-word and 3-word sliding windows built from one random answer.
problem_id = random.choice(list(dataset.keys()))
split_answer = random.choice(dataset[problem_id].user_answers).split(' ')
# Start from an empty list: without this reset the windows were appended to the
# `tokenized_answer` left over from an earlier cell, so the preview showed stale data.
tokenized_answer = []
word_concat_size = 2
for k in range(len(split_answer) - word_concat_size + 1):
    tokenized_answer.append(' '.join(split_answer[k : k + word_concat_size]))
word_concat_size = 3
for k in range(len(split_answer) - word_concat_size + 1):
    tokenized_answer.append(' '.join(split_answer[k : k + word_concat_size]))
print(tokenized_answer[:3])
print(tokenized_answer[-3:])

In [None]:
# Similarity evaluation using both 2-word and 3-word sliding windows of each answer
# (threshold 0.35).
threshold = 0.35
total_acc, total_f1 = 0, 0

for i, key in enumerate(dataset):
    problem = dataset[key]
    acc, f1 = 0, 0
    for user_answer, ground_truth in tqdm(zip(problem.user_answers, problem.ground_truths)):
        split_answer = user_answer.split(' ')
        tokenized_answer = []
        # All 2-word windows first, then all 3-word windows.
        for word_concat_size in (2, 3):
            tokenized_answer.extend(
                ' '.join(split_answer[start : start + word_concat_size])
                for start in range(len(split_answer) - word_concat_size + 1)
            )
        if not tokenized_answer:
            # Answer too short for any window: score the whole answer as a single chunk.
            tokenized_answer = [' '.join(split_answer)]
        tokenized_answer_embedding = model.encode(tokenized_answer)
        similarity_scores = cosine_similarity(problem.keywords_embedding, tokenized_answer_embedding)
        # A keyword is predicted when its best-matching window exceeds the threshold.
        predicts = [int(row.max() > threshold) for row in similarity_scores]

        acc += accuracy_score(ground_truth, predicts)
        f1 += f1_score(ground_truth, predicts, zero_division=1)
    n_answers = len(problem.user_answers)
    print(f"{i}번 문제 점수 : accuracy : {acc / n_answers}, f1-score : {f1 / n_answers}")
    total_acc += acc / n_answers
    total_f1 += f1 / n_answers
print(f"전체 문제 평균 accuracy : {total_acc / len(dataset)}, f1-score : {total_f1 / len(dataset)}")

### 하지만 이런 방식은 너무 느리다. GPU를 사용한다해도 이는 감당이 안될 수 있다.
### 그리고 정확도도 그리 좋아지지 않았다.