In [1]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import torch
import networkx as nx
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import BertTokenizer, BertModel
from nltk.corpus import stopwords
from itertools import combinations
from konlpy.tag import Okt
from langchain.text_splitter import RecursiveCharacterTextSplitter
from IPython.display import display

In [2]:
import nltk
print(nltk.data.path)

['/Users/choiwonjun/nltk_data', '/Users/choiwonjun/.local/share/virtualenvs/keyword_extraction_test-7wgJIzyW/nltk_data', '/Users/choiwonjun/.local/share/virtualenvs/keyword_extraction_test-7wgJIzyW/share/nltk_data', '/Users/choiwonjun/.local/share/virtualenvs/keyword_extraction_test-7wgJIzyW/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [3]:
# NLTK 데이터 다운로드 (TextRank에서 필요)
nltk.download("punkt")


# 분석할 폴더 경로 설정
TEXT_FOLDER = "../../data/processed_text"  # 폴더 경로 수정

print("📂 현재 작업 폴더:", os.getcwd())

📂 현재 작업 폴더: /Users/choiwonjun/nebula/nebula-ai-research/keyword_extraction_test/notebooks


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/choiwonjun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# 정규식 기반 한국어와 영어 분리 함수
def separate_korean_english(text):
    """ 한국어와 영어 문장을 분리하는 함수 """
    korean_text = " ".join(re.findall(r'[가-힣]+', text))
    english_text = " ".join(re.findall(r'[a-zA-Z]+', text))
    return korean_text, english_text

# 한국어 불용어 로드 함수
def load_korean_stopwords(file_name="ko_stopwords.txt"):
    stop_words = set()
    if os.path.exists(file_name):
        with open(file_name, "r", encoding="utf-8") as f:
            stop_words.update(f.read().splitlines())
    
    return stop_words

# 불용어 제거 함수
def remove_stopwords(tokens, language="en"):
    if language == "en":
        stop_words = set(stopwords.words("english"))
    else:
        stop_words = load_korean_stopwords()
    return [token for token in tokens if token not in stop_words]

# 토큰화 함수
def tokenize(text):
    okt = Okt()
    korean_text, english_text = separate_korean_english(text)

    # 한국어 형태소 분석 (명사 + 의미 있는 단어 유지)
    korean_tokens = [word for word, pos in okt.pos(korean_text) if pos in ["Noun", "Alpha"]] if korean_text else []
    
    # 영어 토큰화
    english_tokens = word_tokenize(english_text) if english_text else []

    # 불용어 제거
    korean_tokens = remove_stopwords(korean_tokens, language='ko')
    english_tokens = remove_stopwords(english_tokens, language='en')

    return korean_tokens + english_tokens

In [5]:
## TF-IDF 기반 키워드 추출
def extract_keywords_tfidf(text, top_n=5):
    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    tfidf_matrix = vectorizer.fit_transform([text])

    scores = tfidf_matrix.toarray()[0]
    words = vectorizer.get_feature_names_out()

    keyword_scores = sorted(zip(words, scores), key=lambda x: x[1], reverse=True)
    return [word for word, score in keyword_scores[:top_n]]

In [6]:
## TF-IDF 기반 키워드 추출 - Stopword 추가
def extract_keywords_tfidf_with_stopwords(text, top_n=5, chunk_size=500):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    chunks = text_splitter.split_text(text)
    
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        token_pattern=None,
        ngram_range=(1, 2)
    )
    
    tfidf_matrix = vectorizer.fit_transform(chunks)
    scores = np.mean(tfidf_matrix.toarray(), axis=0)  # 각 청크의 TF-IDF 평균값 계산

    words = vectorizer.get_feature_names_out()

    keyword_scores = sorted(zip(words, scores), key=lambda x: x[1], reverse=True)
    return [word for word, score in keyword_scores[:top_n]]

In [7]:
## TextRank 기반 키워드 추출
def extract_keywords_textrank(text, top_n=5):
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalnum()]

    word_graph = nx.Graph()
    for w1, w2 in combinations(set(words), 2):
        word_graph.add_edge(w1, w2)

    scores = nx.pagerank(word_graph)
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [word for word, score in sorted_words[:top_n]]


In [8]:
## TextRank 기반 키워드 추출  - Stopword 추가
def extract_keywords_textrank_with_stopwords(text, top_n=5):
    words = tokenize(text)
    words = [word.lower() for word in words if word.isalnum()]

    word_graph = nx.Graph()
    for w1, w2 in combinations(set(words), 2):
        word_graph.add_edge(w1, w2)

    scores = nx.pagerank(word_graph)
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [word for word, score in sorted_words[:top_n]]


In [9]:
## LDA 기반 키워드 추출
def extract_keywords_lda(text, num_topics=1, top_n=5):
    vectorizer = CountVectorizer(stop_words="english")
    term_matrix = vectorizer.fit_transform([text])

    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(term_matrix)

    words = vectorizer.get_feature_names_out()
    topics = lda_model.components_

    topic_keywords = [words[i] for i in topics[0].argsort()[-top_n:]]
    return topic_keywords

In [10]:
## LDA 기반 키워드 추출 - Stopword 추가
def extract_keywords_lda_with_stopwords(text, num_topics=1, top_n=5):
    vectorizer = CountVectorizer(tokenizer=tokenize, token_pattern=None)  
    term_matrix = vectorizer.fit_transform([text])

    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(term_matrix)

    words = vectorizer.get_feature_names_out()
    topics = lda_model.components_

    topic_keywords = [words[i] for i in topics[0].argsort()[-top_n:]]  
    return topic_keywords


In [11]:
def extract_keywords_bert(text, top_n=5):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    token_weights = outputs.last_hidden_state.mean(dim=2).squeeze().detach().numpy()

    keyword_scores = sorted(zip(tokens, token_weights), key=lambda x: x[1], reverse=True)
    return [word for word, score in keyword_scores[:top_n]]


In [None]:
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
import torch

TOKENIZER_BERT_MULTI = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
MODEL_BERT_MULTI = BertModel.from_pretrained("bert-base-multilingual-cased")

TOKENIZER_KOBERT = AutoTokenizer.from_pretrained("monologg/kobert")
MODEL_KOBERT = AutoModel.from_pretrained("monologg/kobert")


In [3]:
def extract_keywords_bert_multilingual_with_custom_tokenizer(text, top_n=5):
    custom_tokens = tokenize(text)
    processed_text = " ".join(custom_tokens)

    inputs = TOKENIZER_BERT_MULTI(processed_text, return_tensors="pt", truncation=True, max_length=512)
    
    with torch.no_grad():  
        outputs = MODEL_BERT_MULTI(**inputs)

    tokens = TOKENIZER_BERT_MULTI.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    token_weights = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    keyword_scores = sorted(zip(tokens, token_weights), key=lambda x: x[1], reverse=True)

    merged_keywords = []
    current_word = ""

    for word, score in keyword_scores:
        if word.startswith("##"):  # 서브워드이면 기존 단어에 붙이기
            current_word += word[2:]
        else:  
            if current_word:  # 기존 단어가 있다면 저장
                merged_keywords.append(current_word)
            current_word = word  # 새로운 단어 시작

    if current_word:  # 마지막 단어 추가
        merged_keywords.append(current_word)

    filtered_keywords = [word for word in merged_keywords if word.isalnum()][:top_n]

    return filtered_keywords


In [None]:
def extract_keywords_bert_multilingual_basic(text, top_n=5):
    inputs = TOKENIZER_BERT_MULTI(text, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():  
        outputs = MODEL_BERT_MULTI(**inputs)

    tokens = TOKENIZER_BERT_MULTI.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    token_weights = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    keyword_scores = sorted(zip(tokens, token_weights), key=lambda x: x[1], reverse=True)

    merged_keywords = []
    current_word = ""

    for word, score in keyword_scores:
        if word.startswith("##"):  # 서브워드이면 기존 단어에 붙이기
            current_word += word[2:]
        else:  
            if current_word:  # 기존 단어가 있다면 저장
                merged_keywords.append(current_word)
            current_word = word  # 새로운 단어 시작

    if current_word:  # 마지막 단어 추가
        merged_keywords.append(current_word)

    filtered_keywords = [word for word in merged_keywords if word.isalnum()][:top_n]

    return filtered_keywords


In [None]:
def extract_keywords_kobert_basic(text, top_n=5):
    inputs = TOKENIZER_KOBERT(text, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():  
        outputs = MODEL_KOBERT(**inputs)

    tokens = TOKENIZER_KOBERT.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    token_weights = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    keyword_scores = sorted(zip(tokens, token_weights), key=lambda x: x[1], reverse=True)

    merged_keywords = []
    current_word = ""

    for word, score in keyword_scores:
        if word.startswith("##"):  # 서브워드이면 기존 단어에 붙이기
            current_word += word[2:]
        else:  
            if current_word:  # 기존 단어가 있다면 저장
                merged_keywords.append(current_word)
            current_word = word  # 새로운 단어 시작

    if current_word:  # 마지막 단어 추가
        merged_keywords.append(current_word)

    filtered_keywords = [word for word in merged_keywords if word.isalnum()][:top_n]

    return filtered_keywords


In [None]:
def extract_keywords_kobert_with_custom_tokenizer(text, top_n=5):
    custom_tokens = tokenize(text)  # 사용자 정의 토크나이저 적용
    processed_text = " ".join(custom_tokens)  # 토큰을 문자열로 변환

    inputs = TOKENIZER_KOBERT(processed_text, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():  
        outputs = MODEL_KOBERT(**inputs)

    tokens = TOKENIZER_KOBERT.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    token_weights = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    keyword_scores = sorted(zip(tokens, token_weights), key=lambda x: x[1], reverse=True)

    merged_keywords = []
    current_word = ""

    for word, score in keyword_scores:
        if word.startswith("##"):  # 서브워드이면 기존 단어에 붙이기
            current_word += word[2:]
        else:  
            if current_word:  # 기존 단어가 있다면 저장
                merged_keywords.append(current_word)
            current_word = word  # 새로운 단어 시작

    if current_word:  # 마지막 단어 추가
        merged_keywords.append(current_word)

    filtered_keywords = [word for word in merged_keywords if word.isalnum()][:top_n]

    return filtered_keywords


In [1]:
import time
import os
import pandas as pd

# 실행 시간 저장용 리스트
execution_times = []

# 결과 데이터 저장용 리스트
data = []
tf_idf_data = []
textrank_data = []
lda_data = []
bert_data = []

# 총 파일 개수
file_list = [f for f in os.listdir(TEXT_FOLDER) if f.endswith(".txt")]
total_files = len(file_list)

print(f"📂 총 {total_files}개의 파일을 처리합니다.\n")

for i, filename in enumerate(file_list):
    file_path = os.path.join(TEXT_FOLDER, filename)

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    print(f"🔍 [{i+1}/{total_files}] {filename} 처리 중...")

    # 실행 시간 기록을 위한 딕셔너리
    file_exec_times = {"파일명": filename}
    file_start_time = time.time()

    # ✅ 기존 방식
    print("⏳ TF-IDF (기존) 실행 중...")
    start = time.time()
    tfidf_keywords = extract_keywords_tfidf(text)
    file_exec_times["TF-IDF (기존)"] = round(time.time() - start, 3)

    print("⏳ TextRank (기존) 실행 중...")
    start = time.time()
    textrank_keywords = extract_keywords_textrank(text)
    file_exec_times["TextRank (기존)"] = round(time.time() - start, 3)

    print("⏳ LDA (기존) 실행 중...")
    start = time.time()
    lda_keywords = extract_keywords_lda(text)
    file_exec_times["LDA (기존)"] = round(time.time() - start, 3)

    print("⏳ BERT (기존) 실행 중...")
    start = time.time()
    bert_keywords = extract_keywords_bert(text)
    file_exec_times["BERT (기존)"] = round(time.time() - start, 3)

    print("⏳ BERT multilingual (기존) 실행 중...")
    start = time.time()
    bert_multilingual_keywords = extract_keywords_bert_multilingual_basic(text)
    file_exec_times["BERT multilingual (기존)"] = round(time.time() - start, 3)

    print("⏳ koBERT (기존) 실행 중...")
    start = time.time()
    kobert_keywords = extract_keywords_kobert_basic(text)
    file_exec_times["koBERT (기존)"] = round(time.time() - start, 3)

    # ✅ 토크나이저 적용 버전
    print("⏳ TF-IDF (토크나이저 적용) 실행 중...")
    start = time.time()
    tfidf_keywords_tokenized = extract_keywords_tfidf_with_stopwords(text)
    file_exec_times["TF-IDF (토크나이저 적용)"] = round(time.time() - start, 3)

    print("⏳ TextRank (토크나이저 적용) 실행 중...")
    start = time.time()
    textrank_keywords_tokenized = extract_keywords_textrank_with_stopwords(text)
    file_exec_times["TextRank (토크나이저 적용)"] = round(time.time() - start, 3)

    print("⏳ LDA (토크나이저 적용) 실행 중...")
    start = time.time()
    lda_keywords_tokenized = extract_keywords_lda_with_stopwords(text)
    file_exec_times["LDA (토크나이저 적용)"] = round(time.time() - start, 3)

    print("⏳ BERT multilingual (토크나이저 적용) 실행 중...")
    start = time.time()
    bert_keywords_multilingual_tokenized = extract_keywords_bert_multilingual_with_custom_tokenizer(text)
    file_exec_times["BERT multilingual (토크나이저 적용)"] = round(time.time() - start, 3)

    print("⏳ koBERT (토크나이저 적용) 실행 중...")
    start = time.time()
    kobert_keywords_tokenized = extract_keywords_kobert_with_custom_tokenizer(text)
    file_exec_times["koBERT (토크나이저 적용)"] = round(time.time() - start, 3)

    # 📌 실행 시간 리스트에 추가
    execution_times.append(file_exec_times)

    # 📌 통합 결과 DataFrame용
    data.append({
        "파일명": filename,
        "TF-IDF (기존)": ", ".join(tfidf_keywords),
        "TF-IDF (토크나이저 적용)": ", ".join(tfidf_keywords_tokenized),
        "TextRank (기존)": ", ".join(textrank_keywords),
        "TextRank (토크나이저 적용)": ", ".join(textrank_keywords_tokenized),
        "LDA (기존)": ", ".join(lda_keywords),
        "LDA (토크나이저 적용)": ", ".join(lda_keywords_tokenized),
        "BERT (기존)": ", ".join(bert_keywords),
        "BERT multilingual (기존)": ", ".join(bert_multilingual_keywords),
        "koBERT (기존)": ", ".join(kobert_keywords),
        "BERT multilingual (토크나이저 적용)": ", ".join(bert_keywords_multilingual_tokenized),
        "koBERT (토크나이저 적용)": ", ".join(kobert_keywords_tokenized),
    })

    # 📌 개별 알고리즘별 DataFrame 추가
    tf_idf_data.append({
        "파일명": filename,
        "TF-IDF (기존)": ", ".join(tfidf_keywords),
        "TF-IDF (토크나이저 적용)": ", ".join(tfidf_keywords_tokenized),
    })

    textrank_data.append({
        "파일명": filename,
        "TextRank (기존)": ", ".join(textrank_keywords),
        "TextRank (토크나이저 적용)": ", ".join(textrank_keywords_tokenized),
    })

    lda_data.append({
        "파일명": filename,
        "LDA (기존)": ", ".join(lda_keywords),
        "LDA (토크나이저 적용)": ", ".join(lda_keywords_tokenized),
    })

    bert_data.append({
        "파일명": filename,
        "BERT (기존)": ", ".join(bert_keywords),
        "BERT multilingual (기존)": ", ".join(bert_multilingual_keywords),
        "koBERT (기존)": ", ".join(kobert_keywords),
        "BERT multilingual (토크나이저 적용)": ", ".join(bert_keywords_multilingual_tokenized),
        "koBERT (토크나이저 적용)": ", ".join(kobert_keywords_tokenized),
    })

    file_end_time = time.time()
    print(f"✅ {filename} 처리 완료! 총 소요 시간: {round(file_end_time - file_start_time, 3)}초\n")

# 📊 DataFrame 생성
df_results = pd.DataFrame(data)
df_tf_idf_results = pd.DataFrame(tf_idf_data)
df_textrank_results = pd.DataFrame(textrank_data)
df_lda_results = pd.DataFrame(lda_data)
df_bert_results = pd.DataFrame(bert_data)

# 📊 실행 시간 비교 DataFrame
df_execution_times = pd.DataFrame(execution_times)

print("🚀 모든 파일 처리 완료!")

NameError: name 'TEXT_FOLDER' is not defined

In [None]:
display(df_results)

In [None]:
display(df_tf_idf_results)


In [None]:
display(df_textrank_results)


In [None]:
display(df_lda_results)


In [None]:
display(df_bert_results)


In [None]:
display(df_execution_times)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 📌 실행 시간 데이터 확인
display(df_execution_times)

# 📊 그래프 스타일 설정
plt.style.use("ggplot")
plt.rcParams["axes.labelsize"] = 12
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["xtick.labelsize"] = 10
plt.rcParams["ytick.labelsize"] = 10

# 📌 실행 시간 데이터 가공
df_exec_time_melted = df_execution_times.melt(id_vars=["파일명"], var_name="알고리즘", value_name="실행 시간")

# ✅ 1. 막대 그래프 (Bar Chart) - 알고리즘별 평균 실행 시간 비교
plt.figure(figsize=(12, 6))
sns.barplot(x="알고리즘", y="실행 시간", data=df_exec_time_melted, ci=None)
plt.xticks(rotation=45, ha="right")
plt.title("알고리즘별 평균 실행 시간 비교")
plt.xlabel("알고리즘")
plt.ylabel("평균 실행 시간 (초)")
plt.show()

# ✅ 2. 박스 플롯 (Box Plot) - 실행 시간의 변동성 확인
plt.figure(figsize=(12, 6))
sns.boxplot(x="알고리즘", y="실행 시간", data=df_exec_time_melted)
plt.xticks(rotation=45, ha="right")
plt.title("알고리즘별 실행 시간 분포")
plt.xlabel("알고리즘")
plt.ylabel("실행 시간 (초)")
plt.show()

# ✅ 3. 히트맵 (Heatmap) - 실행 시간 비교
plt.figure(figsize=(10, 8))
df_heatmap = df_execution_times.set_index("파일명")
sns.heatmap(df_heatmap, cmap="coolwarm", annot=True, fmt=".3f", linewidths=0.5)
plt.title("실행 시간 비교 히트맵")
plt.xlabel("알고리즘")
plt.ylabel("파일명")
plt.show()
