In [34]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import torch
import networkx as nx
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import BertTokenizer, BertModel
from itertools import combinations

In [35]:
import nltk
print(nltk.data.path)

['/Users/choiwonjun-macbook/nltk_data', '/Users/choiwonjun-macbook/.local/share/virtualenvs/keyword_extraction_test-fUJ8m5h5/nltk_data', '/Users/choiwonjun-macbook/.local/share/virtualenvs/keyword_extraction_test-fUJ8m5h5/share/nltk_data', '/Users/choiwonjun-macbook/.local/share/virtualenvs/keyword_extraction_test-fUJ8m5h5/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [36]:
# NLTK 데이터 다운로드 (TextRank에서 필요)
nltk.download("punkt")


# 분석할 폴더 경로 설정
TEXT_FOLDER = "../../data/processed_text"  # 폴더 경로 수정

print("📂 현재 작업 폴더:", os.getcwd())
# 데이터 저장용 리스트
data = []

📂 현재 작업 폴더: /Users/choiwonjun-macbook/nebula/nebula-ai-research/keyword_extraction_test/notebooks


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [37]:
def extract_keywords_tfidf(text, top_n=5):
    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    tfidf_matrix = vectorizer.fit_transform([text])

    scores = tfidf_matrix.toarray()[0]
    words = vectorizer.get_feature_names_out()

    keyword_scores = sorted(zip(words, scores), key=lambda x: x[1], reverse=True)
    return [word for word, score in keyword_scores[:top_n]]

In [38]:
## TextRank 기반 키워드 추출
def extract_keywords_textrank(text, top_n=5):
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalnum()]

    word_graph = nx.Graph()
    for w1, w2 in combinations(set(words), 2):
        word_graph.add_edge(w1, w2)

    scores = nx.pagerank(word_graph)
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [word for word, score in sorted_words[:top_n]]


In [39]:
## LDA 기반 키워드 추출
def extract_keywords_lda(text, num_topics=1, top_n=5):
    vectorizer = CountVectorizer(stop_words="english")
    term_matrix = vectorizer.fit_transform([text])

    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(term_matrix)

    words = vectorizer.get_feature_names_out()
    topics = lda_model.components_

    topic_keywords = [words[i] for i in topics[0].argsort()[-top_n:]]
    return topic_keywords

In [40]:
def extract_keywords_bert(text, top_n=5):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    token_weights = outputs.last_hidden_state.mean(dim=2).squeeze().detach().numpy()

    keyword_scores = sorted(zip(tokens, token_weights), key=lambda x: x[1], reverse=True)
    return [word for word, score in keyword_scores[:top_n]]


In [41]:
# 📂 폴더 내 모든 텍스트 파일 읽기 및 알고리즘 적용
for filename in os.listdir(TEXT_FOLDER):
    if filename.endswith(".txt"):
        file_path = os.path.join(TEXT_FOLDER, filename)

        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # 각 알고리즘 적용
        tfidf_keywords = extract_keywords_tfidf(text)
        # textrank_keywords = extract_keywords_textrank(text)
        lda_keywords = extract_keywords_lda(text)
        bert_keywords = extract_keywords_bert(text)

        # 📌 결과 DataFrame에 추가
        data.append({
            "파일명": filename,
            "TF-IDF": ", ".join(tfidf_keywords),
            # "TextRank": ", ".join(textrank_keywords),
            "LDA": ", ".join(lda_keywords),
            "BERT": ", ".join(bert_keywords),
        })

# 📊 DataFrame 생성 및 결과 확인
df_results = pd.DataFrame(data)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [43]:
from IPython.display import display

display(df_results)

Unnamed: 0,파일명,TF-IDF,LDA,BERT
0,blog_18.txt,"답글, 1개의, 그렇기에, 응원합니다, 나는","나는, 응원합니다, 그렇기에, 1개의, 답글","squeeze, squeeze, [CLS], 1, 3"
1,blog_19.txt,"fiber, flatmap, value, const, maybe","maybe, const, value, flatmap, fiber","[CLS], react, 8, 3, ran"
2,blog_20.txt,"ai, 최신, 있습니다, ai가, 기술","모델, ai가, 있습니다, 최신, ai","gemini, claude, ##ai, ##2, [CLS]"
3,blog_9.txt,"상태, const, 업데이트, null, update","update, null, 업데이트, const, 상태","react, [CLS], 19, 19, ##tate"
4,blog_8.txt,"line, wscellrowi, 엑셀, credit, import","import, subject_name, 엑셀, wscellrowi, line","[CLS], ##com, files, files, https"
5,blog_5.txt,"항해, 있습니다, 이런, 프론트엔드, 플러스","프론트엔드, 플러스, 이런, 있습니다, 항해","[CLS], 3, 3, ##ᄀ, 10"
6,blog_4.txt,"이미지, js, 거야, 쉽게, 이야기","쉽게, 거야, 이야기, js, 이미지","##loading, [CLS], lazy, 3, 8"
7,blog_6.txt,"typescript, 네이티브, 모든, 것입니다, typescript의","현재, 것입니다, 모든, 네이티브, typescript","faster, 16, [CLS], ##us, ##us"
8,blog_7.txt,"스켈레톤, 로딩, div, ui를, classname","classname, div, ui를, 로딩, 스켈레톤","[CLS], 6, 33, 5, 3"
9,blog_3.txt,"것이, 어떻게, 기술, 기업, 내가","하는, 기업, 기술, 어떻게, 것이","[CLS], golden, golden, 7, ##dus"
