# YAKE

In [None]:
# !pip install yake

In [None]:
import yake

def extract_keywords_yake(text, max_keywords=5):
    # YAKE 키워드 추출기 생성
    kw_extractor = yake.KeywordExtractor()
    # 텍스트에서 키워드 추출
    keywords = kw_extractor.extract_keywords(text)
    # 상위 max_keywords 개의 키워드 반환
    return [kw[0] for kw in keywords[:max_keywords]]

# 예제 텍스트
text = "KeyBERT is a minimal and easy-to-use keyword extraction technique."

# YAKE를 사용하여 키워드 추출
keywords = extract_keywords_yake(text)
print("Extracted Keywords:", keywords)

# 검증
expected_keywords = ["KeyBERT", "keyword", "extraction", "technique", "easy-to-use"]
print("Expected Keywords:", expected_keywords)

# 간단한 검증
correct_keywords = set(keywords) & set(expected_keywords)
print("Correct Keywords:", correct_keywords)


# TF-IDF

In [None]:
# sudo apt install default-jdk: ubuntu에서 java를 찾을 수 없을 때
# !pip install konlpy

In [1]:
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
okt = Okt()

In [3]:
docs = ["파이썬에서 다중 라인 문자열의 사용 예는 무엇인가요"]
text = ' '.join(docs)

In [4]:
# 명사 추출
nouns = okt.nouns(text)
processed_text = ' '.join(nouns)
print(processed_text)

# TF-IDF 벡터
vectorizer = TfidfVectorizer()

# 텍스트를 TF-IDF벡터로 변환
tfidf_matrix = vectorizer.fit_transform([processed_text])
print(tfidf_matrix)

# 단어와 그에 대한 TF-IDF값 얻기
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
tfidf_values = tfidf_matrix.toarray().tolist()[0]
print(tfidf_values)

# 높은 TF-IDF값을 가진 단어 순서대로 정렬
keywords = [feature_names[i] for i in sorted(range(len(tfidf_values)), key=lambda k: tfidf_values[k], reverse=True)]
print(keywords)

파이썬 다중 라인 문자열 사용 예 무엇
  (0, 5)	0.4082482904638631
  (0, 0)	0.4082482904638631
  (0, 1)	0.4082482904638631
  (0, 3)	0.4082482904638631
  (0, 4)	0.4082482904638631
  (0, 2)	0.4082482904638631
['다중' '라인' '무엇' '문자열' '사용' '파이썬']
[0.4082482904638631, 0.4082482904638631, 0.4082482904638631, 0.4082482904638631, 0.4082482904638631, 0.4082482904638631]
['다중', '라인', '무엇', '문자열', '사용', '파이썬']


# keybert + kiwi
- https://datainclude.me/posts/Keybert%EC%99%80_kiwi%ED%98%95%ED%83%9C%EC%86%8C%EB%B6%84%EC%84%9D%EA%B8%B0%EB%A5%BC_%EC%82%AC%EC%9A%A9%ED%95%98%EC%97%AC_%ED%82%A4%EC%9B%8C%EB%93%9C%EC%B6%94%EC%B6%9C_%ED%95%98%EA%B8%B0/

In [None]:
# !pip install keybert
# !pip install kiwipiepy

In [21]:
import pandas as pd
from keybert import KeyBERT
from transformers import BertModel
from kiwipiepy import Kiwi

text = '데이터프레임에서 특정 열을 추출하려면 어떻게 해야 하나요'

model = BertModel.from_pretrained('skt/kobert-base-v1')
kw_model = KeyBERT(model)
keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=10)

kiwi = Kiwi()
kiwi.analyze(text)

# 명사 추출 함수
def noun_extractor(text):
    results = []
    result = kiwi.analyze(text)
    for token, pos, _, _ in result[0][0]:
        if len(token) != 1 and pos.startswith('N') or pos.startswith('SL'):
            results.append(token)
    return results

nouns = noun_extractor(text)

text = ' '.join(nouns)

keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=1)

keywords

[('데이터', 0.8142)]

In [9]:
import pandas as pd
from keybert import KeyBERT
from transformers import BertModel
from kiwipiepy import Kiwi

# 모델 선택
model = BertModel.from_pretrained('skt/kobert-base-v1')
kw_model = KeyBERT(model)
kiwi = Kiwi()


In [23]:
text = 'range() 함수를 사용한 반복문의 예를 들어보세요.'

# 명사 추출 함수
def noun_extractor(text):
    results = []
    result = kiwi.analyze(text)
    for token, pos, _, _ in result[0][0]:
        if len(token) != 1 and pos.startswith('N') or pos.startswith('SL'):
            results.append(token)
    return results

nouns = noun_extractor(text)

text = ' '.join(nouns)

keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=20)

keywords

[('range', 0.6023), ('반복', 0.5649), ('사용', 0.2457), ('함수', 0.2364)]

In [8]:
import pandas as pd
from keybert import KeyBERT
from transformers import BertModel
from kiwipiepy import Kiwi

# 모델 선택
model = BertModel.from_pretrained('skt/kobert-base-v1')
kw_model = KeyBERT(model)
kiwi = Kiwi()

# 데이터 불러오기
df = pd.read_excel('../fasttext/data/train_set.xlsx')

# 불용어 파일 불러오기
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# 명사 추출 함수
def noun_extractor(text):
    results = []
    result = kiwi.analyze(text)
    for token, pos, _, _ in result[0][0]:
        if len(token) != 1 and (pos.startswith('N') or pos.startswith('SL')):
            results.append(token)
    return results

# 키워드 추출 함수
def extract_keywords_from_df(df, stopwords, kw_model):
    keywords_list = []

    for _, row in df.iterrows():
        if row['label'] == 'yes':
            noun = noun_extractor(row['question'])
            text_noun = ' '.join(noun)
            keywords = kw_model.extract_keywords(text_noun, keyphrase_ngram_range=(1, 1), stop_words=stopwords, top_n=20)
            keywords_list.append(', '.join([kw[0] for kw in keywords]))
        else:
            keywords_list.append('') 
    return keywords_list


# 키워드 추출
df['keyword'] = extract_keywords_from_df(df, stopwords, kw_model)

# 결과를 엑셀 파일로 저장
df.to_excel('keyword.xlsx', index=False)

print("키워드 추출 및 저장이 완료되었습니다.")


키워드 추출 및 저장이 완료되었습니다.


# RAKE

In [None]:
# !pip install rake-nltk

In [4]:
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

In [5]:
from rake_nltk import Rake

r = Rake()

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/bok/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [1]:
text = "Feature extraction is not that complex. There are many algorithms available that can help you with feature extraction. Rapid Automatic Key Word Extraction is one of those"

r.extract_keywords_from_text(text)

keywords_rake = r.get_ranked_phrases()

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/bok/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


# TextRank

In [25]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# GPT-2 모델과 토크나이저 로드
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPU 사용 가능 시 GPU로 모델 이동
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_response(question):
    inputs = tokenizer.encode(question, return_tensors='pt').to(device)
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [28]:
# 답변 생성이 아니라 질문에 추가되는 무언가를 생성이잖아
generate_response("How do I create a list in Python?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'How do I create a list in Python?\n\nYou can create a list using the following syntax:\n\nfrom list import list\n\nYou can also use the following syntax:\n\nfrom list import list\n\nYou can also use the following syntax:\n\nfrom list import list\n\nYou can also use the following syntax:\n\nfrom list import list\n\nYou can also use the following syntax:\n\nfrom list import list\n\nYou can also use the following syntax:\n\nfrom list import list\n\nYou can also use the following syntax:\n\nfrom list import list\n\nYou can also use the following syntax:\n\nfrom list import list\n\nYou can also use the following syntax:\n'

# Stopwords 만들기

In [63]:
import numpy as np
import pandas as pd

# 엑셀 파일 불러오기
key_df = pd.read_excel('keyword.xlsx')

# 결측치 제거
key_df = key_df[key_df['keyword'].notna()]

# 'keyword' 열을 리스트로 변환
keywords = key_df['keyword'].to_list()

# 쉼표와 공백으로 구분된 각 키워드를 개별 단어로 분리
split_keywords = []
for keyword in keywords:
    split_keywords.extend([k.strip() for k in keyword.split(',')])

# 중복 제거 및 각 단어를 작은따옴표로 감싸기
unique_keywords = list(set(split_keywords))
quoted_keywords = [f"'{word}'" for word in unique_keywords]

# 리스트 형태로 표현
result = f"[{', '.join(quoted_keywords)}]"

print(result)


['문제', '처리', '최신', '중요', '이스케이프', '튜플의', '키워드', 'interpreter', '텍스트', 'in', '바인딩', 'hex', '기능', 'max', '자료', 'enumerate', '메모리', '사용법', 'binding', '유형', 'constant', 'string', '용도', '해제', 'dictionary', '구분', 'sqlite', '이유', '클래스', '의미', '관습', '중복', 'iterator', '차원', '개수', '삽입', '참조', 'codecs', 'first', 'globals', 'self', 'any', '소개', 'dict', 'copy', 'is', '체크', '정의', 'else', '누락', '파일', 'py', '컴프리헨션', '특정', '최소', '순회', 'bin', 'rule', '시각', 'for', '예약어', '순서', '일급', '라인', '철학', 'scope', 'duck', '람다', '주의', '할당', '데이터', 'locals', '복사', '배열', '지정', '횟수', '내부', '선택', 'dynamic', 'sorted', '타입', '예시', 'datetime', '메타', '통계', '컴파일', 'named', '호출', '인덱스', '객체', '표준', '연산자', 'xml', '반복', '커링', 'defaultdict', '차이', '역할', '콤마', '패킹', 'closure', '카운터', '소수점', 'issubclass', 're', '모델', '타이핑', '제어', '출력', '상황', 'random', '기본', '역순', '라이브러리', 'xhh', 'array', '패키지', 'unittest', '누적', 'min', 'language', '요약', 'not', '정수', '해결', '힌팅', '부울', 'bytes', '형식', '스크립트', 'context', '프레임', 'gil', '엑셀', 'differenc

In [73]:
if '데이터 프레임' in result:
    print('yes')
else:
    print('no')

no
