In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import csv
import warnings

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from konlpy.tag import Okt
import time

class VectorSpaceModel:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.vectorized_documents = self.vectorizer.fit_transform([doc['article'] for doc in documents])

    def search(self, query):
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.vectorized_documents)        
        positive_similarities_indices = similarities[0] > 0
        ranked_indices = positive_similarities_indices.nonzero()[0]        
        ranked_similarities = similarities[0][ranked_indices]        
        ranked_indices = ranked_indices[ranked_similarities.argsort()[::-1]]
        ranked_similarities = sorted(ranked_similarities, reverse=True)
        return ranked_indices, ranked_similarities      

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)
    return ' '.join(word_tokenize(text))

def extract_keywords(article_text, num_clusters=5, num_keywords=5):
    okt = Okt()
    preprocessed_text = preprocess_text(article_text)
    words = okt.pos(preprocessed_text, stem=True)
    nouns = [word for word, pos in words if pos == 'Noun' and len(word) > 1]  # 명사이면서 길이가 1보다 긴 것만 선택
    preprocessed_text = ' '.join(nouns)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([preprocessed_text])

    num_samples = X.shape[0]
    if num_samples < num_clusters:
        num_clusters = num_samples

    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(X)

    cluster_centers = kmeans.cluster_centers_
    features = vectorizer.get_feature_names_out()
    top_keywords = []
    for cluster_center in cluster_centers:
        top_keyword_indices = cluster_center.argsort()[-num_keywords:][::-1]
        keywords = ['#' + features[int(i)] for i in top_keyword_indices]
        top_keywords.append(keywords)
    return ' '.join([' '.join(keywords) for keywords in top_keywords])

def save_articles_to_csv(documents, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Article'])
        for doc in documents:
            writer.writerow([doc['title'], doc['article']])

documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)

    for row in reader:
        title = row['title']
        article = row['article']
        date = row['date']
        documents.append({'title': title, 'date': date, 'article': article})

vector_space_model = VectorSpaceModel(documents)

query = input("Enter your query: ")

warnings.filterwarnings("ignore", category=FutureWarning)

start = time.time()
ranked_indices = vector_space_model.search(query)
showing_article = []
print("[[ Search results ]]")
print("Below are the articles relevant with the keyword '{}':".format(query))
for i in range(0,100): ######검색결과로 나오는 기사 수는 여기에서 조정하시면 됩니당
    idx=ranked_indices[i]
    showing_article.append(idx)
    print("Title: ", documents[idx]['title'],documents[idx]['date'])
    keywords = extract_keywords(documents[idx]['article'])
    print(keywords, '\n')
end = time.time()
print(f'###########{end-start}##############')
selected_title = input("Enter the title you want to read: ")
print()

for doc in documents:
    if doc['title'] == selected_title:
        print("Title:", doc['title'], '/n')
        print("Article:\n", doc['article'], "/n/n")
        break
else:
    print("Selected title not found in the search results.")

    print("Title:", documents[idx]['title'], '\n')

Enter your query:  총선입니다


[[ Search results ]]
Below are the articles relevant with the keyword '총선입니다':


TypeError: only integer scalar arrays can be converted to a scalar index

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import csv
import warnings

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from konlpy.tag import Okt
from sklearn.decomposition import TruncatedSVD

class VectorSpaceModel:
    def __init__(self, documents, num_components=100):
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.vectorized_documents = self.vectorizer.fit_transform([doc['article'] for doc in documents])
        self.svd = TruncatedSVD(n_components=num_components)
        self.reduced_vectorized_documents = self.svd.fit_transform(self.vectorized_documents)

    def search(self, query):
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.vectorized_documents)        
        positive_similarities_indices = similarities[0] > 0
        ranked_indices = positive_similarities_indices.nonzero()[0]        
        ranked_similarities = similarities[0][ranked_indices]        
        ranked_indices = ranked_indices[ranked_similarities.argsort()[::-1]]
        ranked_similarities = sorted(ranked_similarities, reverse=True)
        return ranked_indices, ranked_similarities 

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)
    return ' '.join(word_tokenize(text))

def save_articles_to_csv(documents, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Article'])
        for doc in documents:
            writer.writerow([doc['title'], doc['article']])

documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)

    for row in reader:
        title = row['title']
        article = row['article']
        date = row['date']
        documents.append({'title': title, 'date': date, 'article': article})

vector_space_model = VectorSpaceModel(documents)

query = input("Enter your query: ")

warnings.filterwarnings("ignore", category=FutureWarning)
start = time.time()
ranked_indices = vector_space_model.search(query)
showing_article = []
print("[[ Search results ]]")
print("Below are the articles relevant with the keyword '{}':".format(query))
for i in range(0,100): ######검색결과로 나오는 기사 수는 여기에서 조정하시면 됩니당
    idx=ranked_indices[i]
    showing_article.append(idx)
    print("Title: ", documents[idx]['title'],documents[idx]['date'])
    keywords = extract_keywords(documents[idx]['article'])
    print(keywords, '\n')
end = time.time()
print(f'###########{end-start}##############')
selected_title = input("Enter the title you want to read: ")
print()

for doc in documents:
    if doc['title'] == selected_title:
        print("Title:", doc['title'], '/n')
        print("Article:\n", doc['article'], "/n/n")
        break
else:
    print("Selected title not found in the search results.")

    print("Title:", documents[idx]['title'], '\n')

Enter your query:  총선


[[ Search results ]]
Below are the articles relevant with the keyword '총선':


TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
import csv
import re
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from konlpy.tag import Okt
import nltk
from nltk.tokenize import word_tokenize
import warnings
import time

class VectorSpaceModel:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.vectorized_documents = self.vectorizer.fit_transform([doc['article'] for doc in documents])

    def search(self, query):
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.vectorized_documents)        
        positive_similarities_indices = similarities[0] > 0
        ranked_indices = positive_similarities_indices.nonzero()[0]        
        ranked_similarities = similarities[0][ranked_indices]        
        ranked_indices = ranked_indices[ranked_similarities.argsort()[::-1]]
        ranked_similarities = sorted(ranked_similarities, reverse=True)
        return ranked_indices, ranked_similarities 

documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)
    for row in reader:
        title = row['title']
        article = row['article']
        date = row['date']
        #수정 (keyword 함께 저장)
        documents.append({'title': title, 'date':date, 'article': article})
        

vector_space_model = VectorSpaceModel(documents)

query = input("Enter your query: ")

warnings.filterwarnings("ignore", category=FutureWarning)

ranked_indices = vector_space_model.search(query)

print("[[ Search results ]]")
print("Below are the articles relevant with the keyword '{}':".format(query))


for idx in ranked_indices[:100]:
    print("Title: ", documents[idx]['title'], documents[idx]['date'])

selected_title = input("Enter the title you want to read: ")
print()

for doc in documents:
    if doc['title'] == selected_title:
        print("Title:", doc['title'])
        print("Article:\n", doc['article'], "\n\n")
        break
else:
    print("Selected title not found in the search results.")

Enter your query:  총선


[[ Search results ]]
Below are the articles relevant with the keyword '총선':
Title:  민주 “4·10 총선 사전투표율 31.3%·최종 71.3% 국민께 호소” 2024/04/03 10:55
Title:  내일부터 사전투표…국힘 ‘보수 결집’ 민주 ‘반윤 표심’ 노려 2024/04/04 10:15
Title:  인천 사전투표율 30.06% 역대 최고…최종 70% 넘어 하위권 탈출 기대 2024/04/09 17:44
Title:  한동훈 “총선 예상 의석수 과장되게 전망 삼가야” 공개 경고…왜? 2024/02/25 22:00
Title:  정우택 “총선 불출마…선거방해 정치공작 만행 끝까지 싸울 것” 2024/03/20 11:14
Title:  한동훈, 이달곤 불출마에 “깊은 존경…선민후사 마음으로 헌신” 2024/02/25 18:56
Title:  김종인, 이번엔 개혁신당으로… 총선 때마다 점퍼색 갈아 입어 2024/02/23 19:51
Title:  조국, 12일 문재인 예방…13일 부산서 총선 입장 표명 2024/02/11 18:21
Title:  ‘선거운동 2일차’ 법원 간 이재명 “재판받는 시간 아까워…정치검찰 노림수” 2024/03/29 13:56
Title:  ‘총선 후 유학설’ 일축한 한동훈… 당권·대권 도전 암시? 2024/03/31 9:58
Title:  文 전 대통령 예방한 이재명 대표, 평산마을 사저서 오찬 2024/02/04 15:50
Title:  최고치 갈아치운 사전투표율…21대 총선 투표율 ‘66.2%’ 깰까 2024/04/09 16:55
Title:  사전투표율 31.28% ‘역대 최고’…총선 첫 30% 돌파 2024/04/06 20:50
Title:  오후 1시 투표율 53.4%…지난 총선보다 3.7%p 높아 2024/04/10 13:53
Title:  또 김종인… 黨 4차례 바꿔가며 총선 등판 2024/02/24 1:40
Title:  정세균·김부겸 “이재명, 불공정