In [None]:
import csv

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from konlpy.tag import Okt


class BooleanModel:
    def __init__(self, documents):
        self.index = {}
        self.documents = documents
        self.build_index()

    def build_index(self):
        for doc_id, document in enumerate(self.documents):
            for word in document['article'].split():
                if word in self.index:
                    self.index[word].add(doc_id)
                else:
                    self.index[word] = {doc_id}

    def search(self, query):
        query_words = query.split()
        result = None
        for word in query_words:
            if word in self.index:
                if result is None:
                    result = self.index[word]
                else:
                    result = result.intersection(self.index[word])
            else:
                result = set()
                break
        return result

def save_articles_to_csv(documents, output_file):
    with open(output_file, mode='w', newline='', encoding='cp949') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Article'])
        for doc in documents:
            writer.writerow([doc['title'], doc['article']])

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)
    return ' '.join(word_tokenize(text))

def extract_keywords(article_text, num_clusters=5, num_keywords=5):
    okt = Okt()
    preprocessed_text = preprocess_text(article_text)
    words = okt.pos(preprocessed_text, stem=True)
    nouns = [word for word, pos in words if pos == 'Noun' and len(word) > 1]  # 명사이면서 길이가 1보다 긴 것만 선택
    preprocessed_text = ' '.join(nouns)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([preprocessed_text])

    num_samples = X.shape[0]
    if num_samples < num_clusters:
        num_clusters = num_samples

    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(X)

    cluster_centers = kmeans.cluster_centers_
    features = vectorizer.get_feature_names_out()
    top_keywords = []
    for cluster_center in cluster_centers:
        top_keyword_indices = cluster_center.argsort()[-num_keywords:][::-1]
        keywords = ['#' + features[int(i)] for i in top_keyword_indices]
        top_keywords.append(keywords)
    return ' '.join([' '.join(keywords) for keywords in top_keywords])

documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)
    for row in reader:
        title = row['title']
        date=row['date']
        article = row['article']
        documents.append({'title': title, 'date':date, 'article': article})

boolean_model = BooleanModel(documents)
query = input("Enter your query: ")

search_result = boolean_model.search(query)
count=0
print("[[ Search results ]]")
print("Below are the articles relevant with the keyword '{}':".format(query))
if search_result:
    for doc_id in search_result:
        print("Title: ", documents[doc_id]['title'],documents[doc_id]['date'])
        keywords = extract_keywords(documents[doc_id]['article'])
        print(keywords, '\n')
else:
    print("No matching documents found.")

selected_title = input("\nEnter the title you want to read: ")
print()
for doc in documents:
    if doc['title'] == selected_title:
        print("Title:", doc['title'],"\n")
        print("Article:\n", doc['article'],"/n/n")
        break
else:
    print("Selected title not found in the search results.")

Enter your query:  총선


[[ Search results ]]
Below are the articles relevant with the keyword '총선':
Title:  민주 검증위, ‘1심 의원직 상실형’ 황운하에 적격…노웅래도 통과 2024/01/11 18:23
#의원 #청와대 #선거 #후보 #예비 

Title:  與공관위, 10명중 4명이 법조인 출신…당내선 중립성 우려 지적도 2024/01/11 17:32
#위원 #위원장 #관위 #법조인 #의원 

Title:  “업적들 무너뜨릴 것”“정계 은퇴하라”… 민주, 이낙연 탈당에 날선 ‘비난’ 2024/01/11 18:26
#대표 #탈당 #민주당 #입장 #이낙연 

Title:  민주, 현역의원 평가 마무리…컷오프 대상 하위 20% 통보설에 당 ‘술렁’ 2024/01/11 16:40
#의원 #평가 #현역 #하위 #통보 

Title:  이낙연 “양당 독점구도 깨는 의석수 필요…지역구 전부 출마”[일문일답] 2024/01/11 16:13
#민주당 #대표 #대통령 #생각 #말씀 

Title:  천하람 “개혁신당 12일 인재 영입 발표…아주 좋은 분”[중립기어 라이브] 2024/01/11 19:00
#저희 #위원장 #하람 #권기범 #기자 

Title:  노영민 文청와대 비서실장, 청주 상당 출사표…“정권 심판” 2024/01/11 15:39
#실장 #상당구 #출마 #대표 #민주당 

Title:  ‘이낙연 탈당’에 광주 정치권 맹비난…“정치적 반란행위” 2024/01/11 15:49
#민주당 #광주 #탈당 #대표 #정치 

Title:  친윤 이철규, ‘윤심 공천’ 우려에 “당 계파 없어…한동훈 교감 안 해” 2024/01/11 15:34
#의원 #위원장 #친윤 #비윤 #방향 

Title:  김은혜·방문규·염태영 등 출사표…경기도에 공직자출신 ‘출마 러시’ 2024/01/11 15:01
#경기도 #출마 #총선 #홍보수석 #대변인 

Title:  與 공관위원장 “외부 공관위원 총선 불출마…윤심 개입 없어” 2024/01/11 

In [24]:
len(search_result)

1240

In [None]:
def save_search_result_to_csv(documents, search_result, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Date', 'Article'])
        for doc_id in search_result:
            doc = documents[doc_id]
            writer.writerow([doc['title'], doc['date'], doc['article']])

# 위에서 사용한 search_result와 documents를 활용하여 CSV 파일로 저장
save_search_result_to_csv(documents, search_result, 'Boolean_4월 10일 투표_1_search_result.csv')

In [5]:
import csv

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from konlpy.tag import Okt

from collections import Counter
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

class BooleanModel:
    def __init__(self, documents):
        self.index = {}
        self.documents = documents
        self.build_index()

    def build_index(self):
        for doc_id, document in enumerate(self.documents):
            for word in document['article'].split():
                if word in self.index:
                    self.index[word].add(doc_id)
                else:
                    self.index[word] = {doc_id}

    def search(self, query):
        query_words = query.split()
        result = None
        for word in query_words:
            if word in self.index:
                if result is None:
                    result = self.index[word]
                else:
                    result = result.intersection(self.index[word])
            else:
                result = set()
                break
        return result

def save_articles_to_csv(documents, output_file):
    with open(output_file, mode='w', newline='', encoding='cp949') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Article'])
        for doc in documents:
            writer.writerow([doc['title'], doc['article']])

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)
    return ' '.join(word_tokenize(text))

def extract_keywords(article_text, num_clusters=5, num_keywords=5):
    okt = Okt()
    preprocessed_text = preprocess_text(article_text)
    words = okt.pos(preprocessed_text, stem=True)
    nouns = [word for word, pos in words if pos == 'Noun' and len(word) > 1]  # 명사이면서 길이가 1보다 긴 것만 선택
    preprocessed_text = ' '.join(nouns)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([preprocessed_text])

    num_samples = X.shape[0]
    if num_samples < num_clusters:
        num_clusters = num_samples

    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(X)

    cluster_centers = kmeans.cluster_centers_
    features = vectorizer.get_feature_names_out()
    top_keywords = []
    for cluster_center in cluster_centers:
        top_keyword_indices = cluster_center.argsort()[-num_keywords:][::-1]
        keywords = ['#' + features[int(i)] for i in top_keyword_indices]
        top_keywords.append(keywords)
    return ' '.join([' '.join(keywords) for keywords in top_keywords])


#################

documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)
    for row in reader:
        title = row['title']
        date=row['date']
        article = row['article']
        documents.append({'title': title, 'date':date, 'article': article})

boolean_model = BooleanModel(documents)
query = input("Enter your query: ")

search_result = boolean_model.search(query)

####################그래프 그리는 부분 코드입니당

date_ranges = ['Jan 1-15', 'Jan 16-31', 'Feb 1-15', 'Feb 16-29', 'Mar 1-15', 'Mar 16-31', 'Apr 1-15']
counts = [0] * len(date_ranges)

for doc in documents:
    date = int(doc['date'][8:10])  # 일(day) 추출
    month = int(doc['date'][5:7])  # 월(month) 추출

    if month == 1:
        if 1 <= date <= 15:
            counts[0] += 1
        elif 16 <= date <= 31:
            counts[1] += 1
    elif month == 2:
        if 1 <= date <= 15:
            counts[2] += 1
        elif 16 <= date <= 29:
            counts[3] += 1
    elif month == 3:
        if 1 <= date <= 15:
            counts[4] += 1
        elif 16 <= date <= 31:
            counts[5] += 1
    elif month == 4:
        if 1 <= date <= 15:
            counts[6] += 1


plt.bar(date_ranges, counts, color='skyblue')
plt.title('Article Distribution from January to April')
plt.xlabel('Date Range')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

##################

print("[[ Search results ]]")
plt.show()
print("Below are the articles relevant with the keyword '{}':".format(query))
if search_result:
    for doc_id in search_result:
        print("Title: ", documents[doc_id]['title'],documents[doc_id]['date'])
        keywords = extract_keywords(documents[doc_id]['article'])
        print(keywords, '\n')


else:
    print("No matching documents found.")

selected_title = input("\nEnter the title you want to read: ")
print()

for doc in documents:
    if doc['title'] == selected_title:
        print("Title:", doc['title'],"\n")
        print("Article:\n", doc['article'],"/n/n")
        break
else:
    print("Selected title not found in the search results.")

KeyboardInterrupt: Interrupted by user

In [2]:
class BooleanModel:
    def __init__(self, documents):
        self.index = {}
        self.documents = documents
        self.build_index()

    def build_index(self):
        for doc_id, document in enumerate(self.documents):
            for word in document['article'].split():
                if word in self.index:
                    self.index[word].add(doc_id)
                else:
                    self.index[word] = {doc_id}

    def search(self, query):
        query_words = query.split()
        result = None
        for word in query_words:
            if word in self.index:
                if result is None:
                    result = self.index[word]
                else:
                    result = result.intersection(self.index[word])
            else:
                result = set()
                break
        return result

def save_articles_to_csv(documents, output_file):
    with open(output_file, mode='w', newline='', encoding='cp949') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Article'])
        for doc in documents:
            writer.writerow([doc['title'], doc['article']])

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)
    return ' '.join(word_tokenize(text))

def extract_keywords(article_text, num_clusters=5, num_keywords=5):
    okt = Okt()
    preprocessed_text = preprocess_text(article_text)
    words = okt.pos(preprocessed_text, stem=True)
    nouns = [word for word, pos in words if pos == 'Noun' and len(word) > 1]  # 명사이면서 길이가 1보다 긴 것만 선택
    preprocessed_text = ' '.join(nouns)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([preprocessed_text])

    num_samples = X.shape[0]
    if num_samples < num_clusters:
        num_clusters = num_samples

    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(X)

    cluster_centers = kmeans.cluster_centers_
    features = vectorizer.get_feature_names_out()
    top_keywords = []
    for cluster_center in cluster_centers:
        top_keyword_indices = cluster_center.argsort()[-num_keywords:][::-1]
        keywords = ['#' + features[int(i)] for i in top_keyword_indices]
        top_keywords.append(keywords)
    return ' '.join([' '.join(keywords) for keywords in top_keywords])


#################

documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)
    for row in reader:
        title = row['title']
        date=row['date']
        article = row['article']
        documents.append({'title': title, 'date':date, 'article': article})

boolean_model = BooleanModel(documents)
query = input("Enter your query: ")

search_result = boolean_model.search(query)

print("[[ Search results ]]")

#### Graph.draw_graph(search_result)
print("Below are the articles relevant with the keyword '{}':".format(query))
if search_result:
    for doc_id in search_result:
        print("Title: ", documents[doc_id]['title'],documents[doc_id]['date'])
        keywords = extract_keywords(documents[doc_id]['article'])
        print(keywords, '\n')


else:
    print("No matching documents found.")

selected_title = input("\nEnter the title you want to read: ")
print()

for doc in documents:
    if doc['title'] == selected_title:
        print("Title:", doc['title'],"\n")
        print("Article:\n", doc['article'],"/n/n")
        break
else:
    print("Selected title not found in the search results.")

NameError: name 'csv' is not defined