# truncated svd

In [18]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

class VectorSpaceModel:
    def __init__(self, documents, num_components=100):
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.vectorized_documents = self.vectorizer.fit_transform([doc['article'] for doc in documents])
        
        self.svd = TruncatedSVD(n_components=num_components)
        self.reduced_vectorized_documents = self.svd.fit_transform(self.vectorized_documents)
        
        self.similarity_scores = {}
        
    def search(self, query):
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)  # Transform query vector
        
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        ranked_indices = similarities.argsort()[0][::-1]
        
        ranked_similarities = [similarities[0][idx] for idx in ranked_indices]
        return ranked_indices, ranked_similarities

# Load documents from CSV
documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)
    for row in reader:
        title = row['title']
        article = row['article']
        date = row['date']
        documents.append({'title': title, 'date': date, 'article': article})

# Create VectorSpaceModel instance with TruncatedSVD
num_components = 100  # Number of components for TruncatedSVD
vector_space_model = VectorSpaceModel(documents, num_components=num_components)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning from sklearn
warnings.filterwarnings("ignore", category=FutureWarning)

# Perform search and retrieve ranked indices with similarities
ranked_indices, ranked_similarities = vector_space_model.search(query)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Display top results with titles, dates, and similarities
num_results_to_display = 10  # Number of top results to display
for idx, similarity in zip(ranked_indices[:num_results_to_display], ranked_similarities[:num_results_to_display]):
    print("Title:", documents[idx]['title'])
    print("Date:", documents[idx]['date'])
    print("Similarity:", similarity)
    print()  # Print a blank line for separation

Enter your query:  총선


[[ Search results ]]
Below are the articles relevant to the keyword '총선':
Title: 내일부터 사전투표…국힘 ‘보수 결집’ 민주 ‘반윤 표심’ 노려
Date: 2024/04/04 10:15
Similarity: 0.6641668223100176

Title: 최고치 갈아치운 사전투표율…21대 총선 투표율 ‘66.2%’ 깰까
Date: 2024/04/09 16:55
Similarity: 0.5958700785657038

Title: 민주 “4·10 총선 사전투표율 31.3%·최종 71.3% 국민께 호소”
Date: 2024/04/03 10:55
Similarity: 0.5896739933841286

Title: 정우택 “총선 불출마…선거방해 정치공작 만행 끝까지 싸울 것”
Date: 2024/03/20 11:14
Similarity: 0.5806226228202247

Title: 사전투표율 31.28%…與 “오만한 세력에 분노” 野 “나라 주인 누군지 보여줘”
Date: 2024/04/07 10:48
Similarity: 0.5648758728929868

Title: ‘62.8%’ 역대급 재외선거 투표율…여야 어느 쪽 웃을까
Date: 2024/04/02 17:47
Similarity: 0.5341448438662268

Title: 충북 총선 레이스 시작…‘4대 4’ 균형추 어디로?
Date: 2024/03/21 18:26
Similarity: 0.5265775846505858

Title: ‘31%’ 역대 총선 최고 사전투표율…“與 지지층 참여 때문” “정권심판 열기”
Date: 2024/04/06 19:47
Similarity: 0.5124936894588005

Title: 전남 유권자 10명 중 4명 사전투표… ‘전국 최고’ 41.1%
Date: 2024/04/06 19:29
Similarity: 0.5081908190159237

Title: 사전투표율 ‘31.3%’ 예견한 민주당…김민

# truncated upgrade : TfidfVectorizer의 매개변수를 최적화하고, 배치 처리를 통해 대용량 데이터를 효율적으로 처리

한국어 문서의 전처리에는 일반적으로 KoNLPy 라이브러리를 사용하여 형태소 분석을 수행하고 불용어를 제거하는 등의 작업이 포함됩니다

In [26]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/uiji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
nltk.data.path.append('/home/uiji/nltk_data/corpora/stopwords/korean/nltk_data')

In [27]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from konlpy.tag import Okt
import nltk
from nltk.corpus import stopwords

class EfficientVectorSpaceModel:
    def __init__(self, documents, batch_size=1000, max_features=10000):
        self.documents = documents
        self.vectorizer = TfidfVectorizer(max_features=max_features)
        self.batch_size = batch_size
        self.okt = Okt()

    def preprocess_documents(self):
        preprocessed_documents = [self._preprocess_text(doc['article']) for doc in self.documents]
        return preprocessed_documents

    def _preprocess_text(self, text):
        # 텍스트 전처리 함수
        text = text.lower()  # 소문자 변환
        text = re.sub(r'[^가-힣\s]', '', text)  # 한글 및 공백 제외 모든 문자 제거
        tokens = self.okt.morphs(text)  # 형태소 단위로 토큰화

        stop_words = set(stopwords.words('korean'))  # NLTK에서 제공하는 한국어 불용어 목록 사용
        tokens_filtered = [word for word in tokens if word not in stop_words]  # 불용어 제거

        return ' '.join(tokens_filtered)  # 전처리된 텍스트 반환

    def batch_vectorization(self, preprocessed_documents):
        vectorized_batches = []
        num_batches = len(self.documents) // self.batch_size + 1

        for i in range(num_batches):
            start_idx = i * self.batch_size
            end_idx = (i + 1) * self.batch_size
            batch_documents = preprocessed_documents[start_idx:end_idx]

            if not batch_documents:
                continue

            vectorized_batch = self.vectorizer.fit_transform(batch_documents)
            vectorized_batches.append(vectorized_batch)

        return vectorized_batches

    def search(self, query):
        query_vector = self.vectorizer.transform([self._preprocess_text(query)])

        for batch in self.vectorized_batches:
            similarities = cosine_similarity(query_vector, batch)
            ranked_indices = similarities.argsort()[0][::-1]

            for idx in ranked_indices:
                similarity_score = similarities[0][idx]
                if similarity_score > 0:
                    yield idx, similarity_score

# Load documents from CSV
documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)
    for row in reader:
        title = row['title']
        article = row['article']
        date = row['date']
        documents.append({'title': title, 'date': date, 'article': article})

# Create EfficientVectorSpaceModel instance
vector_space_model = EfficientVectorSpaceModel(documents, batch_size=1000, max_features=10000)

# Preprocess documents and perform batch vectorization
preprocessed_documents = vector_space_model.preprocess_documents()
vector_space_model.vectorized_batches = vector_space_model.batch_vectorization(preprocessed_documents)

# User query input
query = input("검색어를 입력하세요: ")

# Perform search and retrieve ranked indices with similarities
print("[[ 검색 결과 ]]")
print("키워드 '{}'와(과) 관련된 아티클 목록:".format(query))

for idx, similarity in vector_space_model.search(query):
    print("제목:", documents[idx]['title'])
    print("날짜:", documents[idx]['date'])
    print("유사도:", similarity)
    print()  # Print a blank line for separation

OSError: No such file or directory: '/home/uiji/nltk_data/corpora/stopwords/korean'

# 원래

In [17]:
import csv
import re
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from konlpy.tag import Okt
import nltk
from nltk.tokenize import word_tokenize
import warnings

class VectorSpaceModel:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.vectorized_documents = self.vectorizer.fit_transform([doc['article'] for doc in documents])
        self.similarity_scores = {}
        
    def search(self, query):
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.vectorized_documents)
        ranked_indices = similarities.argsort()[0][::-1]
        
        # ranked_indices와 함께 유사도 값도 반환
        ranked_similarities = [similarities[0][idx] for idx in ranked_indices]
        
        return ranked_indices, ranked_similarities


documents = []
with open('../datasets/Korea_DB_0413.csv', mode='r', encoding='cp949') as file:
    reader = csv.DictReader(file)
    for row in reader:
        title = row['title']
        article = row['article']
        date = row['date']
        documents.append({'title': title, 'date': date, 'article': article})

vector_space_model = VectorSpaceModel(documents)

query = input("Enter your query: ")

warnings.filterwarnings("ignore", category=FutureWarning)

ranked_indices, ranked_similarities = vector_space_model.search(query)

print("[[ Search results ]]")
print("Below are the articles relevant with the keyword '{}':".format(query))

for idx, similarity in zip(ranked_indices[:500], ranked_similarities[:500]):
    print("Title: ", documents[idx]['title'], documents[idx]['date'])
    print("Similarity: ", similarity)

Enter your query:  총선


[[ Search results ]]
Below are the articles relevant with the keyword '총선':
Title:  민주 “4·10 총선 사전투표율 31.3%·최종 71.3% 국민께 호소” 2024/04/03 10:55
Similarity:  0.1859234171430282
Title:  내일부터 사전투표…국힘 ‘보수 결집’ 민주 ‘반윤 표심’ 노려 2024/04/04 10:15
Similarity:  0.17875549879685776
Title:  인천 사전투표율 30.06% 역대 최고…최종 70% 넘어 하위권 탈출 기대 2024/04/09 17:44
Similarity:  0.175712324122903
Title:  한동훈 “총선 예상 의석수 과장되게 전망 삼가야” 공개 경고…왜? 2024/02/25 22:00
Similarity:  0.14712334324011234
Title:  정우택 “총선 불출마…선거방해 정치공작 만행 끝까지 싸울 것” 2024/03/20 11:14
Similarity:  0.14186314850800064
Title:  한동훈, 이달곤 불출마에 “깊은 존경…선민후사 마음으로 헌신” 2024/02/25 18:56
Similarity:  0.1391722980264299
Title:  김종인, 이번엔 개혁신당으로… 총선 때마다 점퍼색 갈아 입어 2024/02/23 19:51
Similarity:  0.13544996026293993
Title:  조국, 12일 문재인 예방…13일 부산서 총선 입장 표명 2024/02/11 18:21
Similarity:  0.1346417311528453
Title:  ‘선거운동 2일차’ 법원 간 이재명 “재판받는 시간 아까워…정치검찰 노림수” 2024/03/29 13:56
Similarity:  0.130316603918077
Title:  ‘총선 후 유학설’ 일축한 한동훈… 당권·대권 도전 암시? 2024/03/31 9:58
Similarity:  0.129