# 원래

In [None]:
import csv
import re
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from konlpy.tag import Okt
import nltk
from nltk.tokenize import word_tokenize
import warnings

class VectorSpaceModel:
    """TF-IDF index over each document's ``'article'`` text, searched by cosine similarity."""

    def __init__(self, documents):
        """Fit a TfidfVectorizer on the articles and keep the resulting matrix."""
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        corpus = [entry['article'] for entry in documents]
        self.vectorized_documents = self.vectorizer.fit_transform(corpus)
        self.similarity_scores = {}

    def search(self, query):
        """Return ``(ranked_indices, ranked_similarities)`` for every document.

        Documents are ordered from most to least similar to ``query``; the
        similarity values are returned alongside the ranked indices.
        """
        scores = cosine_similarity(
            self.vectorizer.transform([query]),
            self.vectorized_documents,
        )
        # Ascending argsort of the single query row, reversed to descending.
        order = scores.argsort()[0][::-1]
        query_row = scores[0]
        return order, [query_row[doc_idx] for doc_idx in order]


# Load every article row (title, date, body) from the cp949-encoded source CSV.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as csv_file:
    for record in csv.DictReader(csv_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Build the model over the loaded articles.
vector_space_model = VectorSpaceModel(documents)

query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

ranked_indices, ranked_similarities = vector_space_model.search(query)

print("[[ Search results ]]")
print("Below are the articles relevant with the keyword '{}':".format(query))

# Show at most the 500 best-ranked hits.
top_hits = zip(ranked_indices[:500], ranked_similarities[:500])
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title: ", hit['title'], hit['date'])
    print("Similarity: ", similarity)

In [None]:
# import csv
# import re
# from collections import Counter
# import matplotlib.pyplot as plt
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.cluster import KMeans
# from konlpy.tag import Okt
# import nltk
# from nltk.tokenize import word_tokenize
# import warnings

# class VectorSpaceModel:
#     def __init__(self, documents):
#         self.documents = documents
#         self.vectorizer = TfidfVectorizer()
#         self.vectorized_documents = self.vectorizer.fit_transform([doc['article'] for doc in documents])
#         self.similarity_scores = {}
        
#     def search(self, query):
#         query_vector = self.vectorizer.transform([query])
#         similarities = cosine_similarity(query_vector, self.vectorized_documents)
#         ranked_indices = similarities.argsort()[0][::-1]
        
#         # ranked_indices와 함께 유사도 값도 반환
#         ranked_similarities = [similarities[0][idx] for idx in ranked_indices]
        
#         return ranked_indices, ranked_similarities

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class VectorSpaceModel:
    """TF-IDF index over each document's ``'article'`` text, searched by cosine similarity."""

    def __init__(self, documents):
        """Fit a TfidfVectorizer on the articles and keep the resulting matrix."""
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.vectorized_documents = self.vectorizer.fit_transform([doc['article'] for doc in documents])

    def search(self, query):
        """Rank the documents that have a positive cosine similarity to ``query``.

        Args:
            query: Query text, passed to the fitted vectorizer.

        Returns:
            ``(ranked_indices, ranked_similarities)``: an index array and a
            list of the matching similarity values, most similar first.
            Documents whose similarity is not greater than 0 are excluded.
        """
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.vectorized_documents)
        scores = similarities[0]
        ranked_indices = similarities.argsort()[0][::-1]
        # The positive-similarity filter used to be assigned to
        # `ranked_similarities` and then immediately overwritten, so it never
        # took effect. Apply it to the indices; the boolean mask keeps the
        # descending order and the ndarray return type.
        ranked_indices = ranked_indices[scores[ranked_indices] > 0.0]
        ranked_similarities = [scores[idx] for idx in ranked_indices]
        return ranked_indices, ranked_similarities
        
# Load every article row (title, date, body) from the cp949-encoded source CSV.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as csv_file:
    for record in csv.DictReader(csv_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Build the model over the loaded articles.
vector_space_model = VectorSpaceModel(documents)

query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

ranked_indices, ranked_similarities = vector_space_model.search(query)

print("[[ Search results ]]")
print("Below are the articles relevant with the keyword '{}':".format(query))

# Show at most the 500 best-ranked hits.
top_hits = zip(ranked_indices[:500], ranked_similarities[:500])
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title: ", hit['title'], hit['date'])
    print("Similarity: ", similarity)

def save_search_result_to_csv(documents, search_result, output_file):
    """Write the selected documents to ``output_file`` as UTF-8 CSV.

    A ``Title, Date, Article`` header row is written first, followed by one
    row per index in ``search_result`` (in the given order), looked up in
    ``documents``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Title', 'Date', 'Article'])
        selected = (documents[doc_id] for doc_id in search_result)
        csv_out.writerows(
            [doc['title'], doc['date'], doc['article']] for doc in selected
        )

# Export every ranked hit, not only the ones printed above.
save_search_result_to_csv(
    documents=documents,
    search_result=ranked_indices,
    output_file='vec_ori.csv',
)

In [None]:
# Number of entries in ranked_indices.
len(ranked_indices)

# 최종

In [None]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
import time
import csv

class VectorSpaceModel:
    """Cosine-similarity search over pre-reduced document vectors.

    Each entry of ``documents`` must carry its reduced vector under the
    ``'vectors'`` key; ``vectorizer`` and ``svd`` must expose ``transform`` and
    are applied, in that order, to map a query into the same space.
    """

    def __init__(self, documents, vectorizer, svd, num_components=50):
        """Store the fitted transformers and stack the document vectors.

        ``num_components`` is accepted for backward compatibility but is not
        used here.
        """
        self.documents = documents
        self.vectorizer = vectorizer
        self.svd = svd
        # One row per document, in the same order as ``documents``.
        self.reduced_vectorized_documents = np.array([doc['vectors'] for doc in documents])

    def search(self, query, min_similarity=0):
        """Rank documents by cosine similarity to ``query``.

        Args:
            query: Query text, passed to ``vectorizer.transform``.
            min_similarity: Exclusive lower bound on similarity; documents
                scoring at or below it are dropped. Defaults to the cutoff
                that was previously hard-coded (0).

        Returns:
            ``(ranked_indices, ranked_similarities)``: two equally long lists,
            most similar document first.
        """
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        scores = similarities[0]
        order = similarities.argsort()[0][::-1]
        # Vectorised filter; preserves the descending order produced above.
        kept = order[scores[order] > min_similarity]
        return list(kept), list(scores[kept])


# Load the raw articles (title, date, body) that are displayed and exported.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as raw_file:
    for record in csv.DictReader(raw_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Load the preprocessed article text that is fed to vectorize().
preprocessed_documents = []
with open('../datasets/preprocessed.csv', mode='r', encoding='utf-8') as clean_file:
    for record in csv.DictReader(clean_file):
        preprocessed_documents.append({
            'title': record['title'],
            'article': record['article'],
        })

vectors, vectorizer, svd = vectorize(preprocessed_documents)

# Attach each reduced vector to the raw document at the same position
# (assumes both CSVs list the articles in the same order - TODO confirm).
for position, doc in enumerate(documents):
    doc['vectors'] = vectors[position]

# Create the VectorSpaceModel instance with the TruncatedSVD-reduced vectors.
vector_space_model = VectorSpaceModel(documents, vectorizer, svd)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Rank the documents against the raw query text.
ranked_indices, ranked_similarities = vector_space_model.search(query)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Show only the first few hits, each followed by a blank separator line.
num_results_to_display = 10
top_hits = zip(
    ranked_indices[:num_results_to_display],
    ranked_similarities[:num_results_to_display],
)
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title:", hit['title'])
    print("Date:", hit['date'])
    print("Similarity:", similarity)
    print()
def save_search_result_to_csv(documents, search_result, output_file):
    """Write the selected documents to ``output_file`` as UTF-8 CSV.

    A ``Title, Date, Article`` header row is written first, followed by one
    row per index in ``search_result`` (in the given order), looked up in
    ``documents``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Title', 'Date', 'Article'])
        selected = (documents[doc_id] for doc_id in search_result)
        csv_out.writerows(
            [doc['title'], doc['date'], doc['article']] for doc in selected
        )
# Export every ranked hit, not only the ones printed above.
save_search_result_to_csv(
    documents=documents,
    search_result=ranked_indices,
    output_file='Vector_제22대.csv',
)

# 쿼리도 전처리 조금 - stopwords 없애고 역시 한글만

In [None]:
import csv
import re
import string
from konlpy.tag import Komoran

class Preprocess:
    """Text cleaning plus Komoran-based noun extraction for documents and queries."""

    # Patterns are compiled once here rather than on every call. All regexes
    # containing backslashes are raw strings ('\s' in a plain string is an
    # invalid escape sequence).
    _TAG_RE = re.compile(r'<.*?>')
    _PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
    _NON_WORD_RE = re.compile(r'[^\w\s]')
    _DIGIT_RE = re.compile(r'\d')
    _SPACE_RE = re.compile(r'\s+')
    # Matches any run of non-Hangul characters. The '|' separators that used
    # to sit inside this character class were literal pipes, which made "|"
    # count as Hangul; they are removed.
    _NON_HANGUL_RE = re.compile('[^ㄱ-ㅎㅏ-ㅣ가-힣]+')

    def __init__(self):
        """Create the Komoran tagger used by :meth:`final`."""
        self.komoran = Komoran()
        # NOTE: stop-word loading from 'datasets/stopwords.txt' is currently
        # disabled, so no stop-word filtering is applied in final().

    def preprocess(self, text):
        """Strip markup, punctuation and digits from ``text``.

        Steps, in order: trim, drop ``<...>`` tags, replace ASCII punctuation
        and any remaining non-word characters with spaces, replace digits with
        spaces, and collapse whitespace runs to a single space. The result may
        keep one trailing space when the input ended in digits.
        """
        text = text.strip()
        text = self._TAG_RE.sub('', text)
        text = self._PUNCT_RE.sub(' ', text)
        text = self._SPACE_RE.sub(' ', text)
        text = self._NON_WORD_RE.sub(' ', text.strip())
        text = self._DIGIT_RE.sub(' ', text)
        text = self._SPACE_RE.sub(' ', text)
        return text

    def is_korean(self, text):
        """Return True when ``text`` contains no non-Hangul character.

        An empty string therefore also returns True.
        """
        return self._NON_HANGUL_RE.search(text) is None

    def final(self, text):
        """Return the nouns of ``text`` joined by single spaces.

        Tokens tagged ``'SL'`` by Komoran are appended after the nouns when
        they consist only of Hangul. The length/stop-word filter is disabled,
        so every collected word is kept.
        """
        words = list(self.komoran.nouns(text))
        for token, tag in self.komoran.pos(text):
            if tag == 'SL' and self.is_korean(token):
                words.append(token)
        return " ".join(words)

    def finalpreprocess(self, documents):
        """Replace each document's ``'article'`` in place with its processed text.

        Returns the same ``documents`` object for convenience.
        """
        for doc in documents:
            doc['article'] = self.final(self.preprocess(doc['article']))
        return documents
        
# Shared Preprocess instance; later cells call preprocessor.final(query) on the user's query.
preprocessor = Preprocess()

In [None]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
import time
import csv

class VectorSpaceModel:
    """Cosine-similarity search over pre-reduced document vectors.

    Each entry of ``documents`` must carry its reduced vector under the
    ``'vectors'`` key; ``vectorizer`` and ``svd`` must expose ``transform`` and
    are applied, in that order, to map a query into the same space.
    """

    def __init__(self, documents, vectorizer, svd, num_components=50):
        """Store the fitted transformers and stack the document vectors.

        ``num_components`` is accepted for backward compatibility but is not
        used here.
        """
        self.documents = documents
        self.vectorizer = vectorizer
        self.svd = svd
        # One row per document, in the same order as ``documents``.
        self.reduced_vectorized_documents = np.array([doc['vectors'] for doc in documents])

    def search(self, query, min_similarity=0):
        """Rank documents by cosine similarity to ``query``.

        Args:
            query: Query text, passed to ``vectorizer.transform``.
            min_similarity: Exclusive lower bound on similarity; documents
                scoring at or below it are dropped. Defaults to the cutoff
                that was previously hard-coded (0).

        Returns:
            ``(ranked_indices, ranked_similarities)``: two equally long lists,
            most similar document first.
        """
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        scores = similarities[0]
        order = similarities.argsort()[0][::-1]
        # Vectorised filter; preserves the descending order produced above.
        kept = order[scores[order] > min_similarity]
        return list(kept), list(scores[kept])


# Load the raw articles (title, date, body) that are displayed and exported.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as raw_file:
    for record in csv.DictReader(raw_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Load the preprocessed article text that is fed to vectorize().
preprocessed_documents = []
with open('../datasets/preprocessed.csv', mode='r', encoding='utf-8') as clean_file:
    for record in csv.DictReader(clean_file):
        preprocessed_documents.append({
            'title': record['title'],
            'article': record['article'],
        })

vectors, vectorizer, svd = vectorize(preprocessed_documents)

# Attach each reduced vector to the raw document at the same position
# (assumes both CSVs list the articles in the same order - TODO confirm).
for position, doc in enumerate(documents):
    doc['vectors'] = vectors[position]

# Create the VectorSpaceModel instance with the TruncatedSVD-reduced vectors.
vector_space_model = VectorSpaceModel(documents, vectorizer, svd)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Run the query through the noun extractor, then rank the documents.
pre_q = preprocessor.final(query)
ranked_indices, ranked_similarities = vector_space_model.search(pre_q)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Show only the first few hits, each followed by a blank separator line.
num_results_to_display = 10
top_hits = zip(
    ranked_indices[:num_results_to_display],
    ranked_similarities[:num_results_to_display],
)
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title:", hit['title'])
    print("Date:", hit['date'])
    print("Similarity:", similarity)
    print()
def save_search_result_to_csv(documents, search_result, output_file):
    """Write the selected documents to ``output_file`` as UTF-8 CSV.

    A ``Title, Date, Article`` header row is written first, followed by one
    row per index in ``search_result`` (in the given order), looked up in
    ``documents``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Title', 'Date', 'Article'])
        selected = (documents[doc_id] for doc_id in search_result)
        csv_out.writerows(
            [doc['title'], doc['date'], doc['article']] for doc in selected
        )
# Export every ranked hit, not only the ones printed above.
save_search_result_to_csv(
    documents=documents,
    search_result=ranked_indices,
    output_file='Vector_선거.csv',
)

# 유사도 0.01 이상만

In [None]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
import time
import csv

class VectorSpaceModel:
    """Cosine-similarity search over pre-reduced document vectors.

    Each entry of ``documents`` must carry its reduced vector under the
    ``'vectors'`` key; ``vectorizer`` and ``svd`` must expose ``transform`` and
    are applied, in that order, to map a query into the same space.
    """

    def __init__(self, documents, vectorizer, svd, num_components=50):
        """Store the fitted transformers and stack the document vectors.

        ``num_components`` is accepted for backward compatibility but is not
        used here.
        """
        self.documents = documents
        self.vectorizer = vectorizer
        self.svd = svd
        # One row per document, in the same order as ``documents``.
        self.reduced_vectorized_documents = np.array([doc['vectors'] for doc in documents])

    def search(self, query, min_similarity=0.01):
        """Rank documents by cosine similarity to ``query``.

        Args:
            query: Query text, passed to ``vectorizer.transform``.
            min_similarity: Exclusive lower bound on similarity; documents
                scoring at or below it are dropped. Defaults to the cutoff
                that was previously hard-coded (0.01).

        Returns:
            ``(ranked_indices, ranked_similarities)``: two equally long lists,
            most similar document first.
        """
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        scores = similarities[0]
        order = similarities.argsort()[0][::-1]
        # Vectorised filter; preserves the descending order produced above.
        kept = order[scores[order] > min_similarity]
        return list(kept), list(scores[kept])


# Load the raw articles (title, date, body) that are displayed and exported.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as raw_file:
    for record in csv.DictReader(raw_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Load the preprocessed article text that is fed to vectorize().
preprocessed_documents = []
with open('../datasets/preprocessed.csv', mode='r', encoding='utf-8') as clean_file:
    for record in csv.DictReader(clean_file):
        preprocessed_documents.append({
            'title': record['title'],
            'article': record['article'],
        })

vectors, vectorizer, svd = vectorize(preprocessed_documents)

# Attach each reduced vector to the raw document at the same position
# (assumes both CSVs list the articles in the same order - TODO confirm).
for position, doc in enumerate(documents):
    doc['vectors'] = vectors[position]

# Create the VectorSpaceModel instance with the TruncatedSVD-reduced vectors.
vector_space_model = VectorSpaceModel(documents, vectorizer, svd)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Run the query through the noun extractor, then rank the documents.
pre_q = preprocessor.final(query)
ranked_indices, ranked_similarities = vector_space_model.search(pre_q)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Show only the first few hits, each followed by a blank separator line.
num_results_to_display = 10
top_hits = zip(
    ranked_indices[:num_results_to_display],
    ranked_similarities[:num_results_to_display],
)
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title:", hit['title'])
    print("Date:", hit['date'])
    print("Similarity:", similarity)
    print()
def save_search_result_to_csv(documents, search_result, output_file):
    """Write the selected documents to ``output_file`` as UTF-8 CSV.

    A ``Title, Date, Article`` header row is written first, followed by one
    row per index in ``search_result`` (in the given order), looked up in
    ``documents``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Title', 'Date', 'Article'])
        selected = (documents[doc_id] for doc_id in search_result)
        csv_out.writerows(
            [doc['title'], doc['date'], doc['article']] for doc in selected
        )
# Export every ranked hit, not only the ones printed above.
save_search_result_to_csv(
    documents=documents,
    search_result=ranked_indices,
    output_file='Vector_410투표.csv',
)

# 유사도 0.05 이상만

In [None]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
import time
import csv

class VectorSpaceModel:
    """Cosine-similarity search over pre-reduced document vectors.

    Each entry of ``documents`` must carry its reduced vector under the
    ``'vectors'`` key; ``vectorizer`` and ``svd`` must expose ``transform`` and
    are applied, in that order, to map a query into the same space.
    """

    def __init__(self, documents, vectorizer, svd, num_components=50):
        """Store the fitted transformers and stack the document vectors.

        ``num_components`` is accepted for backward compatibility but is not
        used here.
        """
        self.documents = documents
        self.vectorizer = vectorizer
        self.svd = svd
        # One row per document, in the same order as ``documents``.
        self.reduced_vectorized_documents = np.array([doc['vectors'] for doc in documents])

    def search(self, query, min_similarity=0.05):
        """Rank documents by cosine similarity to ``query``.

        Args:
            query: Query text, passed to ``vectorizer.transform``.
            min_similarity: Exclusive lower bound on similarity; documents
                scoring at or below it are dropped. Defaults to the cutoff
                that was previously hard-coded (0.05).

        Returns:
            ``(ranked_indices, ranked_similarities)``: two equally long lists,
            most similar document first.
        """
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        scores = similarities[0]
        order = similarities.argsort()[0][::-1]
        # Vectorised filter; preserves the descending order produced above.
        kept = order[scores[order] > min_similarity]
        return list(kept), list(scores[kept])


# Load the raw articles (title, date, body) that are displayed and exported.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as raw_file:
    for record in csv.DictReader(raw_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Load the preprocessed article text that is fed to vectorize().
preprocessed_documents = []
with open('../datasets/preprocessed.csv', mode='r', encoding='utf-8') as clean_file:
    for record in csv.DictReader(clean_file):
        preprocessed_documents.append({
            'title': record['title'],
            'article': record['article'],
        })

vectors, vectorizer, svd = vectorize(preprocessed_documents)

# Attach each reduced vector to the raw document at the same position
# (assumes both CSVs list the articles in the same order - TODO confirm).
for position, doc in enumerate(documents):
    doc['vectors'] = vectors[position]

# Create the VectorSpaceModel instance with the TruncatedSVD-reduced vectors.
vector_space_model = VectorSpaceModel(documents, vectorizer, svd)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Run the query through the noun extractor, then rank the documents.
pre_q = preprocessor.final(query)
ranked_indices, ranked_similarities = vector_space_model.search(pre_q)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Show only the first few hits, each followed by a blank separator line.
num_results_to_display = 10
top_hits = zip(
    ranked_indices[:num_results_to_display],
    ranked_similarities[:num_results_to_display],
)
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title:", hit['title'])
    print("Date:", hit['date'])
    print("Similarity:", similarity)
    print()
def save_search_result_to_csv(documents, search_result, output_file):
    """Write the selected documents to ``output_file`` as UTF-8 CSV.

    A ``Title, Date, Article`` header row is written first, followed by one
    row per index in ``search_result`` (in the given order), looked up in
    ``documents``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Title', 'Date', 'Article'])
        selected = (documents[doc_id] for doc_id in search_result)
        csv_out.writerows(
            [doc['title'], doc['date'], doc['article']] for doc in selected
        )
# Export every ranked hit, not only the ones printed above.
save_search_result_to_csv(
    documents=documents,
    search_result=ranked_indices,
    output_file='Vector_제22대.csv',
)

# 유사도 0.1 이상만

In [None]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
import time
import csv

class VectorSpaceModel:
    """Cosine-similarity search over pre-reduced document vectors.

    Each entry of ``documents`` must carry its reduced vector under the
    ``'vectors'`` key; ``vectorizer`` and ``svd`` must expose ``transform`` and
    are applied, in that order, to map a query into the same space.
    """

    def __init__(self, documents, vectorizer, svd, num_components=50):
        """Store the fitted transformers and stack the document vectors.

        ``num_components`` is accepted for backward compatibility but is not
        used here.
        """
        self.documents = documents
        self.vectorizer = vectorizer
        self.svd = svd
        # One row per document, in the same order as ``documents``.
        self.reduced_vectorized_documents = np.array([doc['vectors'] for doc in documents])

    def search(self, query, min_similarity=0.1):
        """Rank documents by cosine similarity to ``query``.

        Args:
            query: Query text, passed to ``vectorizer.transform``.
            min_similarity: Exclusive lower bound on similarity; documents
                scoring at or below it are dropped. Defaults to the cutoff
                that was previously hard-coded (0.1).

        Returns:
            ``(ranked_indices, ranked_similarities)``: two equally long lists,
            most similar document first.
        """
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        scores = similarities[0]
        order = similarities.argsort()[0][::-1]
        # Vectorised filter; preserves the descending order produced above.
        kept = order[scores[order] > min_similarity]
        return list(kept), list(scores[kept])


# Load the raw articles (title, date, body) that are displayed and exported.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as raw_file:
    for record in csv.DictReader(raw_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Load the preprocessed article text that is fed to vectorize().
preprocessed_documents = []
with open('../datasets/preprocessed.csv', mode='r', encoding='utf-8') as clean_file:
    for record in csv.DictReader(clean_file):
        preprocessed_documents.append({
            'title': record['title'],
            'article': record['article'],
        })

vectors, vectorizer, svd = vectorize(preprocessed_documents)

# Attach each reduced vector to the raw document at the same position
# (assumes both CSVs list the articles in the same order - TODO confirm).
for position, doc in enumerate(documents):
    doc['vectors'] = vectors[position]

# Create the VectorSpaceModel instance with the TruncatedSVD-reduced vectors.
vector_space_model = VectorSpaceModel(documents, vectorizer, svd)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Run the query through the noun extractor, then rank the documents.
pre_q = preprocessor.final(query)
ranked_indices, ranked_similarities = vector_space_model.search(pre_q)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Show only the first few hits, each followed by a blank separator line.
num_results_to_display = 10
top_hits = zip(
    ranked_indices[:num_results_to_display],
    ranked_similarities[:num_results_to_display],
)
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title:", hit['title'])
    print("Date:", hit['date'])
    print("Similarity:", similarity)
    print()
def save_search_result_to_csv(documents, search_result, output_file):
    """Write the selected documents to ``output_file`` as UTF-8 CSV.

    A ``Title, Date, Article`` header row is written first, followed by one
    row per index in ``search_result`` (in the given order), looked up in
    ``documents``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Title', 'Date', 'Article'])
        selected = (documents[doc_id] for doc_id in search_result)
        csv_out.writerows(
            [doc['title'], doc['date'], doc['article']] for doc in selected
        )
# Export every ranked hit, not only the ones printed above.
save_search_result_to_csv(
    documents=documents,
    search_result=ranked_indices,
    output_file='Vector_선거.csv',
)

# 0.075

In [None]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
import time
import csv

class VectorSpaceModel:
    """Cosine-similarity search over pre-reduced document vectors.

    Each entry of ``documents`` must carry its reduced vector under the
    ``'vectors'`` key; ``vectorizer`` and ``svd`` must expose ``transform`` and
    are applied, in that order, to map a query into the same space.
    """

    def __init__(self, documents, vectorizer, svd, num_components=50):
        """Store the fitted transformers and stack the document vectors.

        ``num_components`` is accepted for backward compatibility but is not
        used here.
        """
        self.documents = documents
        self.vectorizer = vectorizer
        self.svd = svd
        # One row per document, in the same order as ``documents``.
        self.reduced_vectorized_documents = np.array([doc['vectors'] for doc in documents])

    def search(self, query, min_similarity=0.075):
        """Rank documents by cosine similarity to ``query``.

        Args:
            query: Query text, passed to ``vectorizer.transform``.
            min_similarity: Exclusive lower bound on similarity; documents
                scoring at or below it are dropped. Defaults to the cutoff
                that was previously hard-coded (0.075).

        Returns:
            ``(ranked_indices, ranked_similarities)``: two equally long lists,
            most similar document first.
        """
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        scores = similarities[0]
        order = similarities.argsort()[0][::-1]
        # Vectorised filter; preserves the descending order produced above.
        kept = order[scores[order] > min_similarity]
        return list(kept), list(scores[kept])


# Load the raw articles (title, date, body) that are displayed and exported.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as raw_file:
    for record in csv.DictReader(raw_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Load the preprocessed article text that is fed to vectorize().
preprocessed_documents = []
with open('../datasets/preprocessed.csv', mode='r', encoding='utf-8') as clean_file:
    for record in csv.DictReader(clean_file):
        preprocessed_documents.append({
            'title': record['title'],
            'article': record['article'],
        })

vectors, vectorizer, svd = vectorize(preprocessed_documents)

# Attach each reduced vector to the raw document at the same position
# (assumes both CSVs list the articles in the same order - TODO confirm).
for position, doc in enumerate(documents):
    doc['vectors'] = vectors[position]

# Create the VectorSpaceModel instance with the TruncatedSVD-reduced vectors.
vector_space_model = VectorSpaceModel(documents, vectorizer, svd)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Run the query through the noun extractor, then rank the documents.
pre_q = preprocessor.final(query)
ranked_indices, ranked_similarities = vector_space_model.search(pre_q)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Show only the first few hits, each followed by a blank separator line.
num_results_to_display = 10
top_hits = zip(
    ranked_indices[:num_results_to_display],
    ranked_similarities[:num_results_to_display],
)
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title:", hit['title'])
    print("Date:", hit['date'])
    print("Similarity:", similarity)
    print()
def save_search_result_to_csv(documents, search_result, output_file):
    """Write the selected documents to ``output_file`` as UTF-8 CSV.

    A ``Title, Date, Article`` header row is written first, followed by one
    row per index in ``search_result`` (in the given order), looked up in
    ``documents``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Title', 'Date', 'Article'])
        selected = (documents[doc_id] for doc_id in search_result)
        csv_out.writerows(
            [doc['title'], doc['date'], doc['article']] for doc in selected
        )
# Export every ranked hit, not only the ones printed above.
save_search_result_to_csv(
    documents=documents,
    search_result=ranked_indices,
    output_file='Vector_410투표.csv',
)

In [None]:
# 0.12

In [None]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
import time
import csv

class VectorSpaceModel:
    """Cosine-similarity search over pre-reduced document vectors.

    Each entry of ``documents`` must carry its reduced vector under the
    ``'vectors'`` key; ``vectorizer`` and ``svd`` must expose ``transform`` and
    are applied, in that order, to map a query into the same space.
    """

    def __init__(self, documents, vectorizer, svd, num_components=50):
        """Store the fitted transformers and stack the document vectors.

        ``num_components`` is accepted for backward compatibility but is not
        used here.
        """
        self.documents = documents
        self.vectorizer = vectorizer
        self.svd = svd
        # One row per document, in the same order as ``documents``.
        self.reduced_vectorized_documents = np.array([doc['vectors'] for doc in documents])

    def search(self, query, min_similarity=0.12):
        """Rank documents by cosine similarity to ``query``.

        Args:
            query: Query text, passed to ``vectorizer.transform``.
            min_similarity: Exclusive lower bound on similarity; documents
                scoring at or below it are dropped. Defaults to the cutoff
                that was previously hard-coded (0.12).

        Returns:
            ``(ranked_indices, ranked_similarities)``: two equally long lists,
            most similar document first.
        """
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        scores = similarities[0]
        order = similarities.argsort()[0][::-1]
        # Vectorised filter; preserves the descending order produced above.
        kept = order[scores[order] > min_similarity]
        return list(kept), list(scores[kept])


# Load the raw articles (title, date, body) that are displayed and exported.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as raw_file:
    for record in csv.DictReader(raw_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Load the preprocessed article text that is fed to vectorize().
preprocessed_documents = []
with open('../datasets/preprocessed.csv', mode='r', encoding='utf-8') as clean_file:
    for record in csv.DictReader(clean_file):
        preprocessed_documents.append({
            'title': record['title'],
            'article': record['article'],
        })

vectors, vectorizer, svd = vectorize(preprocessed_documents)

# Attach each reduced vector to the raw document at the same position
# (assumes both CSVs list the articles in the same order - TODO confirm).
for position, doc in enumerate(documents):
    doc['vectors'] = vectors[position]

# Create the VectorSpaceModel instance with the TruncatedSVD-reduced vectors.
vector_space_model = VectorSpaceModel(documents, vectorizer, svd)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Run the query through the noun extractor, then rank the documents.
pre_q = preprocessor.final(query)
ranked_indices, ranked_similarities = vector_space_model.search(pre_q)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Show only the first few hits, each followed by a blank separator line.
num_results_to_display = 10
top_hits = zip(
    ranked_indices[:num_results_to_display],
    ranked_similarities[:num_results_to_display],
)
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title:", hit['title'])
    print("Date:", hit['date'])
    print("Similarity:", similarity)
    print()
def save_search_result_to_csv(documents, search_result, output_file):
    """Write the selected documents to ``output_file`` as UTF-8 CSV.

    A ``Title, Date, Article`` header row is written first, followed by one
    row per index in ``search_result`` (in the given order), looked up in
    ``documents``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Title', 'Date', 'Article'])
        selected = (documents[doc_id] for doc_id in search_result)
        csv_out.writerows(
            [doc['title'], doc['date'], doc['article']] for doc in selected
        )
# Export every ranked hit, not only the ones printed above.
save_search_result_to_csv(
    documents=documents,
    search_result=ranked_indices,
    output_file='Vector_선거.csv',
)

In [6]:
import csv
import re
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
import time
import csv

class VectorSpaceModel:
    """Cosine-similarity search over pre-reduced document vectors.

    Each entry of ``documents`` must carry its reduced vector under the
    ``'vectors'`` key; ``vectorizer`` and ``svd`` must expose ``transform`` and
    are applied, in that order, to map a query into the same space.
    """

    def __init__(self, documents, vectorizer, svd, num_components=50):
        """Store the fitted transformers and stack the document vectors.

        ``num_components`` is accepted for backward compatibility but is not
        used here.
        """
        self.documents = documents
        self.vectorizer = vectorizer
        self.svd = svd
        # One row per document, in the same order as ``documents``.
        self.reduced_vectorized_documents = np.array([doc['vectors'] for doc in documents])

    def search(self, query, min_similarity=0.12):
        """Rank documents by cosine similarity to ``query``.

        Args:
            query: Query text, passed to ``vectorizer.transform``.
            min_similarity: Exclusive lower bound on similarity; documents
                scoring at or below it are dropped. Defaults to the cutoff
                that was previously hard-coded (0.12).

        Returns:
            ``(ranked_indices, ranked_similarities)``: two equally long lists,
            most similar document first.
        """
        query_vector = self.vectorizer.transform([query])
        query_vector_reduced = self.svd.transform(query_vector)
        similarities = cosine_similarity(query_vector_reduced, self.reduced_vectorized_documents)
        scores = similarities[0]
        order = similarities.argsort()[0][::-1]
        # Vectorised filter; preserves the descending order produced above.
        kept = order[scores[order] > min_similarity]
        return list(kept), list(scores[kept])


# Load the raw articles (title, date, body) that are displayed and exported.
documents = []
with open('dataset/Korea_DB_0413.csv', mode='r', encoding='cp949') as raw_file:
    for record in csv.DictReader(raw_file):
        documents.append({
            'title': record['title'],
            'date': record['date'],
            'article': record['article'],
        })

# Load the preprocessed article text that is fed to vectorize().
preprocessed_documents = []
with open('../datasets/preprocessed.csv', mode='r', encoding='utf-8') as clean_file:
    for record in csv.DictReader(clean_file):
        preprocessed_documents.append({
            'title': record['title'],
            'article': record['article'],
        })

vectors, vectorizer, svd = vectorize(preprocessed_documents)

# Attach each reduced vector to the raw document at the same position
# (assumes both CSVs list the articles in the same order - TODO confirm).
for position, doc in enumerate(documents):
    doc['vectors'] = vectors[position]

# Create the VectorSpaceModel instance with the TruncatedSVD-reduced vectors.
vector_space_model = VectorSpaceModel(documents, vectorizer, svd)

# User query input
query = input("Enter your query: ")

# Ignore FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Build a fresh preprocessor, run the query through its noun extractor,
# then rank the documents.
preprocessor = Preprocess()
pre_q = preprocessor.final(query)
ranked_indices, ranked_similarities = vector_space_model.search(pre_q)

print("[[ Search results ]]")
print("Below are the articles relevant to the keyword '{}':".format(query))

# Show only the first few hits, each followed by a blank separator line.
num_results_to_display = 10
top_hits = zip(
    ranked_indices[:num_results_to_display],
    ranked_similarities[:num_results_to_display],
)
for idx, similarity in top_hits:
    hit = documents[idx]
    print("Title:", hit['title'])
    print("Date:", hit['date'])
    print("Similarity:", similarity)
    print()

def save_search_result_to_csv(documents, search_result, output_file):
    """Write the documents selected by *search_result* to a UTF-8 CSV file.

    Args:
        documents: Indexable collection of dicts providing the keys
            'title', 'date' and 'article'.
        search_result: Iterable of indices into *documents*; rows are
            written in the order the indices are given.
        output_file: Path of the CSV file to create (an existing file is
            overwritten).

    The file starts with the header row ``Title, Date, Article`` followed
    by one row per index in *search_result*.
    """
    header = ['Title', 'Date', 'Article']
    # newline='' lets the csv module control line endings itself.
    with open(output_file, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(header)
        csv_out.writerows(
            [documents[doc_id]['title'],
             documents[doc_id]['date'],
             documents[doc_id]['article']]
            for doc_id in search_result
        )
# Export every ranked hit (not only the displayed top results) to a CSV file.
save_search_result_to_csv(documents, ranked_indices, '../datasets/Vector_제22대.csv')

Enter your query:  제22대 국회의원선거


[[ Search results ]]
Below are the articles relevant to the keyword '제22대 국회의원선거':
Title: “참정권 행사 못 할 수도…” 선거인 명부 열람 24~26일
Date: 2024/03/22 15:42
Similarity: 0.9157079984487292

Title: 내일부터 4·10총선 선거운동 시작…후보 관련글 SNS 공유 주의
Date: 2024/03/27 10:23
Similarity: 0.9060294843105663

Title: 지방공사 상근직원 선거운동 금지 ‘위헌’…교회서 ‘금지’는 ‘합헌’
Date: 2024/01/25 18:30
Similarity: 0.8697277142264345

Title: 총선 직전 유권자 대상 집회 개최…선관위, 후보·사무장 경찰 고발
Date: 2024/04/06 21:05
Similarity: 0.8653829746493353

Title: 헌재, “○장로 속한 당 뽑아라” 목사 선거운동 불가…지방공사 직원은 가능
Date: 2024/01/25 15:41
Similarity: 0.8650936564864831

Title: “푸바오 탈은 되지만 복장은 위반”…與野, 까다로운 선거법에 진땀
Date: 2024/03/24 19:19
Similarity: 0.8624896701642011

Title: 중앙선관위 “이달 24~26일 총선 선거인명부 열람·이의신청”
Date: 2024/03/22 15:19
Similarity: 0.8614209445605282

Title: “시각장애인도 선거 정보 쉽게”…선관위, 인권위 권고 수용
Date: 2024/02/29 13:48
Similarity: 0.8449213739037442

Title: 선관위, 위법 선거문자 단속 강화…수신거부에도 발송시 엄중조치
Date: 2024/02/22 13:24
Similarity: 0.8335817751379645

Title: 국민의힘 “이재명, 타 정당 후보 유세·꼼수

In [2]:
def vectorize(preprocessed_documents, num_components=50):
    """Build TF-IDF vectors for the documents and reduce them with TruncatedSVD.

    Args:
        preprocessed_documents: Iterable of dicts whose 'article' value is
            the text to vectorize.
        num_components: Number of SVD components to keep (passed to
            ``TruncatedSVD`` as ``n_components``).

    Returns:
        A 3-tuple ``(reduced_vectors, vectorizer, svd)``: the SVD-reduced
        document vectors, the fitted ``TfidfVectorizer`` and the fitted
        ``TruncatedSVD``. The fitted objects are returned so a caller can
        apply the same transformations to other text.
    """
    vectorizer = TfidfVectorizer()
    corpus = [doc['article'] for doc in preprocessed_documents]
    tfidf_matrix = vectorizer.fit_transform(corpus)

    svd = TruncatedSVD(n_components=num_components)
    reduced_vectors = svd.fit_transform(tfidf_matrix)
    return reduced_vectors, vectorizer, svd

In [1]:
import re
import string
from konlpy.tag import Komoran

class Preprocess:
    """Text clean-up and Komoran-based token extraction for articles and queries."""

    # Patterns are compiled once at class creation instead of on every call.
    # All of them are raw strings: the original '\s+' literal was an invalid
    # escape sequence in a normal string, which newer Python versions warn about.
    _TAG_RE = re.compile(r'<.*?>')
    _ASCII_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
    _WHITESPACE_RE = re.compile(r'\s+')
    _NON_WORD_RE = re.compile(r'[^\w\s]')
    _DIGIT_RE = re.compile(r'\d')

    def __init__(self):
        """Create the Komoran morphological analyzer used by :meth:`final`."""
        self.komoran = Komoran()
        # NOTE: stop-word loading (from 'stopwords.txt') is currently disabled,
        # so no stop-word filtering is applied in final().

    def preprocess(self, text):
        """Strip markup, punctuation and digits from *text*.

        Steps, in order: trim the ends, remove ``<...>`` tags, replace ASCII
        punctuation with spaces, collapse whitespace, replace any remaining
        non-word/non-space character (e.g. non-ASCII punctuation) with a
        space, replace digits with spaces, and collapse whitespace again.

        Args:
            text: The raw string to clean.

        Returns:
            The cleaned string. Because the last replacements happen after
            the final trim, the result may still start or end with a single
            space.
        """
        text = text.strip()
        text = self._TAG_RE.sub('', text)
        text = self._ASCII_PUNCT_RE.sub(' ', text)
        text = self._WHITESPACE_RE.sub(' ', text)
        # Trim before this pass so spaces left by trailing ASCII punctuation
        # are dropped; characters replaced here may re-introduce edge spaces.
        text = self._NON_WORD_RE.sub(' ', text.strip())
        text = self._DIGIT_RE.sub(' ', text)
        text = self._WHITESPACE_RE.sub(' ', text)
        return text

    def final(self, text):
        """Return the nouns and 'SL'-tagged tokens of *text*, space-joined.

        All nouns reported by Komoran come first, followed by every token
        whose part-of-speech tag is 'SL' (the foreign-language tag in the
        Komoran tag set), each group in the order Komoran returned it.

        Args:
            text: The string to analyze.

        Returns:
            A single string of the selected tokens separated by spaces.
        """
        tokens = list(self.komoran.nouns(text))
        tokens.extend(
            morph for morph, tag in self.komoran.pos(text) if tag == 'SL'
        )
        return " ".join(tokens)

#     def finalpreprocess(self, documents):
#         for doc in documents:
#             doc['article'] = self.final(self.preprocess(doc['article']))
#         return documents