# 1. 환경 설정

In [5]:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!pip install pandas
!pip install nltk
!pip install numpy

zsh:1: command not found: pip
zsh:1: command not found: apt-get
The operation couldn’t be completed. Unable to locate a Java Runtime.
Please visit http://www.java.com for information on installing Java.

zsh:1: command not found: pip
zsh:1: command not found: pip
zsh:1: command not found: pip


In [6]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from selenium import webdriver
from bs4 import BeautifulSoup as bs

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

ModuleNotFoundError: No module named 'numpy'

## 1-1. 로마 숫자 변환 함수

In [None]:
def roman_to_integer(roman):
    """Convert a Roman numeral string to its integer value.

    A symbol that is immediately followed by a larger symbol is subtracted
    (the ``I`` in ``IV``); every other symbol is added.

    Args:
        roman: Upper-case Roman numeral, e.g. ``"XIV"``.

    Returns:
        The integer value; ``0`` for an empty string.

    Raises:
        KeyError: If ``roman`` contains a character other than
            ``I``, ``V``, ``X``, ``L``, ``C``, ``D`` or ``M``.
    """
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    values = [symbol_values[symbol] for symbol in roman]

    total = 0
    # Pair every value with its right-hand neighbour; the last symbol is
    # paired with 0 so it is always added.
    for current, following in zip(values, values[1:] + [0]):
        total += -current if following > current else current
    return total

# 2. 텍스트 전처리

In [None]:
def preprocess_text(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    The text is lower-cased, every character that is not a word character,
    whitespace, ``.``, ``,`` or ``?`` is removed, the result is tokenised
    with NLTK, English stop words are dropped and the remaining tokens are
    Porter-stemmed.

    Args:
        text: The raw text. Any value that is not a ``str`` (``None``, NaN,
            numbers, lists, ...) is treated as an empty document.

    Returns:
        The processed tokens joined by single spaces, or ``''`` when
        ``text`` is not a string.
    """
    # A single isinstance check covers None/NaN/pd.NA as well, and unlike
    # ``pd.isna(text) or ...`` it cannot raise "truth value is ambiguous"
    # when an array-like is passed in.
    if not isinstance(text, str):
        return ''

    cleaned = re.sub(r'[^\w\s.,?]', '', text.lower())
    tokens = word_tokenize(cleaned)

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return ' '.join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )

# 3. 텍스트 추출 및 전처리

## 3-1. Selenium을 사용한 텍스트 추출

In [None]:
def extract_text_from_web(url="https://www.gutenberg.org/cache/epub/1342/pg1342.txt"):
    """Fetch a page with headless Chrome and return the text of its body.

    Args:
        url: Address of the page to load. Defaults to the Project Gutenberg
            plain-text file this notebook was written for.

    Returns:
        The text contained in the page's ``<body>`` element (or in the whole
        document if the parsed page has no ``<body>``).
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # NOTE(review): the next two flags are presumably here for hosted/container
    # notebook environments - confirm they are still needed where this runs.
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)

    # Always shut the browser down, even if loading the page fails, so a
    # failed fetch does not leave a Chrome process behind.
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = bs(page_source, 'html.parser')
    body = soup.find('body')
    # ``get_text()`` concatenates all nested text, whereas ``.string`` is
    # None as soon as the element has more than one child.
    return soup.get_text() if body is None else body.get_text()

# Download the book text once at module level; the later cells read this global.
contents = extract_text_from_web()

## 3-2. 책 챕터 로드 및 전처리

In [None]:
def load_and_preprocess_book(text, chapter_numbers=None):
    """Split the book text into chapters and preprocess the selected ones.

    Chapter headings are located with a case-insensitive
    ``Chapter <roman numeral>`` pattern. A chapter's text runs from its
    heading to the start of the next heading (or to the end of the text for
    the last heading).

    Args:
        text: Full text of the book.
        chapter_numbers: Chapter numbers to keep. Defaults to the odd
            chapters 3 through 19.

    Returns:
        A ``(chapters, numbers)`` tuple of two lists of equal length:
        ``chapters[i]`` is the preprocessed text of a selected chapter and
        ``numbers[i]`` is that chapter's number, in order of appearance in
        ``text``. A requested chapter that is not found is absent from both
        lists; a heading that occurs twice appears twice in both.
    """
    if chapter_numbers is None:
        chapter_numbers = [3, 5, 7, 9, 11, 13, 15, 17, 19]
    wanted = set(chapter_numbers)

    # The word boundaries stop the case-insensitive numeral group from
    # matching the first letters of an ordinary word ("chapter in" -> "I").
    chapter_pattern = r"\bChapter\s+([IVXLCDM]+)\b"
    chapter_matches = list(re.finditer(chapter_pattern, text, re.IGNORECASE))

    chapters = []
    found_numbers = []
    for i, match in enumerate(chapter_matches):
        chapter_num = roman_to_integer(match.group(1).upper())
        if chapter_num not in wanted:
            continue

        if i + 1 < len(chapter_matches):
            end = chapter_matches[i + 1].start()
        else:
            end = len(text)

        chapters.append(preprocess_text(text[match.start():end]))
        # Record the number next to the text so the two lists stay aligned
        # even when a requested chapter is missing or matched more than once.
        found_numbers.append(chapter_num)

    return chapters, found_numbers

# Extract and preprocess the selected chapters from the downloaded book text.
chapters, chapter_numbers = load_and_preprocess_book(contents)

# 4. 벡터 공간 모델 구축

## 4-1. VectorSpaceModel 클래스 정의

In [None]:
class VectorSpaceModel:
    """TF-IDF vector space model that ranks chapters against questions.

    The methods are meant to be called in order: ``load_questions``,
    ``vectorize_queries``, ``calculate_similarity``, ``get_top_n_chapters``
    and finally ``output_results``. Each step stores its result on the
    instance for the next one.

    Attributes set by the steps:
        questions: Raw question strings (``load_questions``).
        query_vectors: TF-IDF matrix of the questions (``vectorize_queries``).
        cosine_sim: Question-by-chapter similarity matrix
            (``calculate_similarity``).
        top_n_chapters: Per question, the best-matching chapter numbers
            (``get_top_n_chapters``).
    """

    def __init__(self, preprocessed_chapters, chapter_numbers):
        """Fit the TF-IDF vectorizer on the chapter texts.

        Args:
            preprocessed_chapters: Already preprocessed chapter texts.
            chapter_numbers: Chapter number for each entry of
                ``preprocessed_chapters``, in the same order.
        """
        self.preprocessed_chapters = preprocessed_chapters
        self.chapter_numbers = chapter_numbers
        self.vectorizer = TfidfVectorizer()
        self.chapter_vectors = self.vectorizer.fit_transform(preprocessed_chapters)

    def load_questions(self, file_path):
        """Read the questions from the '질문' column of an Excel file.

        Missing cells are replaced with empty strings.

        Args:
            file_path: Path of the Excel workbook.
        """
        qa_data = pd.read_excel(file_path)
        self.questions = qa_data['질문'].fillna("").tolist()

    def vectorize_queries(self):
        """Project the loaded questions into the chapters' TF-IDF space."""
        # The vocabulary was fitted on stop-word-filtered, stemmed chapter
        # text, so the questions must go through the same preprocessing or
        # inflected query terms would not match any vocabulary entry.
        prepared_questions = [preprocess_text(question) for question in self.questions]
        self.query_vectors = self.vectorizer.transform(prepared_questions)

    def calculate_similarity(self):
        """Compute the cosine similarity of every question to every chapter."""
        self.cosine_sim = cosine_similarity(self.query_vectors, self.chapter_vectors)

    def get_top_n_chapters(self, n=3):
        """Select the ``n`` most similar chapters for each question.

        Args:
            n: Number of chapters to keep per question.
        """
        self.top_n_chapters = [
            # Negating the scores makes the ascending argsort list the
            # highest similarities first.
            [self.chapter_numbers[i] for i in (-cos_sim).argsort()[:n]]
            for cos_sim in self.cosine_sim
        ]

    def output_results(self):
        """Return a DataFrame pairing each question with its top chapters."""
        return pd.DataFrame({
            'Question': self.questions,
            'Top N Chapters': self.top_n_chapters,
        })


# 5. 메인 실행

## 5-1. 메인 함수 정의 및 실행

In [None]:
def main():
    """Rank the selected chapters against every question and print the table."""
    # Load and preprocess the book chapters from the downloaded text.
    chapters, chapter_numbers = load_and_preprocess_book(contents)

    # Build the vector space model over the chapters, then score the
    # questions from the training workbook against it.
    model = VectorSpaceModel(chapters, chapter_numbers)
    model.load_questions('Q&A_pride_and_prejudice_training.xlsx')
    model.vectorize_queries()
    model.calculate_similarity()
    model.get_top_n_chapters(n=3)

    print(model.output_results())


if __name__ == "__main__":
    main()