In [1]:
! pip install pandas langchain chromadb transformers flask

Collecting flask
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Werkzeug>=3.1 (from flask)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting blinker>=1.9 (from flask)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Downloading flask-3.1.0-py3-none-any.whl (102 kB)
Downloading blinker-1.9.0-py3-none-any.whl (8.5 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading werkzeug-3.1.3-py3-none-any.whl (224 kB)
Installing collected packages: Werkzeug, itsdangerous, blinker, flask
Successfully installed Werkzeug-3.1.3 blinker-1.9.0 flask-3.1.0 itsdangerous-2.2.0


In [13]:
! pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [7]:
! pip install discord

Collecting discord
  Downloading discord-2.3.2-py3-none-any.whl.metadata (381 bytes)
Collecting discord.py>=2.3.2 (from discord)
  Using cached discord.py-2.4.0-py3-none-any.whl.metadata (6.9 kB)
Downloading discord-2.3.2-py3-none-any.whl (1.1 kB)
Using cached discord.py-2.4.0-py3-none-any.whl (1.1 MB)
Installing collected packages: discord.py, discord
Successfully installed discord-2.3.2 discord.py-2.4.0


In [8]:
# 1. 환경 설정 및 데이터 로드
import pandas as pd
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.text_splitter import CharacterTextSplitter

# 데이터 파일 경로 설정
html_file_path = "/usr/workspace/raw/output_with_text.html"

# HTML 데이터를 로드합니다
loader = UnstructuredHTMLLoader(html_file_path)
documents = loader.load()

# 문서를 분할합니다 (500자 단위)
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)

# 결과 확인
print(f"총 {len(split_docs)}개의 문서로 분할되었습니다.")


Created a chunk of size 1366, which is longer than the specified 500
Created a chunk of size 1089, which is longer than the specified 500
Created a chunk of size 1362, which is longer than the specified 500
Created a chunk of size 1257, which is longer than the specified 500
Created a chunk of size 1740, which is longer than the specified 500
Created a chunk of size 740, which is longer than the specified 500
Created a chunk of size 1308, which is longer than the specified 500
Created a chunk of size 708, which is longer than the specified 500
Created a chunk of size 1038, which is longer than the specified 500
Created a chunk of size 1724, which is longer than the specified 500
Created a chunk of size 535, which is longer than the specified 500
Created a chunk of size 703, which is longer than the specified 500
Created a chunk of size 661, which is longer than the specified 500
Created a chunk of size 702, which is longer than the specified 500
Created a chunk of size 664, which is lo

총 34개의 문서로 분할되었습니다.


In [10]:
import openai
import os
os.chdir("/usr/workspace")
from scr.config import DISCORD_BOT_TOKEN, OPENAI_API_KEY

# OpenAI API 키 설정
openai.api_key = OPENAI_API_KEY  # 여기에 API 키를 입력하세요.

# OpenAI 임베딩 생성 함수
def generate_embeddings(texts, model="text-embedding-ada-002"):
    """
    OpenAI API를 사용하여 텍스트 리스트에 대한 임베딩 생성
    """
    embeddings = []
    for text in texts:
        response = openai.Embedding.create(input=text, model=model)
        embeddings.append(response['data'][0]['embedding'])
    return embeddings

# 문서 텍스트 추출
texts = [doc.page_content for doc in split_docs]

# 임베딩 생성
embeddings = generate_embeddings(texts)
print(f"총 {len(embeddings)}개의 임베딩이 생성되었습니다.")


총 34개의 임베딩이 생성되었습니다.


In [12]:
import faiss
import numpy as np

# FAISS 인덱스 생성
dimension = len(embeddings[0])  # 임베딩 차원
index = faiss.IndexFlatL2(dimension)

# 벡터를 NumPy 배열로 변환
embedding_matrix = np.array(embeddings).astype("float32")

# FAISS 인덱스에 추가
index.add(embedding_matrix)
print(f"FAISS 인덱스에 {index.ntotal}개의 벡터가 추가되었습니다.")

# 검색 함수
def search_faiss(query, top_k=5):
    # 쿼리 임베딩 생성
    query_embedding = generate_embeddings([query])[0]
    query_embedding = np.array([query_embedding]).astype("float32")

    # 유사한 문서 검색
    distances, indices = index.search(query_embedding, top_k)
    return indices[0], distances[0]


FAISS 인덱스에 34개의 벡터가 추가되었습니다.


In [None]:
# RAG 챗봇 클래스
class RAGChatbot:
    def __init__(self, documents, faiss_index, llm_api_func):
        self.documents = documents
        self.faiss_index = faiss_index
        self.llm_api_func = llm_api_func

    def ask(self, query, top_k=10):
        # FAISS를 통해 관련 문서 검색
        indices, _ = search_faiss(query, top_k=top_k)
        context = "\n\n".join([self.documents[idx].page_content for idx in indices])

        # LLM 프롬프트 생성
        prompt = f"""
        아래는 관련 문서입니다:
        {context}

        질문: {query}
        답변:
        """
        
        # LLM API 호출
        answer = self.llm_api_func(prompt)
        return answer, context

# LLM API 호출 함수
def call_llm_api(prompt, model="gpt-4o"):
    """
    OpenAI ChatCompletion API를 사용하여 답변 생성
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.7
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error calling LLM API: {e}")
        return "죄송합니다. 답변을 생성할 수 없습니다."


# RAG 챗봇 인스턴스 생성
rag_chatbot = RAGChatbot(documents=split_docs, faiss_index=index, llm_api_func=call_llm_api)

# 테스트 질문
query = "기초교양 과목이름 알려줘"
answer, context = rag_chatbot.ask(query)
print("답변:", answer)
print("관련 문서:", context)


In [None]:
import discord
from discord.ext import commands
import openai
import numpy as np
import faiss
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from dotenv import load_dotenv

# (.env 파일) 로드
load_dotenv()

# OpenAI 및 Discord 토큰 로드
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DISCORD_BOT_TOKEN = os.getenv("DISCORD_BOT_TOKEN")
llm = ChatOpenAI(temperature=0, model_name="gpt-4o", openai_api_key=OPENAI_API_KEY)


# ----- 문서 준비 -----
# 데이터 파일 경로
html_file_path = "/usr/workspace/raw/output.html"

# 문서 로드 및 분할
loader = UnstructuredHTMLLoader(html_file_path)
loaded_documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(loaded_documents)

# 문서와 텍스트 저장
documents = [doc.page_content for doc in split_docs]

# ----- OpenAI 임베딩 생성 -----
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# ----- FAISS 리트리버 초기화 -----
# FAISS 인덱스 생성 및 추가
vectorstore = FAISS.from_texts(documents, embedding=embeddings)
# 리트리버 생성
retriever = vectorstore.as_retriever()

# ----- Few-shot 템플릿 설정 -----
from langchain.prompts import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate


examples = """
예제 1:
문서: 창의적 문제 해결 역량의 주요 과목은 '상상속의 아이디어'와 '융합의 수학'입니다.
질문: 창의적 문제 해결 역량의 주요 과목이 무엇인가요?
답변: 창의적 문제 해결 역량의 주요 과목은 '상상속의 아이디어'와 '융합의 수학'입니다.

예제 2:
문서: 응용적 역량의 과목으로 '영화속의 건축여행'이 있습니다.
질문: 응용적 역량의 과목 중 하나를 알려주세요.
답변: 응용적
"""

# 예제
example_prompt = PromptTemplate(
    input_variables=["question", "answer"],
    template="Question: {question}\n{answer}"
)

# 프롬프트 템플릿
prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix="""
너는 상명대학교 수강신청에 관한 질문에 답변하는 도우미야. 또한 일상적인 대화도 자연스럽게 이어나갈 수 있어.
지침:
- **무조건 한국어로** 친숙한 어투로 반말로 답변해.
""",
    suffix="""
대화 기록: {chat_history}
질문: {question}
컨텍스트: {context}

답변:
""",
    input_variables=["question", "context", "chat_history"]
)

# ----- 메모리 및 체인 생성 함수 -----
from langchain.memory import ConversationSummaryBufferMemory
from langchain.chains import ConversationalRetrievalChain

def create_memory(llm):
    return ConversationSummaryBufferMemory(
        llm=llm,
        memory_key="chat_history",
        return_messages=True,
        max_token_limit=2000,
        output_key="answer"
    )

def create_qa_chain(llm, retriever, memory, prompt):
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        combine_docs_chain_kwargs={"prompt": prompt},
        return_source_documents=True,
        return_generated_question=False,
        verbose=True
    )

def process_qa(chain, question):
    try:
        result = chain({"question": question})
        return result.get("answer", "응답을 생성할 수 없습니다.")
    except Exception as e:
        print(f"QA Chain Error: {e}")
        return None

# # ----- 5. 문서 검색 및 답변 생성 -----
# def get_rag_answer(query):
#     """
#     리트리버와 LangChain FewShotPromptTemplate을 통합하여 답변 생성
#     """
#     try:
#         # 리트리버로 관련 문서 검색
#         related_docs = retriever.get_relevant_documents(query)
#         context = "\n\n".join([doc.page_content for doc in related_docs])

#         # Few-shot 템플릿을 사용하여 프롬프트 생성
#         prompt = fewshot_template.format(context=context, question=query)

#         # ChatOpenAI 호출
#         response = chat([HumanMessage(content=prompt)])
#         return response.content.strip()
#     except Exception as e:
#         return f"오류 발생: {e}"

# ----- 디스코드 봇 초기화 -----
# Discord 봇 설정
intents = discord.Intents.default()
intents.typing = False
intents.presences = False
intents.message_content = True
# 사용자별 메모리와 QA 체인 저장소
user_memories = {}
user_qa_chains = {}

bot = commands.Bot(command_prefix='#', intents=intents)

# 비동기 잠금 설정(무한루프 때메)
lock = asyncio.Lock()
bot = commands.Bot(command_prefix='#', intents=intents)

# ----- 디스코드 봇 이벤트 -----
@bot.event
async def on_ready():
    print(f'Logged in as: {bot.user}')

@bot.event
async def on_message(message):
    if message.author == bot.user:
        return

    if message.content.startswith(bot.command_prefix):
        await bot.process_commands(message)
        return

    user_id = str(message.author.id)

    async with lock:
        async with message.channel.typing():
            try:
                if user_id not in user_memories:
                    user_memories[user_id] = create_memory(llm)
                    user_qa_chains[user_id] = create_qa_chain(llm, retriever, user_memories[user_id], prompt)

                response = await asyncio.to_thread(
                    process_qa,
                    user_qa_chains[user_id],
                    message.content
                )

                if response:
                    await message.channel.send(response)
                else:
                    await message.channel.send("죄송합니다. 응답을 생성하는 데 문제가 발생했습니다.")

            except Exception as e:
                print(f"Error: {e}")
                await message.channel.send("요청을 처리하는 동안 오류가 발생했습니다.")



@bot.command()
@commands.has_role("관리자")
async def logout(ctx):
    await ctx.send("Logging out...")
    await bot.close()

# ----- 7. 봇 실행 -----
bot.run(DISCORD_BOT_TOKEN)
