# Baseline RAG LLM

何も手を加えず、RAGを組んで実行する

In [1]:
import os
import pandas as pd
import openai
import datetime
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [2]:
# OpenAI APIキーを設定
openai.api_key = input()

In [3]:
novel_file_path = "../data/novels_preprocess/works/"

In [4]:
# .txt ファイルを読み込み、ドキュメントをリスト化
documents = []
for filename in os.listdir(novel_file_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(novel_file_path, filename)
        loader = TextLoader(file_path, encoding="utf-8")
        documents.extend(loader.load())

In [6]:
# テキストを分割するためのテキストスプリッターを定義
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=300, chunk_overlap=0)
split_docs = text_splitter.split_documents(documents)

In [1]:
# for i in range(len(split_docs)):
    # print(split_docs[i].page_content)

In [8]:
# OpenAIの埋め込みモデルを使ってドキュメントをベクトル化
embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)

  embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)


In [9]:
# FAISSでベクトルストアを作成
vectorstore = FAISS.from_documents(split_docs, embedding)

In [10]:
# OpenAIの言語モデルを設定（ここではGPT-3を使用）
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=openai.api_key)

  llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=openai.api_key)


In [11]:
# 検索用のQAチェーンを構築
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" モードはシンプルに関連ドキュメントをまとめて渡すモード
    retriever=vectorstore.as_retriever(),
    return_source_documents=True  # 検索結果としてソースドキュメントを返す
)

## 質問ファイルを読み込んでQ&Aを作成

In [12]:
# 提供されたCSVファイルを読み込み
query_df = pd.read_csv("../data/query.csv", encoding="utf-8")

In [5]:
query_df.head()

In [2]:
answers = []
evidences = []

for _, row in query_df.iterrows():
    print(_)
    problem = row["problem"]
    print(problem)
    result = qa_chain({"query": problem})
    answer = result["result"]
    print(answer)
    evidence = result["source_documents"][0].page_content # 証拠部分を抽出
    answers.append(answer)
    evidences.append(evidence)

In [15]:
# DataFrameに回答と証拠を追加
query_df['full_answer'] = answers
query_df['full_evidence'] = evidences

In [3]:
query_df.head()

In [17]:
# 結果をCSVファイルとして保存
dt_now = datetime.datetime.now()
ymdm = dt_now.strftime("%Y%m%d-%H%M")

query_df.to_csv(f"../results/{ymdm}_output_with_answers_and_evidence.csv", index=False)

## 50トークンに収まるように回答を要約

In [18]:
# LLMを使ってanswerに基づき、evidenceから200文字程度を抜き出す関数
def extract_relevant_evidence(full_answer, full_evidence):
    extract_prompt = PromptTemplate(
        input_variables=["full_answer", "full_evidence"],
        template=
            """
                f"以下は回答と関連する証拠文です。"
                f"回答に必要な部分を200文字以内で抜き出してください。\n"
                f"回答: {full_answer}\n\n"
                f"証拠文: {full_evidence}\n"
            """
    )
    chain = extract_prompt | llm

    response = chain.invoke(
        {"full_answer": full_answer, "full_evidence": full_evidence}
    )
    return response.content

In [19]:
# full_evidenceを使って関連する部分を抜き出す
query_df['evidence'] = query_df.apply(
    lambda row: extract_relevant_evidence(row['full_answer'], row['full_evidence']),
    axis=1
)

In [20]:
replace_dict = {
        "\n": "",
        "\r": "",
    }
query_df = query_df.replace(
        {"full_answer": replace_dict},
        regex=True
    )
query_df = query_df.replace(
        {"evidence": replace_dict},
        regex=True
    )

In [48]:
# LLMを使って要約を行う関数
def summarize_answer(problem: str, full_answer: str, full_evidence: str) -> str:

    summarize_prompt = PromptTemplate(
        input_variables=["problem", "full_answer", "full_evidence"],
        template=
            """
                以下の質問に対する回答の文章を証拠を元に50文字以内に収まるように簡潔に答え直してください。\n
                分からない場合は「分かりません」とだけ答えてください。\n
                f"質問: {problem}\n\n"
                f"回答: {full_answer}\n"
                f"証拠: {full_evidence}\n"
            """
    )
    chain = summarize_prompt | llm

    response = chain.invoke(
        {"problem": problem, "full_answer": full_answer, "full_evidence": full_evidence}
    )
    return response.content

In [49]:
# tiktokenとgpt-4のトークナイザーを取得
enc = tiktoken.encoding_for_model("gpt-4-2024-08-06")

# query_df の "answer" 列のトークン数を計算し、50トークンを超える場合は要約を行う関数
def check_and_summarize_answers(query_df: pd.DataFrame) -> pd.DataFrame:
    def summarize_if_needed(problem: str, full_answer: str, full_evidence: str) -> str:
        # トークン数を計算
        token_count = len(enc.encode(answer))
        
        # トークン数が50を超えた場合は要約する
        if token_count > 50:
            # LLMを使って要約
            summarized_answer = summarize_answer(problem, full_answer, full_evidence)
            return summarized_answer
        return answer

    # "answer" 列に対して処理を適用
    query_df["answer"] = query_df["full_answer"]
    for i in range(len(query_df.index)):
        query_df["answer"][i] = summarize_answer(query_df["problem"][i], query_df["full_answer"][i], query_df["full_evidence"][i])
    return query_df

In [50]:
query_df = check_and_summarize_answers(query_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_df["answer"][i] = summarize_answer(query_df["problem"][i], query_df["full_answer"][i], query_df["full_evidence"][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_df["answer"][i] = summarize_answer(query_df["problem"][i], query_df["full_answer"][i], query_df["full_evidence"][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_df["answer"][i] = summarize_answer(query_df["problem"][i], query_df["full_answer"][i], query_df["full_evidence"][i])
A value

In [4]:
query_df.head(10)

In [52]:
# 必要な列（id, answer, evidence）をヘッダなしでCSVに書き出し
query_df[['index', 'answer', 'evidence']].to_csv(
    "../submit/predictions.csv",
    index=False,
    header=False,
    encoding="utf-8-sig"
)

In [53]:
# backup
dt_now = datetime.datetime.now()
ymdm = dt_now.strftime("%Y%m%d-%H%M")

query_df[['index', 'problem', 'full_answer', 'answer', 'full_evidence', 'evidence']].to_csv(
    f"../submit/{ymdm}_predictions.csv",
    index=False,
    header=True,
    encoding="utf-8-sig"
)