<a href="https://colab.research.google.com/github/SakayanagiTOYOTA/forLLM_edu/blob/main/readpdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDFファイルからRAGを作成

参考URL https://qiita.com/tatsuki-tsuchiyama/items/179c1ded40b9c54de09a

PDFを読み込み、RAG用のベクトルストアとして保存する。本スクリプトの実行後に loadrag.py を実行すること。

In [None]:
# Settings for the Azure OpenAI backend.
import os

# Credentials and endpoint are published through the process environment.
# NOTE: the key below is a placeholder; do not commit a real key.
os.environ.update({
    "AZURE_OPENAI_API_KEY": "xxxxxxx",
    "AZURE_OPENAI_ENDPOINT": "https://aoai-aoai04-japaneast.openai.azure.com/",
})

# Deployment name / API version pairs, one pair per model.
DEPLOYMENT_GPT4o, API_VERSION_GPT4o = "gpt-4o-2024-05-13", "2024-02-15-preview"
DEPLOYMENT_EMBEDDING_ADA, API_VERSION_EMBEDDING_ADA = (
    "text-embedding-ada-002-2",
    "2023-05-15",
)
DEPLOYMENT_EMBEDDING_LARGE, API_VERSION_EMBEDDING_LARGE = (
    "text-embedding-3-large-1",
    "2023-05-15",
)

# Backend selector read by the model-loading cell.
MODEL_TYPE = "AzureOpenAI"

In [1]:
# Settings for the AWS Bedrock backend.
import os

# AWS credentials and region are published through the process environment.
# NOTE: the values below are placeholders; do not commit real credentials.
os.environ.update({
    "AWS_ACCESS_KEY_ID": "xxxxx",
    "AWS_SECRET_ACCESS_KEY": "xxxxxx",
    "AWS_SESSION_TOKEN": "xxxxxx",
    "AWS_DEFAULT_REGION": "ap-northeast-1",
})

# Backend selector read by the model-loading cell.
MODEL_TYPE = "Bedrock"

Google Colabの場合は、以下でライブラリをインストールする

In [5]:
# Install the AWS integration for LangChain, the PDF parser and the CPU build of FAISS.
# NOTE(review): later cells also import from `langchain` itself; presumably it is
# already available in the runtime — confirm before running on a fresh environment.
!pip install langchain-aws pypdf  faiss-cpu



定数設定

In [3]:
# Value handed to the text splitter (and to the Azure embeddings client) as `chunk_size`.
CHUNK_SIZE: int = 1000
# PDF file to ingest, relative to the working directory.
PDF_PATH: str = "OCPP-2.0.1_part2_specification.pdf"

モデルの読込

In [6]:
# Build the embeddings client for the backend selected by MODEL_TYPE.
if MODEL_TYPE == 'AzureOpenAI':
    from langchain.embeddings import AzureOpenAIEmbeddings

    embeddings = AzureOpenAIEmbeddings(
        azure_deployment=DEPLOYMENT_EMBEDDING_LARGE,
        # Pass the API version that was configured for this deployment explicitly,
        # instead of relying on an OPENAI_API_VERSION environment variable that
        # this notebook never sets.
        openai_api_version=API_VERSION_EMBEDDING_LARGE,
        # NOTE(review): on the embeddings client `chunk_size` appears to be the
        # number of texts sent per request, not a character count — confirm that
        # reusing CHUNK_SIZE here is intended.
        chunk_size=CHUNK_SIZE,
    )
elif MODEL_TYPE == 'Bedrock':
    from langchain_aws import BedrockEmbeddings

    embeddings = BedrockEmbeddings(
        model_id="amazon.titan-embed-text-v1",
        region_name="ap-northeast-1",
    )
else:
    # Fail here with a clear message rather than with a NameError on
    # `embeddings` in a later cell.
    raise ValueError(
        f"Unsupported MODEL_TYPE {MODEL_TYPE!r}; expected 'AzureOpenAI' or 'Bedrock'"
    )


## PDF読込

Google Colabの場合は、PDFを読み込む前にPDFファイルをアップロードしてください。

In [8]:
# Load the PDF at PDF_PATH and split it into documents.
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(file_path=PDF_PATH)
# `pages` holds the loader's split output; the later cells iterate over it.
pages = loader.load_and_split()

ページの一部を表示

In [9]:
# Show the second loaded entry (index 1) as a quick sanity check of the extraction.
print(pages[1])

page_content='Table of Contents
Disclaimer . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  1
Generic. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  2
Version History. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  3
1. Scope . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  4
1.1. OCPP 2.0.1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

## ベクトル化

In [10]:
# Create an empty vector store.
# A placeholder text is embedded to construct the store and is then deleted
# again, so the store starts without any entries.
from langchain.vectorstores import FAISS

# Use a string id: the FAISS `ids` / `delete` arguments are documented as strings.
dummy_text, dummy_id = "1", "1"
vectorstore = FAISS.from_texts([dummy_text], embeddings, ids=[dummy_id])
vectorstore.delete([dummy_id])

True

In [11]:
# Embed the loaded PDF entries one by one and add them to the vector store.
# Each entry of `pages` is processed separately so progress can be reported;
# the same page number may be printed more than once when `pages` contains
# several entries for one PDF page.
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE)
for page in pages:
    docs = text_splitter.split_documents([page])
    # Guard against an empty split result: there is nothing to embed for it.
    if docs:
        # Add straight into the existing store instead of building a temporary
        # FAISS index per entry and merging it.
        vectorstore.add_documents(docs)
    print(f"finish: {page.metadata['source']} - page {page.metadata['page']}")

finish: OCPP-2.0.1_part2_specification.pdf - page 0
finish: OCPP-2.0.1_part2_specification.pdf - page 1
finish: OCPP-2.0.1_part2_specification.pdf - page 1
finish: OCPP-2.0.1_part2_specification.pdf - page 1
finish: OCPP-2.0.1_part2_specification.pdf - page 2
finish: OCPP-2.0.1_part2_specification.pdf - page 2
finish: OCPP-2.0.1_part2_specification.pdf - page 2
finish: OCPP-2.0.1_part2_specification.pdf - page 3
finish: OCPP-2.0.1_part2_specification.pdf - page 3
finish: OCPP-2.0.1_part2_specification.pdf - page 3
finish: OCPP-2.0.1_part2_specification.pdf - page 3
finish: OCPP-2.0.1_part2_specification.pdf - page 4
finish: OCPP-2.0.1_part2_specification.pdf - page 4
finish: OCPP-2.0.1_part2_specification.pdf - page 4
finish: OCPP-2.0.1_part2_specification.pdf - page 5
finish: OCPP-2.0.1_part2_specification.pdf - page 5
finish: OCPP-2.0.1_part2_specification.pdf - page 5
finish: OCPP-2.0.1_part2_specification.pdf - page 5
finish: OCPP-2.0.1_part2_specification.pdf - page 6
finish: OCPP

## RAGとして保存

In [12]:
# Persist the vector store to the local ./vectorstore directory.
vectorstore.save_local("./vectorstore")