In [1]:
import getpass
import os
import pickle

import langchain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

In [4]:
# Never hardcode the API key in the notebook: it would end up in version control
# and in shared copies. Keep a key that is already exported in the environment,
# and only prompt (without echoing the input) when none is set.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

In [5]:
# Settings for the OpenAI LLM wrapper, kept together so they are easy to tune.
llm_settings = {
    "temperature": 0.9,
    "max_tokens": 500,
}
llm = OpenAI(**llm_settings)

In [6]:
# Web pages that make up the knowledge base for the question-answering chain.
source_urls = ["https://en.wikipedia.org/wiki/OpenAI"]
loader = UnstructuredURLLoader(urls=source_urls)

In [7]:
# Fetch and parse every URL configured on the loader.
data = loader.load()

# By default the URL loader logs and skips pages it cannot fetch or parse, so a
# network or parsing problem yields an empty list instead of an exception.
# Fail here with a clear message rather than later while building the index.
if not data:
    raise RuntimeError(
        "No documents were loaded; check the configured URLs and network access."
    )

# Number of documents loaded (one per successfully loaded URL is expected).
len(data)

1

In [8]:
# Show the metadata attached to the first loaded document.
first_document = data[0]
first_document.metadata

{'source': 'https://en.wikipedia.org/wiki/OpenAI'}

In [9]:
# Splitter configuration: candidate separators in the order they are listed,
# together with the chunk size and the overlap between consecutive chunks.
splitter_options = dict(
    separators=["\n\n", "\n", ".", " "],
    chunk_size=1000,
    chunk_overlap=200,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [10]:
# Split the loaded documents into chunks and report how many were produced.
docs = text_splitter.split_documents(documents=data)
len(docs)

91

In [11]:
# Embedding model location (a path relative to this notebook) and the device
# the model should run on.
model_id = '../src/models/sentence-transformers_all-mpnet-base-v2'
model_kwargs = dict(device='cpu')

hf_embedding = HuggingFaceEmbeddings(model_name=model_id, model_kwargs=model_kwargs)



  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Build the FAISS vector index from the document chunks, using the
# HuggingFace embedding model to embed each chunk.
vectorindex = FAISS.from_documents(documents=docs, embedding=hf_embedding)

In [13]:
# Location of the serialized vector index (relative to this notebook).
file_path = "../src/data/vector_index.pkl"

# open(..., "wb") does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# NOTE(review): the whole vector store object is pickled here. Confirm this
# works with the installed LangChain version; if it does not, the store's own
# save/load helpers may be the supported route.
with open(file_path, "wb") as f:
    pickle.dump(vectorindex, f)

In [14]:
# Reload the vector index from disk. Fail loudly when the file is missing:
# silently skipping the load would leave `vectorIndex` undefined and make the
# next cell fail with an unrelated NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"Vector index not found at {file_path}; build and save the index first."
    )

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load index files that this notebook (or another trusted source) wrote.
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [15]:
# Expose the reloaded vector index as a retriever and wire it, together with
# the LLM, into a question-answering chain that also reports its sources.
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
chain


