In [73]:
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import json
from typing import Iterable

def load_docs_from_jsonl(file_path)->Iterable[Document]:
    """Load ``Document`` objects from a JSON Lines file.

    Every non-blank line is parsed as one JSON object, and its keys are
    passed to ``Document`` as keyword arguments.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one ``Document`` per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    loaded = []
    # Decode as UTF-8 explicitly (the JSON interchange encoding) so the result
    # does not depend on the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Blank lines (for instance a trailing newline at the end of the
            # file) carry no record and would make json.loads raise.
            if not line.strip():
                continue
            loaded.append(Document(**json.loads(line)))
    return loaded


# Corpus location and chunking parameters, named so they are easy to find and tune.
DOCS_JSONL_PATH = "./docs/data_web_complete.jsonl"
CHUNK_SIZE = 400
CHUNK_OVERLAP = 100

# Read the full corpus, then cut it into overlapping chunks for embedding.
documents = load_docs_from_jsonl(DOCS_JSONL_PATH)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(documents)


In [74]:

from langchain.embeddings.openai import OpenAIEmbeddings
import os
from getpass import getpass

# Never write the API key into the notebook, and never overwrite a key that is
# already configured: use OPENAI_API_KEY from the environment when it is set,
# and only prompt for it (without echoing it) when it is missing or empty.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embedding client used to vectorise the document chunks.
embeddings = OpenAIEmbeddings()

In [75]:
from langchain.vectorstores import Chroma

# Directory where the Chroma index is persisted between runs.
PERSIST_DIR = "./chroma_db_extended"

# Re-running `from_documents` against an existing persisted index embeds the
# whole corpus again (paid API calls) and adds the chunks a second time, which
# yields duplicate search hits. Reuse the stored index when one is present.
# Delete PERSIST_DIR to force a rebuild after changing the corpus or chunking.
if os.path.isdir(PERSIST_DIR) and os.listdir(PERSIST_DIR):
    db = Chroma(persist_directory=PERSIST_DIR, embedding_function=embeddings)
else:
    db = Chroma.from_documents(docs, embeddings, persist_directory=PERSIST_DIR)

In [76]:
# Sanity-check the vector store by querying it directly, before any LLM is involved.
# k=1 because we only want one result
db.similarity_search("who is frodo?", k=1)

[Document(page_content='Like Aragorn, Frodo Baggins also presents a differing version of the book-character, who is the nephew and pupil of Bilbo Baggins, and is hobbit-gentry, wealthy and educated, schooled in Elvish lore and all-round adventuring. He is described by Gandalf as "a stout little fellow with red cheeks, taller than some, fairer than most," and "a perky chap with a bright eye," and showing a fondness for', metadata={'source': 'http://lotr.fandom.com/wiki/Tolkien vs. Jackson: Differences Between Story and Screenplay'})]

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser


def format_docs(docs):
    """Merge the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in order, separated by one blank line.
        An empty iterable gives an empty string.
    """
    separator = "\n\n"
    texts = [item.page_content for item in docs]
    return separator.join(texts)

# Wrap the vector store as a retriever; no arguments are passed, so the
# library's default search settings apply.
retriever = db.as_retriever()


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
from getpass import getpass

# Prompt that restricts the model to the retrieved context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# SECURITY: never hardcode an API key in the notebook. Any key that has been
# committed in source must be treated as compromised and revoked.
# Use OPENAI_API_KEY from the environment; prompt (without echo) only when it
# is missing or empty.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Connect to GPT-3.5 Turbo.
# temperature=0 makes sampling greedy, so answers are as repeatable as the
# API allows (not strictly guaranteed to be identical on every call).
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0)

In [None]:
# Retrieval-augmented QA pipeline:
#   1. fan the incoming question out to the retriever and to a passthrough,
#   2. fill the prompt template with both values,
#   3. call the chat model,
#   4. reduce the model message to a plain string.
# The retriever output is piped through `format_docs` so that `{context}`
# receives the chunks' text joined by blank lines, instead of the raw list of
# Document objects (whose repr and metadata would otherwise end up in the prompt).
chain = (
    RunnableParallel({"context": retriever | format_docs, "question": RunnablePassthrough()})
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the whole chain on a sample question; the bare last expression makes the
# returned answer the cell's output.
query = "who is frodo?"
chain.invoke(query)