In [1]:
##Data Ingestion
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
from pprint import pprint


In [2]:
# Step 2: Load the scraped data JSON file

loader = JSONLoader(
    file_path='C:/Users/yahoo/OneDrive/Desktop/project sample ready/Data Ingestion/scraped_data.json',
    jq_schema='.content',
    text_content=False)

text_documents = loader.load()
text_documents

[Document(metadata={'source': 'C:\\Users\\yahoo\\OneDrive\\Desktop\\project sample ready\\Data Ingestion\\scraped_data.json', 'seq_num': 1}, page_content='Kaiser Permanente - Wikipedia\nJump to content\nMain menu\nMain menu\nmove to sidebar\nhide\nNavigation\nMain page\nContents\nCurrent events\nRandom article\nAbout Wikipedia\nContact us\nDonate\nContribute\nHelp\nLearn to edit\nCommunity portal\nRecent changes\nUpload file\nSearch\nSearch\nAppearance\nCreate account\nLog in\nPersonal tools\nCreate account\nLog in\nPages for logged out editors\nlearn more\nContributions\nTalk\nContents\nmove to sidebar\nhide\n(Top)\n1\nStructure and governance\nToggle Structure and governance subsection\n1.1\nGovernance\n1.2\nOperations\n1.3\nRegional entities\n1.4\nLobbying entity\n2\nHistory\nToggle History subsection\n2.1\nEarly years\n2.2\nWorld War II\n2.3\nPostwar growth\n2.4\nManaged care era\n2.5\nRegional evolution\n2.6\nKP HealthConnect\n2.7\nInternational reputation\n2.8\n2023 strike\n3\nQu

In [3]:
import os 
from dotenv import load_dotenv
load_dotenv()

os.environ['OPENAI_API_KEY']=os.getenv("OPENAI_API_KEY")

In [4]:
# Step 4: Split the documents into chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents=text_splitter.split_documents(text_documents)
documents[:5]

[Document(metadata={'source': 'C:\\Users\\yahoo\\OneDrive\\Desktop\\project sample ready\\Data Ingestion\\scraped_data.json', 'seq_num': 1}, page_content='Kaiser Permanente - Wikipedia\nJump to content\nMain menu\nMain menu\nmove to sidebar\nhide\nNavigation\nMain page\nContents\nCurrent events\nRandom article\nAbout Wikipedia\nContact us\nDonate\nContribute\nHelp\nLearn to edit\nCommunity portal\nRecent changes\nUpload file\nSearch\nSearch\nAppearance\nCreate account\nLog in\nPersonal tools\nCreate account\nLog in\nPages for logged out editors\nlearn more\nContributions\nTalk\nContents\nmove to sidebar\nhide\n(Top)\n1\nStructure and governance\nToggle Structure and governance subsection\n1.1\nGovernance\n1.2\nOperations\n1.3\nRegional entities\n1.4\nLobbying entity\n2\nHistory\nToggle History subsection\n2.1\nEarly years\n2.2\nWorld War II\n2.3\nPostwar growth\n2.4\nManaged care era\n2.5\nRegional evolution\n2.6\nKP HealthConnect\n2.7\nInternational reputation\n2.8\n2023 strike\n3\nQu

In [5]:
# Step 5: Generate embeddings using OllamaEmbeddings

from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="gemma",
)

In [7]:
# Step 6: Create a FAISS vectorstore to store and retrieve embeddings
import faiss
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(documents, embeddings)
retriever = vectorstore.as_retriever()


In [8]:
# Step 7: Create a retriever to fetch relevant documents based on queries

from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

llm = Ollama(model="gemma")  # Replace "valid-model-name" with an actual model name
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)



In [9]:

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langchain import hub

In [10]:
# Step 8: Define a custom prompt for the chatbot's response
custom_prompt = """
You are a highly knowledgeable chatbot, and your task is to answer user questions based on the following documents.
Respond in a concise and informative manner.

Documents:
{context}

Question: {question}

Answer:
"""

In [11]:
from langchain.prompts import PromptTemplate
prompt_template = PromptTemplate(
    template=custom_prompt,
    input_variables=["context", "question"]
)


In [12]:
# Step 10: Set up the chatbot with OpenAI for response generation using the custom prompt
llm = Ollama(model="gemma")  

In [13]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Using the "stuff" chain type
    retriever=retriever,
    return_source_documents=True,  # This will also return the source documents for reference
    chain_type_kwargs={"prompt": prompt_template}  # Pass the custom prompt here
)

In [17]:
def chat_with_bot(query):
    # Call qa_chain directly as it supports multiple outputs
    response = qa_chain({"query": query})  # or use qa_chain([{"query": query}])
    
    # Extract the answer from the 'result' key
    answer = response['result']
    
    # Optionally, you can also access the source documents if needed
    source_documents = response.get('source_documents', [])
    
    # Return just the answer for now
    return answer


In [19]:
# Example usage of the chat_with_bot function
user_input = "Tell me about the text."
response = chat_with_bot(user_input)

# Print the chatbot's response
if isinstance(response, list):
    # Assuming the first item in the list is the response dictionary
    response_dict = response[0]
    answer = response_dict.get('result', 'No result found')
    print(answer)
else:
    print("Unexpected response format:", response)



Unexpected response format: The provided text discusses Kaiser Permanente, a healthcare organization founded in the United States. It highlights KP's performance practices, its financial structure, and its founding story.

**Key points:**

* **Performance practices:** KP emphasizes preventive care, salaried doctors, and efficient hospital care to reduce costs.
* **Financial structure:** KP is a non-profit organization that provides health insurance plans and invests in hospitals.
* **Founding story:** Kaiser Permanente was founded by Sidney Garfield in partnership with Industrial Indemnity executives.
* **Medical groups:** KP works with physician-owned medical groups that provide care to members.
