In [39]:

import openai, langchain, pinecone


from langchain.document_loaders import DirectoryLoader, TextLoader,UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

from langchain.llms import OpenAI
from langsmith.wrappers import wrap_openai
from langsmith import traceable

from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA

import gradio as gr

# from langsmith import LangSmithClient
from langchain.prompts import PromptTemplate

In [6]:
# Open the data file and read its content

loader = DirectoryLoader('./data', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [7]:
# Set up the RecursiveCharacterTextSplitter, then Split the documents

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [None]:
texts[0]

In [None]:
import os
from openai import OpenAI
import getpass
from dotenv import load_dotenv

client = OpenAI(
    api_key = os.getenv("OPENAI_API_KEY")
)

# Function to get embeddings
def get_embeddings(text):
    response = client.embeddings.create(
      input=text,
      model="text-embedding-3-small"
    )
    return response.data[0].embedding

# Get embeddings for each chunk of the PDF
embeddings = [get_embeddings(page.page_content) for page in texts]

print(len(embeddings[0]))

In [10]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
# Pinecone related setup

pc = Pinecone(api_key='025ab0cd-e9e2-4114-89bd-2160e36f4acd')

# Set the index name for this project in pinecone first

index_name = 'rag'


In [11]:
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=(len(embeddings[0])),
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

index = pc.Index(index_name)

In [None]:
# Prepare vectors with metadata
vectors_to_upsert = [
    (str(i), embedding, { "text": page.page_content})
    for i, (embedding, page) in enumerate(zip(embeddings, texts))
]

# Perform upsert
index.upsert(vectors_to_upsert)

print(f"Upserted {len(vectors_to_upsert)} vectors into the Pinecone index.")

In [None]:
# Examine pinecone index. Delete all vectors, if you want to start fresh

index = pc.Index(index_name)
index.describe_index_stats()
#index.delete(deleteAll='true', namespace='')

In [30]:
# Initialize RetrievalQA with tracing
LANGCHAIN_TRACING_V2=True
LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
LANGCHAIN_API_KEY=os.getenv('LANGCHAIN_API_KEY')
LANGCHAIN_PROJECT="rag"
llm = traceable(client.chat.completions ,model="gpt-3.5-turbo")

In [31]:
# Define the legal chatbot prompt
legal_prompt = PromptTemplate(
    input_variables=["question"],
    template="""You are a legal assistant chatbot specialized in providing concise and relevant information based on legal documents.

    Question: {question}

    Please provide a legal answer considering the context of legal practices and common law terminology.
    """
)

# Optional

In [44]:
# # Don't run this cell

# if index_name not in pinecone.list_indexes():
#     print("Index does not exist: ", index_name)
#     # docsearch = Pinecone.from_documents(texts, embeddings, index_name = index_name)
# else:
#     print("Index exists: ", index_name)
#     # docsearch = Pinecone.from_existing_index(index_name, embeddings)


In [32]:
# Prepare the embedding so that we can pass it to the pinecone call in the next step
import os
from dotenv import load_dotenv
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings(api_key=os.getenv(OPENAI_API_KEY))


In [33]:
#Initialize a LangChain embedding object

from langchain_openai import OpenAIEmbeddings  
# get openai api key from platform.openai.com  
model_name = 'text-embedding-ada-002'  
embeddings = OpenAIEmbeddings(  
    model=model_name,  
    openai_api_key=OPENAI_API_KEY  
)  

In [34]:
#Initialize the LangChain vector store

from langchain_pinecone import PineconeVectorStore  
text_field = "text"  
vectorstore = PineconeVectorStore(  
    index, embeddings, text_field  
)  

In [None]:
query = "name to whom the document is sent"  
vectorstore.similarity_search(  
    query,  # our search query  
    k=3  # return 3 most relevant docs  
)  

# Response:
# [Document(page_content='Benito Amilcare Andrea Mussolini KSMOM GCTE (29 July 1883 – 28 April 1945) was an Italian politician and journalist...', metadata={'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini', 'title': 'Benito Mussolini', 'wiki-id': '6754'}),  
# Document(page_content='Fascism as practiced by Mussolini\nMussolini\'s form of Fascism, "Italian Fascism"- unlike Nazism, the racist ideology...', metadata={'chunk': 1.0, 'source': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini', 'title': 'Benito Mussolini', 'wiki-id': '6754'}),  
# Document(page_content='Veneto was made part of Italy in 1866 after a war with Austria. Italian soldiers won Latium in 1870. That was when...', metadata={'chunk': 5.0, 'source': 'https://simple.wikipedia.org/wiki/Italy', 'title': 'Italy', 'wiki-id': '363'})]


In [None]:
 # To do this, initialize a RetrievalQA object like so:
from langchain_openai import ChatOpenAI  
from langchain.chains import RetrievalQA  

# completion llm  
llm = ChatOpenAI(  
    openai_api_key=OPENAI_API_KEY,  
    model_name='gpt-3.5-turbo',  
    temperature=0.0  
)  
qa = RetrievalQA.from_chain_type(  
    llm=llm,  
    chain_type="stuff",  
    retriever=vectorstore.as_retriever()  
)  
qa.invoke(query)  

# Response:
# Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of fascism in Italy...


# Source (Optional)

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain  

qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(  
    llm=llm,  
    chain_type="stuff",  
    retriever=vectorstore.as_retriever()  
)  
qa_with_sources.invoke(query)

# Response:
# {'question': 'who was Benito Mussolini?',  
# 'answer': "Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of fascism in Italy...",  
# 'sources': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini'}  


In [41]:
# Gradio Interface
def legal_chatbot(query):
    if query:
        # Get the response from the legal chatbot
        response = qa.invoke(query)
        return response
    return "Please enter a valid legal question."

# Set up Gradio UI
iface = gr.Interface(
    fn=legal_chatbot,  # the function that processes input
    inputs="text",     # input type
    outputs="text",    # output type
    title="Legal Assistant Chatbot with RAG and LangSmith",
    description="Ask legal questions, and the chatbot will provide relevant information based on legal documents."
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861


OSError: [WinError 225] Operation did not complete successfully because the file contains a virus or potentially unwanted software: 'c:\\Users\\vinay\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\gradio\\frpc_windows_amd64_v0.2'