In [25]:
%pwd


'd:\\devops\\project\\genchat'

In [2]:
import os
# Step up one directory only when the relative 'Data' folder is not already
# visible from the current working directory. An unconditional chdir climbs one
# more level every time this cell is re-run.
# NOTE(review): assumes the notebook sits one level below the folder that holds
# 'Data' — confirm against the repository layout.
if not os.path.isdir("Data"):
    os.chdir("../")

In [26]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [27]:
def load_pdf_file(data):
    """Load the PDF files that match ``*.pdf`` in the ``data`` directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [28]:
# Load the PDFs from the relative 'Data/' folder; the path is resolved against
# the current working directory.
extracted_data = load_pdf_file(data='Data/')

In [29]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, e.g. the output of
            ``load_pdf_file``.
        chunk_size: Upper bound on the size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that was previously hard-coded.
        chunk_overlap: Amount of overlap between consecutive chunks,
            forwarded to the splitter. Defaults to 20, the value that was
            previously hard-coded.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [30]:
# Chunk the loaded documents and report how many chunks were produced, as a
# quick sanity check before embedding.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 6368


In [31]:
from langchain.embeddings import HuggingFaceEmbeddings

In [32]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Create the embedding model used to vectorise the text chunks.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )

In [33]:
# Instantiate the embedding model once so later cells can reuse it.
embeddings = download_hugging_face_embeddings()

In [34]:
from dotenv import load_dotenv
load_dotenv()

True

In [35]:
import os
# Read the service credentials from the process environment; each name is
# None when the corresponding variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Connect to Pinecone with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "genchat"

# Creating an index whose name is already taken fails, so create it only when
# it is missing. This keeps the cell safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the embedding model's vector size; assumes
        # all-MiniLM-L6-v2 emits 384-dimensional vectors — confirm.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [37]:
import os
# Write the key back into the process environment under PINECONE_API_KEY.
# NOTE(review): presumably so langchain_pinecone can read it from the
# environment — confirm. Raises TypeError when the key is None, because
# os.environ only accepts string values.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [16]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the chunks, using the embedding model and the
# target index name.
# NOTE(review): re-running this cell sends the same chunks again — confirm
# whether that duplicates vectors in the index.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [38]:
docsearch


<langchain_pinecone.vectorstores.PineconeVectorStore at 0x28c44200b50>

In [39]:
# Wrap the vector store as a retriever using similarity search, with k=3 passed
# through search_kwargs (presumably the number of matches returned — confirm).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [40]:
# Smoke-test the retriever with a sample query before wiring it into a chain.
retrieved_docs = retriever.invoke("BOPP?")

In [41]:
retrieved_docs

[Document(id='dddc4b66-7081-4d4d-81d2-18d2dd76772d', metadata={'author': 'Rjun', 'creationdate': '2023-02-19T14:00:59+06:00', 'creator': 'CorelDRAW 2020', 'moddate': '2023-05-19T04:55:23+00:00', 'page': 0.0, 'page_label': '1', 'producer': 'iLovePDF', 'source': 'Data\\TDS-T18TTOLG.pdf', 'title': 'TDS-T18TOLG.cdr', 'total_pages': 2.0}, page_content='BOPP film. This film is specially designed for printing and lamination process for higher yield.'),
 Document(id='00074d9b-c7fa-40b6-a69a-8e1aabd1bcfe', metadata={'author': 'Rjun', 'creationdate': '2023-02-26T10:31:38+05:00', 'creator': 'CorelDRAW 2020', 'moddate': '2023-02-26T10:31:38+05:00', 'page': 0.0, 'page_label': '1', 'producer': 'Corel PDF Engine Version 22.1.1.523', 'source': 'Data\\TDS-TIHR.pdf', 'title': 'TDS-TIHR.cdr', 'total_pages': 2.0}, page_content='BOPP FILMS\nDESCRIPTION\nPRODUCT FEATURES\n-     Endurance to heat and shrinking during sealing\n-     Easy Jaw release\n-     Good wetting tension for printing and lamination\n-  

In [42]:
from langchain_openai import OpenAI
# Language model used to generate answers; temperature and max_tokens are
# passed straight to the OpenAI wrapper.
# NOTE(review): no API key is passed here, so it is presumably read from the
# OPENAI_API_KEY environment variable — confirm.
llm = OpenAI(temperature=0.4, max_tokens=500)

In [43]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instruction for the answering model. The trailing {context}
# placeholder is left unformatted here so the prompt template can fill it.
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer the question. ",
        "If you don't know the answer, say that you don't know. ",
        "Use three sentences maximum and keep the answer concise.",
        "\n\n",
        "{context}",
    ]
)


# Two-message chat template: the system instruction, followed by the human
# turn carrying the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [44]:
# Combine the LLM and prompt into a document-answering chain, then put the
# retriever in front of it to form the full retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [47]:
# Confirm the key was loaded without echoing the secret itself: printed values
# are saved in the notebook output and leak when the file is shared or committed.
print("OPENAI_API_KEY loaded:", bool(OPENAI_API_KEY))

[REDACTED — an OpenAI API key was printed in this output; revoke and rotate that key]
