In [1]:
# pip install pinecone-client==2.2.4

In [2]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [3]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [4]:
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [5]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [6]:
extracted_data = load_pdf("data/")

In [7]:
print(extracted_data)

[Document(page_content='Unit 1 - Information Security - www\n Information Security (Rajiv Gandhi Proudyogiki Vishwavidyalaya)\nScan to open on Studocu\nStudocu is not sponsored or endorsed by any college or university\n Unit 1 - Information Security - www\n Information Security (Rajiv Gandhi Proudyogiki Vishwavidyalaya)\nScan to open on Studocu\nStudocu is not sponsored or endorsed by any college or university\n Downloaded by ?  (freetoweb5@gmail.com)\nlOMoARcPSD|39362990\n', metadata={'source': 'data\\Unit1IS.pdf', 'page': 0}), Document(page_content="Please do not share these notes on apps like WhatsApp or Telegram.\nThe revenue we generate from the ads we show on our website and app \nfunds our services. The generated revenue helps us prepare new notes \nand improve the quality of existing study materials , which are \navailable on our website and mobile app.\nIf you don't use our website and app directly, it will hurt our revenue, \nand we might not be able to run the services and h

In [8]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [9]:
text_chunks = text_split(extracted_data)
print("length of text chunks: ", len(text_chunks))

length of text chunks:  76


In [10]:
#download embedding mdoel

def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name ="sentence-transformers/all-MiniLM-L6-v2")

    return embeddings

In [11]:
embeddings = download_hugging_face_embeddings()

In [12]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [13]:
#Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)

index_name = "courseoutline"

#Creating Embeddings for Each of The Text Chunks & storing
docsearch=Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

In [14]:
#If we already have an index we can load it like this
docsearch=Pinecone.from_existing_index(index_name, embeddings)

query = "Types of Substitution Techniques "

docs=docsearch.similarity_search(query, k=3)

print("Result", docs)

Result [Document(page_content='different from the substitution technique. On the one hand, t he substitution technique substitutes a plain text \nsymbol with a ciphertext symbol. On the other hand, the t ransposition technique executes permutation on the \nplain text to obtain the ciphertext. \nTransposition Techniques are : \n1. Rail Fence Transposition\n2. Columnar Transposition\n3. Improved Columnar Transposition\n4. Book Cipher/Running Key Cipher\nRail Fence Cipher', metadata={}), Document(page_content='different from the substitution technique. On the one hand, t he substitution technique substitutes a plain text \nsymbol with a ciphertext symbol. On the other hand, the t ransposition technique executes permutation on the \nplain text to obtain the ciphertext. \nTransposition Techniques are : \n1. Rail Fence Transposition\n2. Columnar Transposition\n3. Improved Columnar Transposition\n4. Book Cipher/Running Key Cipher\nRail Fence Cipher', metadata={}), Document(page_content="The H

In [15]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [16]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [17]:
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [18]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

In [19]:
user_input=input(f"Input Prompt:")
result=qa({"query": user_input})
print("Response : ", result["result"])

Response :  The Mono-Alphabetic Cipher is a type of encryption technique where each letter in the plaintext is replaced by a different letter a fixed number of positions down the alphabet. For example, if the key is 3, then 'A' will be replaced by 'D', 'B' will be replaced by 'E', and so on. If the key is known, it is relatively easy to break the cipher by trying all possible combinations of letters until the correct one is found.
