In [181]:
from langchain.embeddings import (
    LlamaCppEmbeddings, 
    HuggingFaceEmbeddings, 
    SentenceTransformerEmbeddings
)

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import (
    PyPDFLoader,
    DataFrameLoader,
    GitLoader
  )

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
import base64
import textwrap
from langchain.embeddings import SentenceTransformerEmbeddings,HuggingFaceEmbeddings
from langchain.vectorstores import Chroma,FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

import pandas as pd
import nbformat
from nbconvert import PythonExporter
import os

In [182]:

def get_text_splits(text_file, chunk_size=150, chunk_overlap=15, encoding=None):
  """Read a plain-text file and split its contents into chunks.

  Args:
    text_file: Path of the text file to read.
    chunk_size: ``chunk_size`` handed to RecursiveCharacterTextSplitter
      (lengths are measured with ``len``, i.e. in characters).
    chunk_overlap: ``chunk_overlap`` handed to the splitter.
    encoding: Encoding passed to ``open``; ``None`` keeps the platform
      default, which is what this function used before the parameter existed.

  Returns:
    The list of text chunks returned by the splitter's ``split_text``.
  """
  with open(text_file, 'r', encoding=encoding) as txt:
    data = txt.read()

  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap,
                                            length_function=len)
  return splitter.split_text(data)
     

In [183]:

# Load the King James Bible PDF; `pages` holds the documents returned by
# the loader's load_and_split() and is inspected in the next cell.
loader = PyPDFLoader("The-Holy-Bible-King-James-Version.pdf")
pages = loader.load_and_split()
     

In [184]:
# Sanity check: how many documents were loaded, and the text of the first one.
print(len(pages))
print(pages[0].page_content)

1342
The King James Version of the
Holy Bible
Downloaded from www.holybooks.com
www.holybooks.com


In [185]:

def get_pdf_splits(pdf_file, chunk_size=150, chunk_overlap=15):
  """Load a PDF and split the text of every loaded page into chunks.

  Args:
    pdf_file: Path of the PDF file handed to PyPDFLoader.
    chunk_size: ``chunk_size`` handed to RecursiveCharacterTextSplitter
      (lengths are measured with ``len``, i.e. in characters).
    chunk_overlap: ``chunk_overlap`` handed to the splitter.

  Returns:
    A flat list of text chunks, in page order.
  """
  loader = PyPDFLoader(pdf_file)
  pages = loader.load_and_split()

  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap,
                                            length_function=len)
  doc_list = []
  # `pages` is a list of objects exposing `page_content`; each one is split
  # separately and the resulting chunks are flattened into a single list.
  for pg in pages:
    doc_list.extend(splitter.split_text(pg.page_content))

  return doc_list


def get_excel_splits(excel_file,target_col,sheet_name):
  """Read one sheet of an Excel workbook and load it as documents.

  The sheet is read with pandas using the openpyxl engine and handed to
  DataFrameLoader with ``target_col`` as the page-content column. The
  return value is whatever the loader's ``load()`` produces.
  """
  sheet_frame = pd.read_excel(io=excel_file,
                              engine='openpyxl',
                              sheet_name=sheet_name)
  return DataFrameLoader(sheet_frame,
                         page_content_column=target_col).load()


def get_csv_splits(csv_file):
  """Load a CSV file through CSVLoader and return the loaded documents.

  No further text splitting is applied here; the result is exactly what
  the loader's ``load()`` returns.
  """
  return CSVLoader(csv_file).load()


def get_ipynb_splits(notebook, chunk_size=150, chunk_overlap=15):
  """Convert a notebook file to a Python script and split the script text.

  Args:
    notebook: Path of the .ipynb file.
    chunk_size: ``chunk_size`` handed to RecursiveCharacterTextSplitter
      (lengths are measured with ``len``, i.e. in characters).
    chunk_overlap: ``chunk_overlap`` handed to the splitter.

  Returns:
    The list of text chunks produced from the exported script.
  """
  # Notebook files are JSON stored as UTF-8; read them explicitly as such
  # instead of relying on the platform's default encoding.
  with open(notebook, encoding="utf-8") as fh:
    nb = nbformat.reads(fh.read(), nbformat.NO_CONVERT)

  # from_notebook_node returns (script_text, resources); only the text is used.
  source, _ = PythonExporter().from_notebook_node(nb)

  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap,
                                            length_function=len)
  return splitter.split_text(source)


def get_git_files(repo_link, folder_path, file_ext, chunk_size=150, chunk_overlap=15):
  """Load files from a git repository and split their contents into chunks.

  Args:
    repo_link: Clone URL handed to GitLoader as ``clone_url``.
    folder_path: Local path handed to GitLoader as ``repo_path``.
    file_ext: Suffix a file path must end with to be loaded (e.g. ".py").
    chunk_size: ``chunk_size`` handed to RecursiveCharacterTextSplitter
      (lengths are measured with ``len``, i.e. in characters).
    chunk_overlap: ``chunk_overlap`` handed to the splitter.

  Returns:
    A flat list of text chunks from all matching files.
  """
  # Keep only the files whose path ends with the requested extension.
  git_loader = GitLoader(clone_url=repo_link,
    repo_path=folder_path,
    file_filter=lambda file_path: file_path.endswith(file_ext))
  git_docs = git_loader.load()

  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap,
                                            length_function=len)
  doc_list = []
  # Each loaded item is split on its own; chunks are flattened into one list.
  for code in git_docs:
    doc_list.extend(splitter.split_text(code.page_content))

  return doc_list

In [186]:

def embed_index(doc_list, embed_fn, index_store):
  """Embed ``doc_list`` and persist it into the FAISS store at ``index_store``.

  If ``index_store`` already exists on disk, the new embeddings are merged
  into the stored index and the result is saved back; otherwise a new store
  is created at that path.

  Args:
    doc_list: Non-empty sequence of plain strings, or of document objects
      accepted by ``FAISS.from_documents``.
    embed_fn: Embedding object passed to the FAISS constructors/loader.
    index_store: Folder path of the on-disk index.

  Returns:
    The FAISS store that was saved (the merged store when one already
    existed, otherwise the newly built one).

  Raises:
    ValueError: If ``doc_list`` is empty.
  """
  doc_list = list(doc_list)
  if not doc_list:
    raise ValueError("doc_list is empty; nothing to embed")

  # Choose the constructor from the input type instead of catching every
  # exception: a blanket except would hide real embedding failures and
  # retry them with the wrong constructor.
  if all(isinstance(item, str) for item in doc_list):
    faiss_db = FAISS.from_texts(doc_list, embed_fn)
  else:
    faiss_db = FAISS.from_documents(doc_list, embed_fn)

  if os.path.exists(index_store):
    # NOTE(review): LangChain's FAISS.load_local is understood to unpickle the
    # stored docstore - only point index_store at trusted folders; confirm
    # against the installed version.
    local_db = FAISS.load_local(index_store, embed_fn)
    # Merge the new embeddings into the existing index store.
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
    return local_db

  faiss_db.save_local(folder_path=index_store)
  print("New store created...")
  return faiss_db

In [187]:
def get_docs_length(index_path, embed_fn):
  """Return how many entries the docstore of a saved FAISS index holds.

  The index at ``index_path`` is loaded with ``embed_fn`` and the size of
  its docstore's internal ``_dict`` mapping is returned.
  """
  store = FAISS.load_local(index_path, embeddings=embed_fn)
  return len(store.docstore._dict)

In [188]:
# Embedding function used for both indexing and querying below: the
# open-source "all-MiniLM-L6-v2" model loaded through HuggingFaceEmbeddings.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [189]:
# Split the Bible PDF into small text chunks (plain strings) for embedding.
docs=get_pdf_splits("The-Holy-Bible-King-James-Version.pdf")

In [190]:
# Sanity check: number of chunks produced and the content of the first one.
print(len(docs))
print(docs[0])

40034
The King James Version of the
Holy Bible
Downloaded from www.holybooks.com
www.holybooks.com


In [191]:
# Embed the PDF chunks and merge them into (or create) the on-disk index
# 'new_index'. The store is reloaded from disk in a later cell, so the call's
# result is not bound to a name here.
# NOTE: re-running this cell merges the same chunks into the existing index
# again; delete the 'new_index' folder first for a clean rebuild.
embed_index(doc_list=docs,
            embed_fn=embeddings,
            index_store='new_index');

Merge completed
Updated index saved


In [192]:
# Number of docstore entries in the saved index after the step above.
print(get_docs_length(index_path='new_index',embed_fn=embeddings))

40055


In [193]:
# Reload the persisted index from disk; this is the store queried below.
test_idex = FAISS.load_local("new_index",embeddings)

In [194]:
# Plain similarity search against the reloaded index.
test_idex.similarity_search("Education")

[Document(page_content='Federal Capital Territory Administration\nJul 2021 - Jun 2022 (1 year)\nEngineering Intern\nEducation\nAfe Babalola University', metadata={}),
 Document(page_content='knowledge. {8:10} Receive my instruction, and not silver;\nand knowledge rather than choice gold. {8:11} For wisdom', metadata={}),
 Document(page_content='increaseth learning. {16:22} Understanding [is] a wellspring\nof life unto him that hath it: but the instruction of fools [is]', metadata={}),
 Document(page_content='learning; and a man of understanding shall attain unto wise\ncounsels: {1:6} To understand a proverb, and the', metadata={})]

In [195]:
# Max-marginal-relevance search: fetch_k=3 candidates are requested and
# k=2 results are returned.
question = "Tell me his experience"
test_idex.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(page_content='him with the spirit of God, in wisdom, in understanding,\nand in knowledge, and in all manner of workmanship;', metadata={}),
 Document(page_content='if any have caused grief, he hath not grieved me, but in part:', metadata={})]

In [196]:
# Alias the loaded index under the name the retrieval chains below use.
vectordb=test_idex

In [210]:

# The pipeline below samples (do_sample=True); seed torch's RNG so that a
# fresh "Restart & Run All" produces the same generations.
SEED = 42
torch.manual_seed(SEED)

# presumably a local directory (or hub id) holding the LaMini-T5 weights -
# confirm it is reachable from the notebook's working directory.
checkpoint = "LaMini-T5-738M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype= torch.float32
)


# Text-to-text generation pipeline with low-temperature nucleus sampling.
pipe = pipeline(
    'text2text-generation',
    model = base_model,
    tokenizer = tokenizer,
    max_length = 900,
    do_sample = True,
    temperature = 0.3,
    top_p = 0.95
)
# Wrap the pipeline so LangChain chains can use it as an LLM.
local_llm= HuggingFacePipeline(pipeline=pipe)
llm = local_llm

In [211]:
question = "Who is adam?"
# Use a dedicated name for the retrieved hits so that `docs` (the list of
# PDF text chunks built earlier) is not overwritten with a different kind
# of object.
retrieved_docs = vectordb.similarity_search(question,k=3)
len(retrieved_docs)

3

In [212]:
# RetrievalQA is already imported in the notebook's import cell, so the
# duplicate import is not repeated here.
# Basic retrieval-QA chain: the vector store acts as retriever for the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [213]:
# Ask the current `question` through the chain and show the answer text.
result = qa_chain({"query": question})
result["result"]

'Adam is a man who was first formed, then Eve.'

In [214]:
from langchain.prompts import PromptTemplate

# Build the custom QA prompt. The template exposes two placeholders,
# {context} and {question}; the chain built in the next cell receives it
# through chain_type_kwargs.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [215]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [216]:
# Query the prompt-customised chain and print the answer together with the
# first source document it returned.
question = "what are his certification"
result = qa_chain({"query": question})
print(result["result"])
print(result["source_documents"][0])

The context does not provide information about his certifications.
page_content='him with the spirit of God, in wisdom, in understanding,\nand in knowledge, and in all manner of workmanship;' metadata={}


In [217]:
from langchain.memory import ConversationBufferMemory
# Conversation memory stored under the "chat_history" key; it is passed to
# the conversational chain created in the next cell.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [218]:
from langchain.chains import ConversationalRetrievalChain
# Conversational retrieval chain: same LLM and vector-store retriever as
# above, plus the chat-history memory defined in the previous cell.
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory)

In [219]:
# NOTE(review): the question is an empty string - presumably a leftover
# placeholder. The chain is still invoked with it; since `qa` was built with
# `memory`, this exchange is likely recorded in the chat history - confirm.
question = ""
result = qa({"question": question})
print(result['answer'])

The context does not provide enough information to determine if it is helpful or not.


In [226]:
# Follow-up question through the conversational chain (uses the same `qa`
# object, and therefore the same memory, as the previous cell).
question = "what were the parables Jesus told?"
result = qa({"question": question})
print(result['answer'])

Jesus was told parables.
