In [9]:
# !pip install langchain faiss-cpu pypdf GitPython openpyxl sentence-transformers transformers llama-cpp-python > /dev/null
     

In [10]:
from langchain.embeddings import (
    LlamaCppEmbeddings, 
    HuggingFaceEmbeddings, 
    SentenceTransformerEmbeddings
)

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import (
    PyPDFLoader,
    DataFrameLoader,
    GitLoader
  )
import pandas as pd
import nbformat
from nbconvert import PythonExporter
import os

In [11]:

def get_text_splits(text_file, chunk_size=150, chunk_overlap=15):
  """Read a plain-text file and split its contents into chunks.

  Args:
    text_file: Path of the text file to read.
    chunk_size: Value passed to the splitter as ``chunk_size``; sizes are
      measured with ``len`` (i.e. in characters). Defaults to 150.
    chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
      Defaults to 15.

  Returns:
    list[str]: The chunks returned by
    ``RecursiveCharacterTextSplitter.split_text``.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(text_file, 'r', encoding='utf-8') as txt:
    data = txt.read()

  textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                             chunk_overlap=chunk_overlap,
                                             length_function=len)
  return textSplit.split_text(data)
     

In [12]:

# Load the sample PDF. load_and_split() returns a list of Document objects
# whose text lives in `.page_content` (inspected in the next cells).
loader = PyPDFLoader("fastfacts-what-is-climate-change.pdf")
pages = loader.load_and_split()
     

In [13]:
len(pages)

2

In [14]:
pages[0].page_content

'What Is Climate Change?\n1. Climate change  can be a natural process where temperature, rainfall, wind and \nother elements vary over decades or more. In millions of years, our world has been \nwarmer and colder than it is now. But today we are experiencing rapid warming from \nhuman activities, primarily due to burning fossil fuels that generate greenhouse gas \nemissions.\n2. Increasing greenhouse gas emissions  from human activity act like a blanket \nwrapped around the earth, trapping the sun’s heat and raising temperatures.\n3. Examples of greenhouse gas emissions that are causing climate change include \ncarbon dioxide and methane. These come from burning fossil fuels such as gasoline \nfor driving a car or coal for heating a building. Clearing land and forests can also \nrelease carbon dioxide. Landfills for garbage are another source. Energy, industry, \nagriculture and waste disposal are among the major emitters.\n4. Greenhouse gas concentrations are at their highest levels i

In [15]:

def get_pdf_splits(pdf_file, chunk_size=150, chunk_overlap=15):
  """Load a PDF and split the text of its pages into chunks.

  Args:
    pdf_file: Path of the PDF file to load with ``PyPDFLoader``.
    chunk_size: Value passed to the splitter as ``chunk_size``; sizes are
      measured with ``len`` (i.e. in characters). Defaults to 150.
    chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
      Defaults to 15.

  Returns:
    list[str]: Text chunks from all loaded documents, in order. Only the
    ``page_content`` of each document is kept; document metadata is dropped.
  """
  loader = PyPDFLoader(pdf_file)
  pages = loader.load_and_split()

  textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                             chunk_overlap=chunk_overlap,
                                             length_function=len)

  # `pages` is a list of Document objects, so each one is split separately
  # and the resulting chunks are flattened into a single list.
  return [chunk
          for pg in pages
          for chunk in textSplit.split_text(pg.page_content)]

In [16]:

def get_excel_splits(excel_file,target_col,sheet_name):
  """Load one sheet of an Excel workbook through ``DataFrameLoader``.

  The sheet ``sheet_name`` of ``excel_file`` is read with pandas (openpyxl
  engine) and wrapped in a ``DataFrameLoader`` that uses ``target_col`` as
  the page-content column. Returns whatever that loader's ``load()`` yields.
  """
  sheet_frame = pd.read_excel(io=excel_file,
                              engine='openpyxl',
                              sheet_name=sheet_name)
  return DataFrameLoader(sheet_frame, page_content_column=target_col).load()

In [17]:

def get_csv_splits(csv_file):
  """Load a CSV file through ``CSVLoader`` and return its documents.

  No extra chunking is applied here: the loader's ``load()`` result is
  returned unchanged.
  """
  return CSVLoader(csv_file).load()

In [18]:

def get_ipynb_splits(notebook, chunk_size=150, chunk_overlap=15):
  """Convert a notebook to a Python script and split that script into chunks.

  Args:
    notebook: Path of the ``.ipynb`` file to read.
    chunk_size: Value passed to the splitter as ``chunk_size``; sizes are
      measured with ``len`` (i.e. in characters). Defaults to 150.
    chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
      Defaults to 15.

  Returns:
    list[str]: Chunks of the script text produced by ``PythonExporter``.
  """
  # Decode explicitly as UTF-8 so reading does not depend on the platform's
  # default locale encoding.
  with open(notebook, encoding='utf-8') as fh:
    nb = nbformat.reads(fh.read(), nbformat.NO_CONVERT)

  exporter = PythonExporter()
  # from_notebook_node returns (script_text, resources); only the text is used.
  source, _ = exporter.from_notebook_node(nb)

  textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                             chunk_overlap=chunk_overlap,
                                             length_function=len)
  return textSplit.split_text(source)

In [19]:

def get_git_files(repo_link, folder_path, file_ext, chunk_size=150, chunk_overlap=15):
  """Load files from a git repository and split their contents into chunks.

  Args:
    repo_link: Passed to ``GitLoader`` as ``clone_url``.
    folder_path: Passed to ``GitLoader`` as ``repo_path``.
    file_ext: Suffix a file path must end with to be loaded (e.g. ``".py"``);
      it is handed to ``str.endswith``.
    chunk_size: Value passed to the splitter as ``chunk_size``; sizes are
      measured with ``len`` (i.e. in characters). Defaults to 150.
    chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
      Defaults to 15.

  Returns:
    list[str]: Text chunks from all loaded files, in loader order. Only the
    ``page_content`` of each loaded document is kept.
  """
  # Only files whose path ends with `file_ext` are loaded.
  git_loader = GitLoader(clone_url=repo_link,
    repo_path=folder_path,
    file_filter=lambda file_path: file_path.endswith(file_ext))
  # The loader yields one document per matching file.
  git_docs = git_loader.load()

  textSplit = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                             chunk_overlap=chunk_overlap,
                                             length_function=len)

  # Split every file separately and flatten the chunks into a single list.
  return [chunk
          for code in git_docs
          for chunk in textSplit.split_text(code.page_content)]

In [20]:

def embed_index(doc_list, embed_fn, index_store):
  """Embed ``doc_list`` and persist it in the FAISS index at ``index_store``.

  If ``index_store`` already exists on disk, the stored index is loaded, the
  new embeddings are merged into it and the result is saved back. Otherwise
  a new index is created at that path.

  Args:
    doc_list: Non-empty sequence of either plain strings or document objects
      (the type of the first element decides which FAISS constructor is used).
    embed_fn: Embedding object handed to FAISS for building/loading the index.
    index_store: Folder path of the on-disk index.

  Returns:
    The FAISS store that was written to ``index_store`` (the merged store
    when an index already existed, the new one otherwise).

  Raises:
    ValueError: If ``doc_list`` is empty.
  """
  doc_list = list(doc_list)
  if not doc_list:
    raise ValueError("doc_list is empty; nothing to embed")

  # Choose the constructor from the element type instead of catching every
  # exception: a genuine embedding/indexing error now surfaces directly
  # rather than being hidden behind a second, unrelated failure.
  if isinstance(doc_list[0], str):
    faiss_db = FAISS.from_texts(doc_list,
                              embed_fn)
  else:
    faiss_db = FAISS.from_documents(doc_list,
                              embed_fn)

  if os.path.exists(index_store):
    # NOTE(review): loading a saved index deserializes stored data; only
    # point this at index folders you created yourself.
    local_db = FAISS.load_local(index_store,embed_fn)
    # Merge the new embeddings into the existing index store.
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
    return local_db

  faiss_db.save_local(folder_path=index_store)
  print("New store created...")
  return faiss_db

In [21]:
def get_docs_length(index_path, embed_fn):
  """Return the number of entries held in a saved FAISS index's docstore.

  The index at ``index_path`` is loaded with ``embed_fn`` and the entries of
  its docstore are counted.
  """
  loaded_store = FAISS.load_local(index_path, embeddings=embed_fn)
  # Counts entries via the docstore's private ``_dict`` mapping.
  stored_entries = loaded_store.docstore._dict.values()
  return len(stored_entries)

In [22]:

# Try out the helper functions above with an open-source embedding model
# ("all-MiniLM-L6-v2") loaded through HuggingFaceEmbeddings.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [23]:
mail_docs=get_pdf_splits("fastfacts-what-is-climate-change.pdf")

In [24]:

len(mail_docs)

53

In [25]:

mail_docs[0]

'What Is Climate Change?\n1. Climate change  can be a natural process where temperature, rainfall, wind and'

In [35]:
# Embed the PDF chunks and persist them under 'new_index'.
# embed_index() merges into the store when 'new_index' already exists, so
# re-running this cell adds the same chunks to the saved index again.
faiss_db = embed_index(doc_list=mail_docs,
            embed_fn=embeddings,
            index_store='new_index')

Merge completed
Updated index saved


In [31]:

get_docs_length(index_path='new_index',embed_fn=embeddings)

79

In [32]:

test_idex = FAISS.load_local("new_index",embeddings)

In [34]:
test_idex.similarity_search("Stellar Nursery in Perseus")

[Document(page_content='wrapped around the earth, trapping the sun’s heat and raising temperatures.', metadata={}),
 Document(page_content='where everything is connected, changes in one area can influence changes in all', metadata={}),
 Document(page_content='like sea-level rise and saltwater intrusion have advanced to the point where whole', metadata={}),
 Document(page_content='temperature rise is only the beginning of the story. Because the Earth is a system,', metadata={})]

In [36]:
question = "Tell me about all-white mushrooms with large fruiting bodies"
test_idex.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(page_content='release carbon dioxide. Landfills for garbage are another source. Energy, industry, \nagriculture and waste disposal are among the major emitters.', metadata={}),
 Document(page_content='other elements vary over decades or more. In millions of years, our world has been', metadata={})]

In [38]:
# Disabled experiment with an SVM-based retriever, kept for reference only.
# NOTE(review): it refers to `all_splits` and `OpenAIEmbeddings`, neither of
# which is defined or imported in the visible cells.
# from langchain.retrievers import SVMRetriever

# svm_retriever = SVMRetriever.from_documents(all_splits,OpenAIEmbeddings())
# docs_svm=svm_retriever.get_relevant_documents(question)
# len(docs_svm)

# Alias the loaded FAISS index; the retrieval cells below use `vectordb`.
vectordb=test_idex

In [39]:
question = "What are major topics for this class?"
docs = vectordb.similarity_search(question,k=3)
len(docs)

3

In [42]:
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
import base64
import textwrap
from langchain.embeddings import SentenceTransformerEmbeddings,HuggingFaceEmbeddings
from langchain.vectorstores import Chroma,FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

In [44]:
# Build the local text2text LLM: tokenizer + seq2seq model -> transformers
# pipeline -> LangChain HuggingFacePipeline wrapper (`local_llm`).
# NOTE(review): the commented-out `@st.cache_resource` lines look like
# leftovers from a Streamlit version of this code — confirm before removing.
# @st.cache_resource

# NOTE(review): "LaMini-T5-738M" has no organisation prefix, so this presumably
# points at a local directory next to the notebook — confirm.
checkpoint = "LaMini-T5-738M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype= torch.float32
)

# @st.cache_resource

# Sampling is enabled (do_sample=True) and no random seed is set in the
# visible cells, so generated answers may differ between runs.
pipe = pipeline(
    'text2text-generation',
    model = base_model,
    tokenizer = tokenizer,
    max_length = 256,
    do_sample = True,
    temperature = 0.3,
    top_p = 0.95
)
local_llm= HuggingFacePipeline(pipeline=pipe)


In [46]:
llm = local_llm

In [47]:
question = "What are major topics for this class?"
docs = vectordb.similarity_search(question,k=3)
len(docs)


3

In [48]:
from langchain.chains import RetrievalQA
# Basic retrieval QA chain: `llm` answers using documents fetched from
# `vectordb` through its retriever interface.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [49]:
result = qa_chain({"query": question})

In [50]:
result["result"]

'Major topics for this class include infrastructure and natural ecosystems, climate change, and the Paris Agreement.'

In [51]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [52]:
# Rebuild `qa_chain` (replacing the earlier one) so that it uses the custom
# QA_CHAIN_PROMPT and also returns the retrieved source documents, which
# are read from result["source_documents"] further down.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [53]:
question = "Is probability a class topic?"

In [54]:
result = qa_chain({"query": question})

In [55]:
result["result"]

'No, probability is not a class topic.'

In [56]:
result["source_documents"][0]

Document(page_content='5. Many people think climate change mainly means warmer temperatures. But', metadata={})

In [57]:
from langchain.memory import ConversationBufferMemory
# Conversation memory handed to the ConversationalRetrievalChain built in the
# following cell; the history is kept under the "chat_history" key.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [58]:
from langchain.chains import ConversationalRetrievalChain
# Conversational QA chain: same LLM and FAISS-backed retriever as before,
# plus `memory` so that earlier turns are available to follow-up questions.
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory)

In [59]:
question = "Is probability a class topic?"
result = qa({"question": question})

In [60]:
result['answer']

'No.'

In [61]:
question = "why are those prerequesites needed?"
result = qa({"question": question})

In [62]:
result['answer']

'The passage states that adaptation will be required everywhere, but must be prioritized now for the most vulnerable people with the fewest resources to cope with climate hazards.'