In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import os
import shutil
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

  from .autonotebook import tqdm as notebook_tqdm



In [2]:
model_name = "google/flan-t5-base"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
def generate(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)
    return output


In [4]:
prompt = f"""
What is Global Risk?
"""
generate(prompt)

'Global Risk'

In [5]:
loaders = [
    PyPDFLoader(r"D:\LLM\WEF_The_Global_Risks_Report_2024.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [6]:
len(docs)

124

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 150,
    chunk_overlap = 20
)
splits = text_splitter.split_documents(docs)

In [9]:
len(splits)

3700

In [10]:
persist_directory = 'docs/chroma/'

In [11]:
!rm -rf ./docs/chroma #if have any other data in db

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [12]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,               
    persist_directory=persist_directory
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [13]:
print(vectordb._collection.count())

3737


In [14]:
question = "What is Global Risk?"
doc1 = vectordb.similarity_search(question,k=5)
#len(doc1)
doc1[2].page_content

'How these global risks evolve will reflect the global'

In [15]:
context = "\n\n---\n\n".join([doc.page_content for doc in doc1])

In [16]:
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

In [18]:
final_prompt = QA_CHAIN_PROMPT.format(context=context, question=question)
print(final_prompt)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
“Global risk” is defined as the possibility of the

---

“Global risk” is the possibility of the occurrence

---

How these global risks evolve will reflect the global

---

the possible short-term and country-level manifestations of global risks. To ensure legibility, the names of some of the global risks have been

---

global risks over time and identify areas of key concern.
Question: What is Global Risk?
Helpful Answer:


In [19]:
inputs = tokenizer(final_prompt, return_tensors="pt")
output1 = original_model.generate(
         inputs["input_ids"],
         max_new_tokens=200,
     )
output1
original_output = tokenizer.decode(output1[0], skip_special_tokens=True)
original_output


'The possibility of the occurrence'

In [20]:
def model(question):
    doc1 = vectordb.similarity_search(question,k=5)
    context = "\n\n---\n\n".join([doc.page_content for doc in doc1])
    final_prompt = QA_CHAIN_PROMPT.format(context=context, question=question)
    inputs = tokenizer(final_prompt, return_tensors="pt")
    output1 = original_model.generate(
         inputs["input_ids"],
         max_new_tokens=200,
     )
    original_output = tokenizer.decode(output1[0], skip_special_tokens=True)
    return original_output

In [21]:
model("What is Global Risk?") #Using this model function you can ask any question about globle risk

'The possibility of the occurrence'