# Using Zephyr 7B Beta Quantised Model

* [TheBloke/zephyr-7B-beta-GGUF](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF)
* Loaded through LangChain's `CTransformers` wrapper

In [None]:
%pip install torch==2.2.1
%pip install langchain==0.1.9
%pip install langchain-community==0.0.24
%pip install ctransformers==0.2.27
%pip install streamlit==1.31.1
%pip install streamlit-extras==0.4.0
%pip install langchain==0.1.9
%pip install rank_bm25==0.2.2
%pip install pypdf==4.0.2
%pip install chromadb==0.4.24
%pip install tiktoken==0.6.0

In [None]:
import os
from langchain_community.llms import CTransformers
from langchain import PromptTemplate, LLMChain

In [None]:
# Hugging Face repository (linked at the top of the notebook) and the
# specific GGUF file to load from it.
model_id = "TheBloke/zephyr-7B-beta-GGUF"
model_file = "zephyr-7b-beta.Q4_K_S.gguf"
# Architecture name handed to the CTransformers wrapper.
model_type = "mistral"

In [None]:
# Generation and runtime settings for the model.
config = {
    "max_new_tokens": 1024,
    "repetition_penalty": 1.1,
    "temperature": 1,
    "top_k": 50,
    "top_p": 0.9,
    "stream": True,
    # Use half of the available cores. os.cpu_count() may return None and
    # halving a single core would give 0, so always request at least one thread.
    "threads": max(1, (os.cpu_count() or 2) // 2),
}

In [None]:
# Hand the settings over through the wrapper's `config` field (the same way the
# RAG section builds its model). Unpacking the dict as keyword arguments does
# not populate that field, so the settings would not be applied.
init_model = CTransformers(
    model=model_id,
    model_file=model_file,
    model_type=model_type,
    config=config,
    lib="avx2",
)

## Without Prompt Template

In [None]:
# Free-form question sent to the model as-is, with no prompt template around it.
query = "what is the meaning of the life ?"

In [None]:
# Use the Runnable `invoke` entry point (as the RAG chain at the end of the
# notebook does) rather than calling the model object directly, which is deprecated.
result = init_model.invoke(query)
print(result)

## With Prompt Template

In [None]:
# Prompt text for the simple chain; `{question}` is its only placeholder.
template = (
    "You are a helpful AI Assistant that follows instructions extremely well.\n"
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step and answer it faithfully.\n"
)

In [None]:
# Wrap the template text; `question` is the single variable it expects.
prompt = PromptTemplate(
    input_variables=["question"],
    template=template,
)

In [None]:
# Tie the prompt to the model; verbose output is switched on.
chain = LLMChain(
    llm=init_model,
    prompt=prompt,
    verbose=True,
)

In [None]:
# Question that the chain substitutes for the template's `{question}` placeholder.
query = "What is LLM ?"

In [None]:
# `Chain.run` is deprecated in favour of `invoke`, which returns a dict;
# the generated text is stored under the chain's output key.
result = chain.invoke({"question": query})[chain.output_key]

In [None]:
# Show the answer produced by the chain.
print(result)

## RAG - Talk to PDF

In [None]:
import os
from langchain_community.llms import CTransformers
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

### Load Data

In [None]:
# Location of the source PDF, given relative to the notebook's working directory.
file_path = "../data/Orca Progressive Learning from Complex.pdf"
# Read the PDF into `docs`, which the splitter consumes in the next step.
data_file = PyPDFLoader(file_path)
docs = data_file.load()

### Split & Chunk Docs

In [None]:
# Split the loaded documents into overlapping chunks for retrieval.
# chunk_size / chunk_overlap are in the splitter's length units
# (presumably characters — TODO confirm).
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_documents(docs)

### Load Embedder

In [None]:
from getpass import getpass

# Read the token without echoing it, so the secret does not end up in the
# notebook output.
HF_TOKEN = getpass("Enter your HuggingFace Token")

In [None]:
# Embedding model card: https://huggingface.co/BAAI/bge-base-en-v1.5
# The embedder is configured with the Hugging Face token entered in the previous cell.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5"
)

### Retrievers

In [None]:
# Number of results requested from each retriever; shared by the vector
# retriever and the BM25 retriever below.
k = 5

#### Vector Retriever

In [None]:
# Index the chunks in a Chroma vector store using the embedder configured above.
vector_store = Chroma.from_documents(chunks, embeddings)
# Expose the store as a retriever that is asked for `k` results per query.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": k})

#### Keyword (BM25) Retriever

In [None]:
# Build a BM25 retriever over the same chunks and request `k` results per query.
# NOTE(review): BM25 ranks by keyword overlap rather than by embeddings, so the
# name `semantic_retriever` is a misnomer — consider renaming it everywhere it is used.
semantic_retriever = BM25Retriever.from_documents(chunks)
semantic_retriever.k = k

#### Ensemble Retriever

In [None]:
# Combine the vector and BM25 retrievers, weighting both equally.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[vector_retriever, semantic_retriever],
)

### Init LLM Model

In [None]:
# Repository and GGUF file of the model used for the RAG chain.
model_id = "TheBloke/zephyr-7B-beta-GGUF"
model_file = "zephyr-7b-beta.Q4_K_S.gguf"
# Architecture name handed to the CTransformers wrapper.
model_type = "mistral"

In [None]:
# Generation and runtime settings for the RAG model.
config = {
    "max_new_tokens": 2048,
    "repetition_penalty": 1.1,
    "temperature": 1,
    "top_k": 50,
    "top_p": 0.9,
    "stream": True,
    "context_length": 4096,
    "gpu_layers": 0,
    # Use half of the available cores. os.cpu_count() may return None and
    # halving a single core would give 0, so always request at least one thread.
    "threads": max(1, (os.cpu_count() or 2) // 2),
}

In [None]:
# Model instance for the RAG chain; generation settings come from `config`.
llm = CTransformers(
    model=model_id,
    model_type=model_type,
    model_file=model_file,
    lib="avx2",
    config=config,
)

### Prompting

In [None]:
# RAG prompt text with two placeholders: `{context}` for the retrieved
# material and `{question}` for the user's query.
template = (
    "You are a helpful AI Assistant that follows instructions extremely well.\n"
    "Use the following context to answer user question.\n"
    "\n"
    "Think step by step before answering the question. \n"
    "You will get a $100 tip if you provide correct answer.\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step and answer it faithfully.\n"
)

In [None]:
# Parser placed as the last step of the RAG chain.
output_parser = StrOutputParser()
# Prompt built from the RAG template and its {context} / {question} placeholders.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
def format_docs(documents):
    """Join the text of the retrieved documents into a single context string.

    Without this step the list of Document objects is stringified as-is when it
    is substituted for `{context}`, which puts object reprs and metadata into
    the prompt instead of just the passage text.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# The incoming question is sent to the retriever (for context) and is also
# passed through unchanged as `question`; the filled prompt then goes to the
# model, and the parser returns the answer as a string.
chain = (
    {"context": ensemble_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)

In [None]:
# Run the full RAG chain on a sample question and show the answer.
print(chain.invoke("What is instruction tuning?"))