# Retrieval-Augmented Generation (RAG) question answering with FAISS and Phi-3, served over FastAPI

In [None]:
!pip install datasets tqdm pandas matplotlib langchain sentence_transformers faiss-gpu langchain-community torch accelerate

In [None]:

import pandas as pd
from tqdm.notebook import tqdm
from datasets import Dataset
import matplotlib.pyplot as plt
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import torch

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
from datasets import Dataset
import matplotlib.pyplot as plt
from langchain.docstore.document import Document as LangchainDocument

# Set display option for pandas
pd.set_option("display.max_colwidth", None)

# Location of the raw knowledge-base text and the delimiter between its sections.
KNOWLEDGE_FILE_PATH = "/content/only_answers_formatted.txt"
# NOTE(review): delimiter taken from the previously commented-out split call;
# confirm it matches the actual layout of the file.
SECTION_SEPARATOR = "\n\n\n"

# Read the whole file as a single string.
with open(KNOWLEDGE_FILE_PATH, "r") as fp1:
    raw_text = fp1.read()

# Split the text into sections. Iterating over the raw string directly would
# yield one "document" per character, so the split must happen before the
# documents are built. Blank sections are dropped.
s = [section for section in raw_text.split(SECTION_SEPARATOR) if section.strip()]
if not s:
    raise ValueError(f"No non-empty sections found in {KNOWLEDGE_FILE_PATH}")

# Print the first section and the number of sections
print(s[0])
print(len(s))

# Create a RAW_KNOWLEDGE_BASE using LangchainDocument (one document per section)
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=section)
    for section in tqdm(s)
]


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Separators tried in order, from the coarsest (markdown headings, code fences,
# horizontal rules) down to single characters. Several entries are regular
# expressions, so any splitter using this list must set is_separator_regex=True;
# otherwise the patterns are escaped and matched as literal text.
MARKDOWN_SEPARATORS = [
    "\n#{1,6}",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n__+\n",
    "\n\n",
    "\n",
    " ",
    ""
]

# Exploratory character-based split: chunk_size and chunk_overlap are counted
# in characters here, not tokens.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    add_start_index=True,
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
    is_separator_regex=True,
)

docs_processed = text_splitter.split_documents(RAW_KNOWLEDGE_BASE)

# Measure every chunk with the embedding model's tokenizer to see how the
# character-based chunks translate into token counts.
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]

# Series.hist() returns the Axes object, so the title and labels are set on it
# directly. The title is set once; a second plt.title() call would override it.
fig = pd.Series(lengths).hist()
fig.set_title("Histogram of Document Lengths")
fig.set_xlabel("Chunk length (tokens)")
fig.set_ylabel("Number of chunks")
plt.show()


In [None]:
from typing import Optional, List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

EMBEDDING_MODEL_NAME = "thenlper/gte-small"


def split_documents(
        chunk_size: int,
        knowledge_base: List[LangchainDocument],
        tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """Split documents into token-sized chunks and drop duplicate chunks.

    Args:
        chunk_size: Maximum chunk length, measured with the tokenizer named by
            ``tokenizer_name``.
        knowledge_base: Documents to split.
        tokenizer_name: Hugging Face tokenizer identifier used to measure chunk
            length. ``None`` falls back to ``EMBEDDING_MODEL_NAME``.

    Returns:
        The chunks in their original order, keeping only the first occurrence
        of each distinct ``page_content``.
    """
    # The signature allows None, but from_pretrained needs a real identifier.
    if tokenizer_name is None:
        tokenizer_name = EMBEDDING_MODEL_NAME

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # overlap is 10% of the chunk size
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # MARKDOWN_SEPARATORS holds regex patterns; without this flag they are
        # escaped and matched literally, so heading/rule separators never match.
        is_separator_regex=True,
    )
    chunks = text_splitter.split_documents(list(knowledge_base))

    # De-duplicate on text content while preserving first-seen order.
    seen_texts = set()
    unique_chunks = []
    for chunk in chunks:
        if chunk.page_content in seen_texts:
            continue
        seen_texts.add(chunk.page_content)
        unique_chunks.append(chunk)
    return unique_chunks

# Re-chunk the knowledge base with a 512-token budget, then report how many
# unique chunks were produced and preview the first three.
docs_processed = split_documents(
    512,
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)
chunk_count = len(docs_processed)
print(chunk_count)
print(docs_processed[:3])


In [None]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import torch

# Report GPU availability, then use the result instead of assuming CUDA so the
# embedding step also runs (more slowly) on a CPU-only runtime.
cuda_available = torch.cuda.is_available()
print(cuda_available)
embedding_device = "cuda" if cuda_available else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": embedding_device},
    # Request normalized embeddings; the index below is configured for cosine
    # distance.
    encode_kwargs={"normalize_embeddings": True},
)

# Embed every chunk and build the FAISS index used for retrieval.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Fix the torch RNG seed so repeated runs start from the same state.
torch.random.manual_seed(0)

# Single source of truth for the generator checkpoint (model and tokenizer must match).
LLM_MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"

model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    device_map="cuda",
    torch_dtype="auto",
    # NOTE(review): trust_remote_code=True executes Python code shipped in the
    # model repository; only keep it for repositories you trust.
    trust_remote_code=True,
)
# This rebinds `tokenizer` to the generator's tokenizer; the prompt template
# built later must use this one.
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Greedy decoding: with do_sample=False the temperature setting is unused and
# only triggers a generation-config warning, so it is omitted.
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": False,
}


In [None]:
# System instructions for the generator.
SYSTEM_PROMPT = """Using the information contained in the context,
Give a comprehensive answer to the question.
Respond only to the question asked , response should be concise and relevant to the question.
provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer"""

# User turn; {context} and {question} are filled in later with str.format().
USER_PROMPT = """Context:
{context}
---
Now here is the Question you need to answer.
Question:{question}
        """

prompt_chat = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]

# Render the chat messages to a plain string (no tokenization) that ends with
# the generation prompt, keeping the placeholders intact for later formatting.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_chat,
    tokenize=False,
    add_generation_prompt=True,
)
print(RAG_PROMPT_TEMPLATE)

In [None]:
# Example question used to smoke-test the retrieval + generation chain.
u_query = "i have muscle pain"
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=u_query, k=3)

# Put every retrieved chunk in the context, numbered, so the model can cite
# "the number of the source document" as the system prompt requests.
# Using only retrieved_docs[0] would discard two of the three results.
context = "\n".join(
    f"Document {doc_number}:\n{doc.page_content}"
    for doc_number, doc in enumerate(retrieved_docs, start=1)
)
final_prompt = RAG_PROMPT_TEMPLATE.format(
    question=u_query, context=context
)

output = pipe(final_prompt, **generation_args)
print("YOUR QUESTION:\n",u_query,"\n")
print("MICROSOFT 128K ANSWER: \n",output[0]['generated_text'])

In [None]:
pip install fastapi uvicorn pyngrok

In [None]:
# (redacted) An ngrok authtoken was previously committed on this line. Revoke it in the ngrok dashboard and never hardcode credentials in the notebook.

In [None]:
# Never hardcode the ngrok authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# Any token that was previously committed here should be revoked.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pyngrok import ngrok
import uvicorn
import nest_asyncio

# Create the FastAPI application that exposes the RAG pipeline.
app = FastAPI()

# Development-only CORS policy: every origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive; restrict the origins before exposing this beyond development.
cors_options = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **cors_options)

# Request payload type for the /query endpoint.
class QueryModel(BaseModel):
    # The user's question text; query_api reads it and uses it for retrieval and prompting.
    query: str


def _format_context(docs) -> str:
    """Render retrieved documents as a numbered context block for the prompt."""
    return "\n".join(
        f"Document {doc_number}:\n{doc.page_content}"
        for doc_number, doc in enumerate(docs, start=1)
    )


@app.post("/query")
async def query_api(request: QueryModel):
    """Answer a question using retrieved context and the text-generation pipeline.

    Retrieves the three most similar chunks from the vector store, places all
    of them (numbered) in the prompt context, and returns the generated text.

    Returns:
        A dict with the original ``question`` and the generated ``answer``.
        ``answer`` is an empty string when retrieval returns no documents.
    """
    u_query = request.query
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=u_query, k=3)

    # Without context the system prompt instructs the model not to answer, so
    # skip generation instead of failing on an empty result list.
    if not retrieved_docs:
        return {"question": u_query, "answer": ""}

    # Use every retrieved chunk, numbered, so the model can cite source documents.
    final_prompt = RAG_PROMPT_TEMPLATE.format(
        question=u_query, context=_format_context(retrieved_docs)
    )
    # NOTE(review): pipe() is a synchronous call inside an async handler, so the
    # event loop is occupied until generation finishes.
    output = pipe(final_prompt, **generation_args)
    return {"question": u_query, "answer": output[0]["generated_text"]}

In [None]:
from pyngrok import ngrok
import uvicorn
import nest_asyncio

# Necessary to run uvicorn in Jupyter notebook (the kernel already has a running event loop)
nest_asyncio.apply()

# Port shared by the ngrok tunnel and the uvicorn server; the two must match.
SERVER_PORT = 8000

# Start ngrok tunnel; ngrok.connect returns a tunnel object whose public_url
# attribute holds the externally reachable address.
tunnel = ngrok.connect(SERVER_PORT)
public_url = tunnel.public_url
print("Public URL:", public_url)

# Start FastAPI app. uvicorn.run blocks until the server is stopped; the tunnel
# is always closed afterwards so it does not stay open after shutdown.
try:
    uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT)
finally:
    ngrok.disconnect(public_url)

INFO:     Started server process [244]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Public URL: NgrokTunnel: "https://8244-34-125-95-91.ngrok-free.app" -> "http://localhost:8000"
INFO:     152.58.196.151:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "OPTIONS /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "OPTIONS /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "OPTIONS /query HTTP/1.1" 200 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "OPTIONS /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "OPTIONS /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
INFO:     2409:40f0:1148:66a7:7dbc:d78f:1032:8356:0 - "POST /query HTTP/1.1" 200 OK
