Install packages

In [1]:
!pip install -q langchain langchain_community langchain-huggingface transformers accelerate bitsandbytes langchain_unstructured faiss-cpu unstructured unstructured[pdf]

Load libraries

In [2]:
from langchain.chains import LLMChain, SequentialChain
from langchain.memory import ConversationBufferMemory
from langchain_huggingface  import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from langchain.schema.output_parser import StrOutputParser
from langchain_unstructured import UnstructuredLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.chains import ConversationalRetrievalChain


from transformers import AutoModel, pipeline
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

import json
import textwrap

Download the model

In [3]:
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-chat-hf",
    device_map='auto',
    torch_dtype=torch.float16,
    quantization_config=quantization_config
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Define transformers pipeline

In [4]:
pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.float16,
                device_map="auto",
                max_new_tokens = 512,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                return_full_text=False
                )

Define LLM

In [5]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0.7,'max_length': 256, 'top_k' :50})

Create vector store

In [6]:


# Papers
docs_path = [
    "/content/documents/16458859.pdf",
    "/content/documents/234242.pdf",
    "/content/documents/234256.pdf",
    "/content/documents/234789.pdf",
    "/content/documents/23789243.pdf",
    "/content/documents/43521232.pdf",
    "/content/documents/9827347.pdf"
]

# Get documents
loader = UnstructuredLoader(docs_path)
documents = loader.load()

# Vectorize documents
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

vector_store = FAISS.from_documents(documents, embeddings)

# Save the vector store to disk
vector_store.save_local("vector_store")

"""
new_vector_store = FAISS.load_local(
    "vector_store", embeddings, allow_dangerous_deserialization=True
)

docs = new_vector_store.similarity_search("qux")

docs[0]
"""




'\nnew_vector_store = FAISS.load_local(\n    "vector_store", embeddings, allow_dangerous_deserialization=True\n)\n\ndocs = new_vector_store.similarity_search("qux")\n\ndocs[0]\n'

Define prompt format

In [7]:
custom_template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Keep the answer concise.

Chat History: {chat_history}

Question: {question}

Context: {context}

Answer:
"""

Define chain

In [8]:
chain = ConversationalRetrievalChain.from_llm(llm, vector_store.as_retriever(), return_source_documents=True, combine_docs_chain_kwargs={"prompt": ChatPromptTemplate.from_template(custom_template)})

Test

In [9]:
chat_history = []

query = "is there a problem with birds?"
result = chain({"question": query, "chat_history": chat_history})

chat_history.append((query, result['answer']))

print(result['answer'])


query = "Which was my first question?"
result = chain({"question": query, "chat_history": chat_history})

chat_history.append((query, result['answer']))

print(result['answer'])

  result = chain({"question": query, "chat_history": chat_history})



There is no problem with birds.
There is no problem with birds.


Run flask

In [10]:
!pip install --upgrade pyngrok flasgger



In [11]:
!pip install flask-ngrok



In [12]:
!ngrok authtoken 'YOUR_NGROK_KEY'

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [13]:
import os
import threading
import time
from flask import Flask, jsonify, request, redirect, session
from pyngrok import ngrok
from flasgger import Swagger

class TokenBucket:
    def __init__(self, capacity, refill_rate):
        self.capacity = capacity
        self.tokens = capacity
        self.refill_rate = refill_rate  # Tokens per second
        self.last_refill = time.time()

    def refill(self):
        now = time.time()
        elapsed = now - self.last_refill
        refill_amount = elapsed * self.refill_rate
        self.tokens = min(self.capacity, self.tokens + refill_amount)
        self.last_refill = now

    def take_token(self):
        self.refill()
        if self.tokens >= 1:
            self.tokens -= 1
            return True
        return False

# Initialize the token bucket
bucket = TokenBucket(capacity=10, refill_rate=3)

app = Flask(__name__)
app.secret_key = 'your_secret_key'  # Add your secret key
port = "5000"

# Initialize Swagger
swagger = Swagger(app)

# Open a ngrok tunnel to the HTTP server
public_url = ngrok.connect(port).public_url
print(f" * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:{port}\"")

app.config["BASE_URL"] = public_url

# Rate limiter middleware
@app.before_request
def rate_limiter():
    if not bucket.take_token():
        return jsonify({"detail": "Rate limit exceeded"}), 429

# Define Flask routes
@app.route("/")
def index():
    # Redirect to the Swagger UI
    return redirect("/apidocs/")

@app.route("/chat", methods=["POST"])
def chat():
    """
    Chat with the bot
    ---
    parameters:
      - name: query
        in: body
        required: true
        schema:
          type: object
          properties:
            query:
              type: string
              description: The question to ask the bot
    responses:
      200:
        description: Answer from the bot
        schema:
          type: object
          properties:
            answer:
              type: string
      400:
        description: Bad request due to missing query parameter
        schema:
          type: object
          properties:
            detail:
              type: string
      429:
        description: Rate limit exceeded
        schema:
          type: object
          properties:
            detail:
              type: string
    """
    data = request.json
    query = data.get('query')

    if not query:
        return jsonify({"detail": "Query parameter is required"}), 400

    # Initialize chat_history in the session if it doesn't exist
    if 'chat_history' not in session:
        session['chat_history'] = []

    # Use the chat history from the session
    chat_history = session['chat_history']

    try:
        result = chain({"question": query, "chat_history": chat_history})  # Ensure `chain` is defined

        # Append the user query and the bot response to the session's chat history
        chat_history.append((query, result['answer']))
        session['chat_history'] = chat_history  # Save back to the session

        return jsonify({"answer": result['answer']})

    except Exception as e:
        return jsonify({"detail": str(e)}), 500  # Handle unexpected errors

# Start the Flask server in a new thread
threading.Thread(target=app.run, kwargs={"use_reloader": False, "port": port}).start()


 * ngrok tunnel "https://2b5d-34-124-243-19.ngrok-free.app" -> "http://127.0.0.1:5000"
