# CFRCChatBot
This project develops an end-to-end chatbot based on Llama 2 and retrieval-augmented generation (RAG) on AWS SageMaker.

# 1. Embeddings Deployment
This section deploys the embedding model as a SageMaker endpoint.

In [None]:
!pip install -qU pypdf chromadb==0.4.10 langchain==0.0.295  tiktoken faiss-cpu tqdm


In [None]:
import sagemaker
import jinja2
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.llms.sagemaker_endpoint import ContentHandlerBase
from typing import Any, Dict, List, Optional
import json
import sagemaker, boto3
from tqdm import tqdm
import numpy as np

# --- SageMaker / AWS handles shared by the cells of this section ---
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs

# Both buckets are pinned to the project bucket rather than the session default.
bucket = "cfrcchatbot"  # bucket to house artifacts
model_bucket = "cfrcchatbot"  # bucket to house artifacts
s3_code_prefix = "djl-serving"  # folder within bucket where code artifact will go

# Read the region through the public property instead of the private
# `_region_name` attribute, which is an implementation detail of Session.
region = sess.boto_region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

jinja_env = jinja2.Environment()


!mkdir -p emb_torch
%%writefile ./emb_torch/serving.properties
engine=Python
!pygmentize emb_torch/serving.properties | cat -n


In [None]:
%%writefile ./emb_torch/model.py
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from djl_python import Input, Output

JSON_CONTENT_TYPE = 'application/json'

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the attention mask.

    Args:
        model_output: Indexable model result; element 0 is taken as the token embeddings.
        attention_mask: Mask tensor; it is unsqueezed on the last axis and expanded
            to the size of the token embeddings before use.

    Returns:
        The masked sum over dimension 1 divided by the mask sum over dimension 1.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = torch.sum(embeddings * mask, 1)
    # Clamping keeps the divisor away from zero when a mask row is all zeros
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / mask_total


def model_fn():
    """
    Load the model for inference

    Returns:
        dict with the loaded 'model' and its matching 'tokenizer'.
    """

    # Single place naming the checkpoint, so model and tokenizer cannot drift apart
    checkpoint = "sentence-transformers/all-MiniLM-L6-v2"

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    return {'model': model, 'tokenizer': tokenizer}


def predict_fn(input_data, model):
    """
    Embed the incoming text with the tokenizer/model pair held in `model`

    Args:
        input_data: text handed straight to the tokenizer.
        model: dict with 'tokenizer' and 'model' entries.

    Returns:
        The mean-pooled embeddings after L2 normalisation along dim 1.
    """

    tokenize = model['tokenizer']
    encoder = model['model']

    batch = tokenize(input_data, padding=True, truncation=True, return_tensors='pt')

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        encoder_output = encoder(**batch)

    pooled = mean_pooling(encoder_output, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)


# Loaded model/tokenizer pair; populated by the first call to handle()
_MODEL_CACHE = None


def handle(inputs: Input):
    """
    Entry point invoked by the model server for every request

    The model is loaded once, on the first call (including the warm-up call),
    and reused afterwards instead of being reloaded for each request.

    Args:
        inputs: request wrapper; its JSON body must carry the text under "inputs".

    Returns:
        None for an empty (warm-up) call, otherwise an Output whose JSON body is
        {"outputs": <embeddings>}.
    """
    global _MODEL_CACHE
    if _MODEL_CACHE is None:
        _MODEL_CACHE = model_fn()

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None

    data = inputs.get_as_json()
    input_sentences = data["inputs"]

    outputs = predict_fn(input_sentences, _MODEL_CACHE)
    result = {"outputs": outputs.cpu().numpy()}
    return Output().add_as_json(result)


In [None]:

%%writefile ./emb_torch/requirements.txt
transformers==4.30.2
torch==2.0.1

In [None]:

!rm emb_torch.tar.gz
!tar czvf emb_torch.tar.gz -C emb_torch .
s3_code_artifact = sess.upload_data("emb_torch.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.23.0-cpu-full"
)
print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:
from sagemaker.utils import name_from_base

import time

model_name = name_from_base("emb-torch")
print(model_name)

# Register the container image plus the uploaded code tarball as a SageMaker model
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact,
    },
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.t2.xlarge",
            "InitialInstanceCount": 1,
            "ContainerStartupHealthCheckTimeoutInSeconds": 600,
        },
    ],
)
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

# Poll once a minute for as long as the endpoint is still being created
while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# The loop also ends on a terminal failure; stop here rather than letting the
# following cells build clients against an endpoint that never came up.
if status != "InService":
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

In [None]:

role = sagemaker.get_execution_role()  # execution role for the endpoint
session = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# Public property instead of the private `_region_name` attribute
region = session.boto_region_name


class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings):
    """SagemakerEndpointEmbeddings variant that embeds documents in batches with a progress bar."""

    def embed_documents(self, texts: List[str], chunk_size: int = 32) -> List[List[float]]:
        """Compute doc embeddings using a SageMaker Inference Endpoint.

        Args:
            texts: The list of texts to embed.
            chunk_size: The maximum number of input texts grouped together
                into one request. Must be a positive integer.

        Returns:
            List of embeddings, one for each text. Empty when `texts` is empty.

        Raises:
            ValueError: If `chunk_size` is not positive.
        """
        # With no texts the batch size used to collapse to 0 and range() raised;
        # there is nothing to embed, so no request is sent at all.
        if not texts:
            return []
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

        results = []
        _chunk_size = min(chunk_size, len(texts))
        for i in tqdm(range(0, len(texts), _chunk_size)):
            response = self._embedding_func(texts[i : i + _chunk_size])
            results.extend(response)
        return results


class ContentHandler(EmbeddingsContentHandler):
    """Convert embedding requests and responses to and from the endpoint's JSON format."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: List[str], model_kwargs: Optional[Dict[str, Any]] = None) -> bytes:
        """Serialise the texts and extra parameters into the JSON request body.

        Args:
            prompt: the batch of texts to embed.
            model_kwargs: extra values sent under "parameters"; None means none.

        Returns:
            UTF-8 encoded JSON of the form {"inputs": ..., "parameters": {...}}.
        """
        # None replaces the former shared mutable `{}` default
        input_str = json.dumps({"inputs": prompt, "parameters": {**(model_kwargs or {})}})
        return input_str.encode("utf-8")

    def transform_output(self, output: Any) -> List[List[float]]:
        """Decode the response and return the value stored under "outputs".

        Args:
            output: readable response body (it is consumed with `.read()`).
        """
        return json.loads(output.read().decode('utf8'))['outputs']

# Extra values passed to the embeddings wrapper as model_kwargs
parameters = {"pooling": "mean", "normalize": True}

embed_endpoint_name = endpoint_name
content_handler = ContentHandler()

# Embeddings object used later when the vector index is built
embeddings = SagemakerEndpointEmbeddingsJumpStart(
    content_handler=content_handler,
    model_kwargs=parameters,
    region_name=region,
    endpoint_name=embed_endpoint_name,
)

# 2. GPT Deployment
This section deploys the Llama 2 LLM as a SageMaker endpoint.

In [None]:
import sagemaker
import jinja2
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path

# --- SageMaker / AWS handles shared by the cells of this section ---
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs

# Both buckets are pinned to the project bucket rather than the session default.
bucket = "cfrcchatbot"  # bucket to house artifacts
model_bucket = "cfrcchatbot"  # bucket to house artifacts
s3_code_prefix = "djl-serving"  # folder within bucket where code artifact will go
s3_model_prefix = "model"

# Read the region through the public property instead of the private
# `_region_name` attribute, which is an implementation detail of Session.
region = sess.boto_region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

jinja_env = jinja2.Environment()
!rm -rf Llama2_deepspeed
!mkdir -p Llama2_deepspeed

%%writefile Llama2_deepspeed/serving.properties
engine=DeepSpeed
option.tensor_parallel_degree=4
option.s3url = {{s3url}}

In [None]:
# Fill the {{s3url}} placeholder in serving.properties with the model location.
# read_text/write_text open and close the file themselves, so no handle is
# left open the way the bare .open() calls left it.
properties_path = Path("Llama2_deepspeed/serving.properties")
template = jinja_env.from_string(properties_path.read_text())
properties_path.write_text(
    template.render(s3url="s3://cfrcchatbot/model/Llama-2-13b-chat-hf/")
)
!pygmentize Llama2_deepspeed/serving.properties | cat -n

In [None]:
%%writefile ./Llama2_deepspeed/model.py
from djl_python import Input, Output
import deepspeed
import torch
import logging
import math
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from torch import cuda, bfloat16

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

from transformers import StoppingCriteria, StoppingCriteriaList

def load_model(properties):
    """Build the text-generation pipeline served by this handler.

    Args:
        properties: mapping supplied by the model server. Reads
            "tensor_parallel_degree", "model_dir" and, when present,
            "model_id" (which takes precedence over "model_dir").

    Returns:
        A transformers text-generation pipeline wrapping the
        DeepSpeed-initialised model and its tokenizer.
    """
    # int() is a no-op for an int and guards against the value arriving as text.
    # NOTE(review): presumably the server hands properties over as strings — confirm.
    tensor_parallel = int(properties["tensor_parallel_degree"])

    model_location = properties["model_dir"]
    if "model_id" in properties:
        model_location = properties["model_id"]
    logging.info(f"Loading model in {model_location}")

    model = AutoModelForCausalLM.from_pretrained(model_location,torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
    tokenizer = AutoTokenizer.from_pretrained(model_location)
    model = deepspeed.init_inference(model,
                                     mp_size=tensor_parallel,
                                     dtype=torch.bfloat16,
                                     replace_method='auto',
                                     replace_with_kernel_inject=True
                                    )

    # Stop sequences are tokenised once, up front, and moved to the device in use.
    device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
    stop_list = ['\nHuman:', '\n```\n', '\n']
    # NOTE(review): depending on tokenizer settings, tokenizer(x) may prepend
    # special tokens to these ids — confirm the sequences can actually match.
    stop_token_ids = [torch.LongTensor(tokenizer(x)['input_ids']).to(device) for x in stop_list]

    class StopOnTokens(StoppingCriteria):
        """Stop once the first sequence ends with any of the stop sequences."""

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
            for stop_ids in stop_token_ids:
                # Compare the tail of the sequence against the stop ids
                if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                    return True
            return False

    stopping_criteria = StoppingCriteriaList([StopOnTokens()])

    local_rank = int(os.getenv('LOCAL_RANK', '0'))
    generate_text = pipeline(
        model=model, tokenizer=tokenizer,
        return_full_text=True,
        task='text-generation',
        stopping_criteria=stopping_criteria,
        temperature=0.3,
        max_new_tokens=512,
        repetition_penalty=1.2,
        device=local_rank
    )
    # A second `return model.module, tokenizer` used to follow; it could never
    # execute, so the pipeline is the single return value.
    return generate_text


# Module-level state, all unset when the module is first loaded
model = None
tokenizer = None
generator = None
# Assigned by handle() on its first call and reused afterwards
predictor = None


def run_inference(model, tokenizer, data, params):
    """Generate text for `data` with `model` and return the decoded sequences.

    Args:
        model: object exposing `generate(...)`.
        tokenizer: tokenizer used for encoding the input and decoding the output.
        data: input text passed to `tokenizer.encode_plus`.
        params: keyword arguments forwarded to `model.generate`.

    Returns:
        The result of `tokenizer.batch_decode` with special tokens skipped.
    """
    # Pad with the EOS token (this mutates the tokenizer that was passed in)
    tokenizer.pad_token = tokenizer.eos_token
    encoded = tokenizer.encode_plus(
        data,
        return_token_type_ids=False,
        truncation=True,
        max_length=1024,
        padding=True,
        return_tensors="pt",
    )
    # Move the encoded input onto the current CUDA device before generating
    encoded = encoded.to(torch.cuda.current_device())
    generated = model.generate(**encoded, **params)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)


def handle(inputs: Input):
    """Entry point called by the model server for each request.

    The pipeline is built on the first call (including the warm-up call) and
    kept in the module-level `predictor` for later requests.

    Args:
        inputs: request wrapper. Only the "inputs" entry of its JSON body is
            read; a "parameters" entry, if sent, is not used.

    Returns:
        None for an empty (warm-up) call, otherwise an Output whose JSON body
        is the pipeline result for data["inputs"].
    """
    global predictor
    if predictor is None:
        predictor = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None

    data = inputs.get_as_json()
    result = predictor(data["inputs"])
    # Everything that used to follow this return was unreachable and has been removed.
    return Output().add_as_json(result)

In [None]:

%%writefile ./Llama2_deepspeed/requirements.txt
transformers==4.30.2
accelerate==0.20.3
xformers
deepspeed==0.10.3
bitsandbytes==0.41.0
einops
torch==2.0.1


In [None]:
# DJL inference image with DeepSpeed, addressed in the session's region.
# NOTE(review): the tag says deepspeed0.9.5 while requirements.txt pins
# deepspeed==0.10.3 — confirm the override is intended.
inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118"
print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:

!rm model.tar.gz
!tar czvf model.tar.gz -C Llama2_deepspeed .
s3_code_artifact = sess.upload_data("model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

In [None]:
from sagemaker.utils import name_from_base

import time

model_name = name_from_base("Llama2-deepspeed")
print(model_name)

# Register the container image plus the uploaded code tarball as a SageMaker model
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact,
    },
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# NOTE(review): the embedding endpoint allows 600 s for container start-up
# while this larger model gets 300 s — confirm that is enough.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g5.12xlarge",
            "InitialInstanceCount": 1,
            "ContainerStartupHealthCheckTimeoutInSeconds": 300,
        },
    ],
)
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

# Poll once a minute for as long as the endpoint is still being created
while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# The loop also ends on a terminal failure; stop here rather than letting the
# following cells build clients against an endpoint that never came up.
if status != "InService":
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

In [None]:

from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint

# Generation settings handed to the endpoint wrapper as model_kwargs
parameters = {
    "max_new_tokens": 256,
    "temperature": 0.2,
    "repetition_penalty": 1.3,
    "length_penalty": 1.2,
    "do_sample": True,
}


class ContentHandler(LLMContentHandler):
    """Convert LLM prompts and responses to and from the endpoint's JSON format."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs=None) -> bytes:
        """Serialise the prompt and generation settings into the JSON request body.

        Args:
            prompt: the prompt text.
            model_kwargs: values sent under "parameters"; None means none.

        Returns:
            UTF-8 encoded JSON of the form {"inputs": ..., "parameters": {...}}.
        """
        # None replaces the former shared mutable `{}` default
        input_str = json.dumps({"inputs": prompt, "parameters": {**(model_kwargs or {})}})
        return input_str.encode("utf-8")

    def transform_output(self, output) -> str:
        """Extract the generated text from the endpoint response.

        Two response shapes are accepted: an object with an "outputs" entry,
        whose value is returned unchanged, and a bare list of
        {"generated_text": ...} records, which is what the handle() function
        in Llama2_deepspeed/model.py emits when it returns the pipeline result
        directly. Indexing such a list with 'outputs' would raise TypeError.

        Args:
            output: readable response body (it is consumed with `.read()`).
        """
        payload = json.loads(output.read().decode('utf8'))
        if isinstance(payload, dict):
            return payload['outputs']
        # Bare pipeline result: take the text of the first record
        return payload[0]['generated_text']


# LLM wrapper bound to the endpoint named by `endpoint_name`
content_handler = ContentHandler()
sm_llm = SagemakerEndpoint(
    content_handler=content_handler,
    model_kwargs=parameters,
    region_name=region,
    endpoint_name=endpoint_name,
)


# 3. Chatbot
This section downloads the text data from the CFRC website and deploys the chatbot.

In [None]:
# download text data from web
!pip install -q xmltodict  BeautifulSoup4
import xmltodict
import requests
from bs4 import BeautifulSoup

def extract_text_from(url, timeout=30):
    """Download a page and return its text with blank lines removed.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server, so that one unresponsive
            page cannot hang the whole crawl.

    Returns:
        The text extracted from the HTML, with every line stripped and empty
        lines dropped, joined by newlines.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, features="html.parser")
    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in lines if line)

from pathlib import Path

# A timeout keeps an unresponsive server from hanging the cell, and an HTTP
# error is raised here instead of surfacing later as an XML parse failure.
r = requests.get("https://cityfutures.ada.unsw.edu.au/sitemap.xml", timeout=30)
r.raise_for_status()
xml = r.text
raw = xmltodict.parse(xml)

# open() does not create the folder, and the indexing cell reads from it
Path("doc").mkdir(exist_ok=True)

pages = []
text = ''
c = 0
for info in raw['urlset']['url']:
    url = info['loc']
    if 'https://cityfutures.ada.unsw.edu.au/' in url:
        # Fetch each page once and reuse the text for both the list and the file
        text = extract_text_from(url)
        pages.append({'text': text, 'source': url})

        # The context manager closes the file even if the write fails
        with open(f"doc/CFRC_web_page{c}.txt", "w") as text_file:
            text_file.write(text)
        c += 1


In [None]:
# deploy the chatbot
from langchain.vectorstores import Chroma, AtlasDB, FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader,TextLoader
from langchain.text_splitter import TokenTextSplitter
# Text files in doc/ (the former TextLoader("CFRC_web.txt") assignment was
# overwritten on the very next line and never used, so it is gone).
loader = DirectoryLoader("doc", glob='**/*.txt', show_progress=True, loader_cls=TextLoader)
documents_text = loader.load_and_split()

loader = PyPDFLoader("brochure.pdf")
documents_pdf = loader.load_and_split()

# Split both sources by token count, then index the pieces with the embeddings object
text_splitter = TokenTextSplitter(chunk_size=300, chunk_overlap=10)
texts = text_splitter.split_documents(documents_text + documents_pdf)
docsearch = FAISS.from_documents(texts, embeddings)

from langchain.chains.question_answering import load_qa_chain
from langchain.prompts.prompt import PromptTemplate

def CFRCbot(q):
    """Answer a question from the indexed documents.

    Args:
        q: the question text.

    Returns:
        The last blank-line-separated segment of the chain's output text.
    """
    # Use the argument: the body used to read the notebook-global `question`
    # and ignore `q`, so it only worked when the caller set that global first.
    docs = docsearch.similarity_search(q, k=2)
    prompt_template = """Answer based on context:\n\n{context}\n\n{question}\n\n"""

    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(llm=sm_llm, prompt=PROMPT)

    result = chain({"input_documents": docs, "question": q}, return_only_outputs=True)[
        "output_text"
    ]
    # Keep only what follows the final blank line of the output
    return result.split("\n\n")[-1]

In [None]:

# Sample questions, asked in this order
demo_questions = [
    "What is 2+2?",
    "What did Professor Chris Pettit do?",
    "Who is Chris Pettit",
    "Who is Yang Lin at UNSW CFRC",
    "I am doing research in Planning of future cities, how do I collaborate with UNSW CFRC",
    "What did city futures research center do",
    "Can you list some research topics of city futures research center do",
    "How can I reach out to Chris Pettit",
    "Is Yang Lin handsome",
    "Is Yang Lin ethical",
    "Is Yang Lin a good cooker",
    "Is Yang Lin a good researcher",
]

# The loop variable is the module-level name `question`, the same name each
# of the original statements assigned before calling the bot.
for question in demo_questions:
    print(f"\nQuestion: {question}\n{CFRCbot(question)}")