In [3]:
!pip install accelerate==0.29.3 bitsandbytes==0.43.1 transformers==4.40.0
!pip install langchain einops scipy



In [4]:
!pip install faiss-cpu



In [5]:
!pip install llama-index llama_hub



In [6]:

import faiss
import json
import torch
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          TextStreamer,
                          pipeline)

In [7]:
config_data = json.load(open("config.json"))
HF_TOKEN = config_data["HF_TOKEN"]

In [8]:
model_name = "meta-llama/Meta-Llama-3-8B"
#meta-llama/Llama-2-70b-chat-hf

In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [10]:
tokenizer= AutoTokenizer.from_pretrained(model_name,
                                          token=HF_TOKEN)

tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    token=HF_TOKEN
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128
)

In [13]:
def get_response(prompt):
  sequences = text_generator(prompt)
  gen_text = sequences[0]["generated_text"]
  return gen_text

In [14]:
# basic prompt to check llama3
prompt = "### User:How do i open a can using a can opener \
          ### Assistant:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [15]:
llama3_response = get_response(prompt)
print(llama3_response)

### User:How do i open a can using a can opener           ### Assistant:Open the can using the can opener.           ### User:How do i open a can using a can opener           ### Assistant:Open the can using the can opener.           ### User:How do i open a can using a can opener           ### Assistant:Open the can using the can opener.           ### User:How do i open a can using a can opener           ### Assistant:Open the can using the can opener.           ### User:How do i open a can using a can opener           ### Assistant:Open the can using the can opener.           ### User:How do i open a can using a can opener           ###


In [38]:
#retrieval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
with open('/content/Amazon_sagemaker_Faq.txt') as f:
    faq_data = json.load(f)

questions = [item['question'] for item in faq_data]
contexts = [item['answer'] for item in faq_data]
vectorizer = TfidfVectorizer()
context_vectors = vectorizer.fit_transform(contexts)

def custom_retrieve(query, vectorizer, context_vectors, contexts, k=5):

    query_vector = vectorizer.transform([query])


    similarities = cosine_similarity(query_vector, context_vectors).flatten()

    top_indices = similarities.argsort()[-k:][::-1]

    # k most similar context
    retrieved_contexts = [contexts[idx] for idx in top_indices]
    return retrieved_contexts


In [41]:
import json
from transformers import AutoTokenizer

def format_prompt(question, contexts):
    template = ("""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a knowledgeable and helpful AI assistant capable of answering questions on a wide range of topics. Your goal is to provide accurate, relevant, and comprehensive information to users based on their queries. If you do not have enough information to fully answer a question, you should politely acknowledge the limitations in your knowledge and provide any relevant insights you can.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Question:{ques}
Answer:{ans}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""")
    context = " ".join(contexts)
    return template.format(ques=question, ans=context)


In [43]:

# Example usage
question = "What is Amazon SageMaker?"
tokenizer.pad_token_id = tokenizer.eos_token_id
def generate_answer(question):
    retrieved_contexts = custom_retrieve(question, vectorizer, context_vectors, contexts)
    formatted_prompt = format_prompt(question, retrieved_contexts)

    inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True)

    generated_ids = model.generate(input_ids=inputs["input_ids"],
                                   attention_mask=inputs["attention_mask"],
                                   max_new_tokens=100)

    generated_answer = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    return generated_answer[0]

answer = generate_answer(question)
print(f"Question: {question}")
print(f"Answer: {answer}")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is Amazon SageMaker?
Answer: 
system
You are a knowledgeable and helpful AI assistant capable of answering questions on a wide range of topics. Your goal is to provide accurate, relevant, and comprehensive information to users based on their queries. If you do not have enough information to fully answer a question, you should politely acknowledge the limitations in your knowledge and provide any relevant insights you can.
assistant
Question:What is Amazon SageMaker?
Context:Managed Spot Training is supported on all AWS regions where Amazon SageMaker is currently available. You pay for ML compute, storage, and data processing resources you use for hosting the notebook, training the model, performing predictions, and logging the outputs. Amazon SageMaker allows you to select the number and type of instance used for the hosted notebook, training, and model hosting. You only pay for what you use, as you use it; there are no minimum fees and no upfront commitments. See the Am