In [1]:
!pip install -q datasets groq "semantic-router[local]" pinecone-client
#Install required packages for the notebook.

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.6.1 requires cubinlinker, which is not installed.
cudf 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.6.1 requires ptxcompiler, which is not installed.
cuml 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
aiobotocore 2.13.1 requires botocore<1.34.132,>=1.34.70, but you have botocore 1.34.158 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 16.1.0 which is incompatible.

In [2]:
from datasets import load_dataset
# Load the first 10,000 rows of the train split of the
# "jamescalam/ai-arxiv2-semantic-chunks" dataset and display a summary.
data = load_dataset("jamescalam/ai-arxiv2-semantic-chunks", split="train[:10000]")
data

Downloading data:   0%|          | 0.00/253M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/209760 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'content', 'prechunk_id', 'postchunk_id', 'arxiv_id', 'references'],
    num_rows: 10000
})

In [3]:
# Inspect the first record to see which fields each row carries.
data[0]

{'id': '2401.04088#0',
 'title': 'Mixtral of Experts',
 'content': '4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, LÃ©lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, ThÃ©ophile Gervet, Thibaut Lavril, Thomas Wang, TimothÃ©e Lacroix, William El Sayed Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Even though each token only sees two experts

In [4]:
# Reshape every row into an 'id' plus a nested 'metadata' dict that holds the
# title and the content.
def _to_record(row):
    """Return the 'id' and a 'metadata' dict (title, content) for one row."""
    return {
        "id": row["id"],
        "metadata": {"title": row["title"], "content": row["content"]},
    }

data = data.map(_to_record)

# Remove the original columns that are no longer needed
_unused_columns = [
    "title",
    "content",
    "prechunk_id",
    "postchunk_id",
    "arxiv_id",
    "references",
]
data = data.remove_columns(_unused_columns)

# Show the resulting dataset
data

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'metadata'],
    num_rows: 10000
})

In [5]:
# Import the HuggingFaceEncoder class from the semantic_router.encoders module
from semantic_router.encoders import HuggingFaceEncoder

# Initialize the HuggingFaceEncoder with the specified model name.
# NOTE(review): `encoder` is rebound to an `E5Encoder()` instance in a later
# cell, so this instance is only in use until that cell runs.
encoder = HuggingFaceEncoder(name="dwzhu/e5-base-4k")

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/225M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [6]:
# Display the encoder's `device` attribute.
encoder.device

'cuda'

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

class E5Encoder:
    """Text encoder that wraps a Hugging Face tokenizer/model pair.

    Calling an instance prefixes each text, tokenizes it (truncating at 4096
    tokens), runs the model, mean-pools the last hidden states over the
    non-padding tokens and L2-normalises the result.
    """

    def __init__(self, model_name="dwzhu/e5-base-4k"):
        """Load the tokenizer and model for `model_name` onto the best available device.

        Args:
            model_name: identifier passed to `AutoTokenizer.from_pretrained`
                and `AutoModel.from_pretrained`.
        """
        # Choose the device first so the weight dtype can be matched to it.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Half precision only makes sense on GPU; many CPU kernels are slow or
        # unavailable in float16, so fall back to float32 there.
        model_dtype = torch.float16 if self.device.type == "cuda" else torch.float32
        self.model = AutoModel.from_pretrained(model_name,
                                               torch_dtype=model_dtype,
                                               low_cpu_mem_usage=True)  # Efficient loading
        self.model.to(self.device)

    def average_pool(self, last_hidden_states, attention_mask):
        """Mean of the hidden states over positions where `attention_mask` is non-zero.

        Args:
            last_hidden_states: tensor indexed as (batch, token, feature).
            attention_mask: tensor indexed as (batch, token); zero marks padding.

        Returns:
            Tensor indexed as (batch, feature).
        """
        # Zero out padded positions so they do not contribute to the sum.
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def get_position_ids(self, input_ids, max_original_positions=512, encode_max_length=4096):
        """Build position ids with the same shape as `input_ids`.

        When the sequence length is at most `max_original_positions`, positions
        are multiplied by `encode_max_length // max_original_positions` (at
        least 1) - presumably to spread short inputs over the extended position
        range; confirm against the model card. Longer sequences use 0, 1, 2, ...

        Returns:
            A long tensor on `self.device`, expanded to the shape of `input_ids`.
        """
        seq_len = input_ids.size(1)
        factor = max(encode_max_length // max_original_positions, 1)

        position_ids = torch.arange(seq_len, dtype=torch.long, device=self.device)
        if seq_len <= max_original_positions:
            position_ids = position_ids * factor

        # One row of positions, broadcast across the batch dimension.
        return position_ids.unsqueeze(0).expand_as(input_ids)

    def __call__(self, texts, batch_size=16, prefix="query: "):
        """Embed `texts` and return the embeddings as a 2-D numpy array.

        Args:
            texts: a string or an iterable of strings. A bare string is treated
                as a single text instead of being iterated per character.
            batch_size: number of texts sent through the model at once.
            prefix: string prepended to every text. Defaults to "query: ".
                NOTE(review): E5-style models conventionally use "passage: "
                for documents - confirm against the model card before changing
                how documents are indexed.

        Returns:
            Array with one L2-normalised row per input text. For empty input,
            an array of shape (0, hidden_size) is returned.
        """
        if isinstance(texts, str):
            texts = [texts]
        texts = list(texts)

        # torch.cat() raises on an empty list, so handle "nothing to embed" explicitly.
        if not texts:
            hidden_size = self.model.config.hidden_size
            return torch.empty((0, hidden_size), dtype=torch.float32).numpy()

        all_embeddings = []
        use_amp = self.device.type == "cuda"

        # Autocast covers the forward pass as well as pooling/normalisation and
        # is only enabled on GPU; gradients are never needed for inference.
        with torch.no_grad(), torch.autocast(device_type=self.device.type, enabled=use_amp):
            for start in range(0, len(texts), batch_size):
                batch_texts = [prefix + text for text in texts[start:start + batch_size]]

                # Tokenize the batch of texts
                batch_dict = self.tokenizer(batch_texts, max_length=4096, padding=True, truncation=True, return_tensors='pt')
                batch_dict['position_ids'] = self.get_position_ids(batch_dict['input_ids'])

                # Move every tensor of the batch to the model's device
                batch_dict = {k: v.to(self.device) for k, v in batch_dict.items()}

                outputs = self.model(**batch_dict)

                # Pool over tokens, then normalise each embedding to unit length
                embeddings = self.average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
                embeddings = F.normalize(embeddings, p=2, dim=1)

                # Keep results on the CPU so GPU memory is released per batch
                all_embeddings.append(embeddings.cpu())

        # Concatenate all embeddings and return as a numpy array
        return torch.cat(all_embeddings, dim=0).numpy()

# Initialize the encoder. This rebinds `encoder`, replacing the
# HuggingFaceEncoder instance created in an earlier cell.
encoder = E5Encoder()

Create embeddings for a sample sentence to check that the encoder works:

In [8]:
# Embed a single test sentence and display the resulting array.
embeds = encoder(["this is a test"])
embeds

array([[-0.01055724, -0.05916505, -0.00923487, -0.00987848,  0.04704633,
        -0.02660513,  0.0119974 ,  0.02583785, -0.01812095, -0.00403736,
         0.01254273,  0.04673668, -0.04430786,  0.01941498, -0.04972987,
         0.04060301,  0.06545679, -0.05947537,  0.07019977, -0.03831637,
        -0.05704334, -0.03428105,  0.01516932,  0.00629467,  0.02391649,
         0.00840249, -0.00198298,  0.02226302, -0.05795388, -0.00991976,
         0.01475909,  0.02386614,  0.06164724, -0.02544833, -0.02752871,
         0.04772342, -0.0555387 , -0.00140684, -0.05019806, -0.05783498,
         0.00524427, -0.0008233 , -0.01428638, -0.02522869, -0.0517312 ,
        -0.04685078, -0.04709958,  0.03122895, -0.03741217, -0.04093179,
        -0.03790484,  0.0456459 ,  0.03549443,  0.00430107, -0.03165791,
        -0.0003196 , -0.00999418, -0.00646695, -0.02008371, -0.03663   ,
         0.05299647,  0.02984488,  0.03329681, -0.00053476,  0.05796737,
         0.05524962,  0.0061562 , -0.01345492, -0.0

In [9]:
# Length of one embedding vector; used later as the index dimension.
dims = len(embeds[0])
dims

768

In [10]:
import os
import getpass
from pinecone import Pinecone

In [11]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("PINECONE_API_KEY")
# The Pinecone API key is read from the Kaggle user secrets above.
# Alternative kept for reference: environment variable or interactive prompt.
# Note that it binds `api`, while the client below reads `api_key`.
#api = os.getenv("PINECONE_API_KEY") or getpass.getpass("Enter your pinecone API key")
# Configure the Pinecone client with the key.
pc = Pinecone(api_key=api_key)

In [12]:
from pinecone import ServerlessSpec

# Serverless deployment settings passed to index creation.
spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [13]:
import time

# Name of the Pinecone index this notebook works with
index_name = "groq-llama-3-rag"

# Collect the names of the indexes that already exist
existing_indexers = []
for index_info in pc.list_indexes():
    existing_indexers.append(index_info["name"])

# Create the index only when it is missing, then block until it reports ready
if not (index_name in existing_indexers):
    pc.create_index(index_name, dimension=dims, metric='cosine', spec=spec)

    # Poll once per second until the index status says it is ready
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Connect to the index and pause for one second before reading its statistics
index = pc.Index(index_name)
time.sleep(1)

# Show the index statistics
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

In [14]:
# import torch

# # Clear cache for all available GPUs
# def clear_gpu_cache():
#     for i in range(torch.cuda.device_count()):
#         torch.cuda.set_device(i)
#         torch.cuda.empty_cache()
#         print(f"Cleared cache for GPU {i}")

# clear_gpu_cache()

In [15]:
from tqdm.auto import tqdm

# Number of records embedded and upserted per iteration
batch_size = 384

# Walk over the dataset one slice at a time
for i in tqdm(range(0, len(data), batch_size)):
    # Last slice may be shorter than batch_size
    i_end = min(i + batch_size, len(data))

    # Slice of the dataset handled in this iteration
    batch = data[i:i_end]

    # Text to embed: "<title>: <content>" for every record of the slice
    chunks = [f'{x["title"]}: {x["content"]}' for x in batch["metadata"]]

    # Embed the chunks
    embeds = encoder(chunks)

    # Sanity check: one embedding per record
    assert len(embeds) == (i_end - i)

    # Pair every id with its embedding and metadata
    to_upsert = [
        (record_id, vector, meta)
        for record_id, vector, meta in zip(batch["id"], embeds, batch["metadata"])
    ]

    # Write the slice to the Pinecone index
    index.upsert(vectors=to_upsert)

  0%|          | 0/27 [00:00<?, ?it/s]

In [16]:
# Peek at the metadata of the last batch processed by the upsert loop. Only a
# few records are shown so the cell does not print the whole batch.
batch['metadata'][:3]

[{'content': '4 . So the answer is 9Ï 4 . Are there variables in the solution? the form of "1. variable is defined as...". If so, please list the definition of variable in The underlined parts are the type of question, the question itself and the steps in its solution, respectively. The output from the LLM is: Yes. There are variables in the solution. x + yi, where xxx and yyy are real numbers. x + yi 1. zzz is defined as a complex number of the form x + yi',
  'title': 'SelfCheck: Using LLMs to Zero-Shot Check Their Own Step-by-Step Reasoning'},
 {'content': 'The bold part is then saved to form a part of the input in the regeneration stage. Target extraction To get a brief and clear target of the current step, the input to the LLM is: The following is a part of the solution to the problem: Let S be the set of complex numbers z such that the real part of 1 6 . This set forms a curve. Find the area of the 12 region inside the curve. (Step 0) Let z = x + yi be a complex number, where x a

In [17]:
import numpy as np

def get_docs(query: str, top_k: int) -> list[str]:
    """Return the content of the `top_k` index matches for `query`.

    Args:
        query: text to search for.
        top_k: number of matches to request from the index.

    Returns:
        The 'content' metadata field of each match, in the order the index
        returned them.
    """
    # The encoder works on batches, so wrap the query in a list.
    xq = encoder([query])

    # Take the single row and convert it to a flat list of floats. Passing the
    # whole batch would hand the index a nested [[...]] list instead of one vector.
    query_vector = np.asarray(xq[0]).tolist()

    # Search the Pinecone index with the encoded query
    res = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    # Extract the document content from the search results
    return [match['metadata']['content'] for match in res['matches']]

In [18]:
# Example question used to exercise retrieval ("cab" was a typo for "can").
query = 'can you tell me about LLama?'
docs = get_docs(query, top_k=5)
print("\n---\n".join(docs))

LLaMA LLaMA (Touvron et al., 2023a) is an auto-regressive, decoder-only large language model based on the Transformer architecture. The model is characterized by its billions of param- eters, pre-trained on a vast amount of web data. Being uni-directional means that the modelâ s at- tention mechanism only considers the preceding elements in the input sequence when making pre- dictions. Specifically, given an input sequence x = [t1, t2, ..., tnâ 1], the model computes the prob- ability of the next token tn based solely on the preceding tokens. The prediction process can be mathematically represented as P (tn|t1, ..., tnâ 1), where P denotes the probability and tn represents the next element in the sequence.
---
Stanford alpaca: An instruction-following llama model, 2023. Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, TimothÂ´ee Lacroix, Baptiste Rozi`ere, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Ar- mand Joulin, Edouard Grave, and 

In [19]:
from groq import Groq
# Read the Groq API key from the Kaggle user secrets and build the Groq client.
GROQ_API_KEY = user_secrets.get_secret("GROQ_API_KEY")
groq_client = Groq(api_key=GROQ_API_KEY)

In [20]:
# Build the generate function
def generate(query: str, docs: list[str], model: str = "llama3-groq-70b-8192-tool-use-preview") -> str:
    """Answer `query` with the Groq chat API, using `docs` as context.

    Args:
        query: the user's question; sent as the user message.
        docs: retrieved document texts; appended to the system message,
            separated by "\\n---\\n".
        model: Groq model id passed to the chat completion call.

    Returns:
        The content of the first choice of the chat completion.
    """
    instructions = (
        "You are a knowledgeable and informative AI assistant, providing comprehensive and engaging explanations of AI concepts. Use a clear and professional tone. Focus on general AI knowledge, but feel free to delve deeper into specific AI fields when relevant."
    )
    context_guidance = (
        "CONTEXT:The provided documents contain relevant information. Prioritize recent documents and search for keywords related to the query. If unsure, provide general information and suggest relevant resources. Cite specific sources from the context when possible. Use examples or analogies to illustrate complex topics."
    )

    # Join the documents on their own. Writing the separator as one more
    # adjacent string literal would merge it with the instructions *before*
    # `.join` runs, turning the whole prompt into the separator between
    # documents and dropping it from the start of the message.
    context = "\n---\n".join(docs)
    system_message = f"{instructions}\n\n{context_guidance}\n\n{context}"

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": query}
    ]

    # Generate the response
    chat_response = groq_client.chat.completions.create(
        model=model,
        messages=messages
    )

    return chat_response.choices[0].message.content

In [21]:
# Generate an answer for the example query from the retrieved documents and print it.
out = generate(query=query, docs=docs)
print(out)

The LLaMA model is a type of large language model developed by Meta, which is based on the transformer architecture. It is a decoder-only model, meaning it is trained to predict the next token in a sequence given the context of the previous tokens. This model is known for its ability to understand and generate human-like language, making it useful for a variety of natural language processing tasks.

In the context of the provided documents, LLaMA-2-7B-chat and LLaMA-2-13B-chat are two versions of the LLaMA model. These models are designed to be more conversational and are fine-tuned on user-shared data. They are part of the second version of the LLaMA model released by Meta, which aims to improve the model's conversational capabilities.

The LLaMA model is often compared to other large language models, such as GLM-130B, in terms of performance and cost. The performance of these models can be evaluated on various benchmarks, including the HellaSwag, MMLU, and TruthfulQA datasets. These 

In [22]:
import requests
import json

# List the models available to this Groq account.
# The Groq key is used directly and the JSON payload gets its own name, so this
# cell no longer rebinds `api_key` (the Pinecone key) or `data` (the dataset).
url = "https://api.groq.com/openai/v1/models"

headers = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}

# Fail fast on a hung connection or an HTTP error instead of parsing an error body
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
models_payload = response.json()

# Access the data list
model_list = models_payload['data']

# Iterate over the model list and print each model's information.
# Fields other than the id are read with .get() so a missing field prints
# None instead of raising KeyError.
for model in model_list:
    print(f"Model ID: {model['id']}")
    print(f"Object: {model.get('object')}")
    print(f"Created: {model.get('created')}")
    print(f"Owned by: {model.get('owned_by')}")
    print(f"Active: {model.get('active')}")
    print(f"Context window: {model.get('context_window')}")
    print(f"Public apps: {model.get('public_apps')}")
    print("-" * 20)  # Separator


Model ID: gemma2-9b-it
Object: model
Created: 1693721698
Owned by: Google
Active: True
Context window: 8192
Public apps: None
--------------------
Model ID: gemma-7b-it
Object: model
Created: 1693721698
Owned by: Google
Active: True
Context window: 8192
Public apps: None
--------------------
Model ID: llama-3.1-70b-versatile
Object: model
Created: 1693721698
Owned by: Meta
Active: True
Context window: 131072
Public apps: None
--------------------
Model ID: llama-3.1-8b-instant
Object: model
Created: 1693721698
Owned by: Meta
Active: True
Context window: 131072
Public apps: None
--------------------
Model ID: llama3-70b-8192
Object: model
Created: 1693721698
Owned by: Meta
Active: True
Context window: 8192
Public apps: None
--------------------
Model ID: llama3-8b-8192
Object: model
Created: 1693721698
Owned by: Meta
Active: True
Context window: 8192
Public apps: None
--------------------
Model ID: llama3-groq-70b-8192-tool-use-preview
Object: model
Created: 1693721698
Owned by: Groq
Ac