In [48]:
!python -m pip install redis tabulate haystack-ai google-ai-haystack sourcegraph==0.0.6 google-generativeai

Collecting protobuf (from google-generativeai)
  Using cached protobuf-4.25.4-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)
Using cached protobuf-4.25.4-cp37-abi3-macosx_10_9_universal2.whl (394 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.27.3
    Uninstalling protobuf-5.27.3:
      Successfully uninstalled protobuf-5.27.3
Successfully installed protobuf-4.25.4


In [2]:
%%capture
!python -m pip install --upgrade --force-reinstall protobuf

In [28]:
import os
from dotenv import load_dotenv
import numpy as np
import redis
from typing import List
from redis.commands.search.query import Query
import google.generativeai as genai

In [29]:
# Load settings before any cell reads os.environ. Presumably the .env file supplies
# REDIS_HOST, REDIS_PASSWORD and GEMINI_API_KEY, which later cells look up — confirm.
load_dotenv()

True

In [30]:
# Shared Redis connection used by every search in this notebook.
# Host and password are required environment variables; the port can be overridden
# with REDIS_PORT and otherwise falls back to the previously hard-coded 12305.
client = redis.Redis(
  host=os.environ['REDIS_HOST'],
  port=int(os.environ.get('REDIS_PORT', 12305)),
  password=os.environ['REDIS_PASSWORD'])

In [31]:
# Round-trip to the server before running any index queries (recorded output: True).
client.ping()

True

In [32]:
# Register the Gemini API key (read from the environment) with the genai client
# before get_embeddings is called.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

def get_embeddings(content: List):
    """
    Embed `content` with the 'models/text-embedding-004' Gemini model.

    Args:
        content: The text to embed. Annotated as a list, although draft_prompt
            in this notebook passes a single string.

    Returns:
        The value stored under the 'embedding' key of the embed_content response.
    """
    response = genai.embed_content(
        model='models/text-embedding-004',
        content=content,
    )
    return response['embedding']

In [33]:
# Sample user issue used to exercise retrieval and the pipeline in the cells below.
query = "Training new tokenizer takes a lot time to complete. Also memory consumption seems pretty high"

In [34]:
def draft_prompt(query: str, chat_history: str, top_k: int = 2) -> str:
    """
    Build the context section of the LLM prompt for a user question.

    The question is embedded and used for a KNN vector search against the
    Redis index. The names of the hits, and the names listed in their
    comma-separated `uses` field, are then looked up by name in the same index
    and rendered as code snippets.

    Args:
        query (str): The user question to embed and search with.
        chat_history (str): Conversation so far. It is interpolated into the
            prompt verbatim, so any object with a useful string form works
            (cells in this notebook pass a dict).
        top_k (int): Number of nearest neighbours to retrieve. Defaults to 2,
            the value that used to be hard-coded.

    Returns:
        str: The question, the chat history, the main code snippets and the
        supporting snippets, formatted as one prompt string.

    Raises:
        ValueError: If `top_k` is smaller than 1.
    """
    INDEX_NAME = "idx:codes_vss"
    DISPLAY_FIELDS = ('id', 'name', 'definition', 'file_name', 'type')

    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    index = client.ft(INDEX_NAME)

    vector_search_query = (
        Query(f'(*)=>[KNN {int(top_k)} @vector $query_vector AS vector_score]')
        .sort_by('vector_score')
        .return_fields('vector_score', *DISPLAY_FIELDS, 'uses')
        # Explicit paging so a top_k above the client's default page size is not cut off.
        .paging(0, int(top_k))
        .dialect(2)
    )

    encoded_query = get_embeddings(query)
    vector_params = {
        "query_vector": np.array(encoded_query, dtype=np.float32).tobytes()
    }

    result_docs = index.search(vector_search_query, vector_params).docs

    related_items: List[str] = []
    dependencies: List[str] = []
    for doc in result_docs:
        related_items.append(doc.name)
        # A document without a `uses` field has no such attribute; treat it as "no dependencies".
        uses = getattr(doc, 'uses', None)
        if uses:
            dependencies.extend(use for use in uses.split(", ") if use)

    # De-duplicate while keeping first-seen order so the generated prompt is deterministic.
    related_items = list(dict.fromkeys(related_items))
    related_names = set(related_items)
    dependencies = [
        name for name in dict.fromkeys(dependencies) if name not in related_names
    ]

    def fetch_by_name(item_list):
        # An empty list would build the query "@name:()"; skip the round-trip instead.
        if not item_list:
            return []
        # NOTE(review): names are interpolated unescaped and the query relies on the
        # client's default result paging — confirm both are acceptable for this index.
        name_query = Query(f"@name:({' | '.join(item_list)})").return_fields(
            *DISPLAY_FIELDS
        )
        return index.search(name_query).docs

    related_docs = fetch_by_name(related_items)
    dependency_docs = fetch_by_name(dependencies)

    def format_doc(doc):
        return (
            f"{'*' * 28} CODE SNIPPET {doc.id} {'*' * 28}\n"
            f"* Name: {doc.name}\n"
            f"* File: {doc.file_name}\n"
            f"* {doc.type.capitalize()} definition:\n"
            f"```python\n{doc.definition}\n```\n"
        )

    formatted_results_main = "\n".join(format_doc(doc) for doc in related_docs)
    formatted_results_support = "\n".join(format_doc(doc) for doc in dependency_docs)

    return (
        f"User Question: {query}\n\n"
        f"Current Chat History: \n{chat_history}\n\n"
        f"USE BELOW CODES TO ANSWER USER QUESTIONS.\n"
        f"{formatted_results_main}\n\n"
        f"SOME SUPPORTING FUNCTIONS AND CLASS YOU MAY WANT.\n"
        f"{formatted_results_support}"
    )

In [35]:
# Exercise retrieval on its own, outside the pipeline. The chat history passed here is
# a dict rather than the str the signature advertises; it is interpolated as-is.
prompt = draft_prompt(query,{"user":"hi","Agent":"hello there"})

I0000 00:00:1723259930.726312 1717903 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [36]:
# print() rather than a bare expression so the prompt's newlines show as line breaks.
print(prompt)

User Question: Training new tokenizer takes a lot time to complete. Also memory consumption seems pretty high

Current Chat History: 
{'user': 'hi', 'Agent': 'hello there'}

USE BELOW CODES TO ANSWER USER QUESTIONS.
**************************** CODE SNIPPET code:004 ****************************
* Name: load_tokenizer
* File: dataset_utils.py
* Function definition:
```python
def load_tokenizer():
    tokenizer = Tokenizer(max_length=MAX_LENGTH)
    tokenizer.load_from_pretrained(DUMMY_FILE_NAME)
    return tokenizer
```

**************************** CODE SNIPPET code:002 ****************************
* Name: Tokenizer
* File: tokenizer.py
* Class definition:
```python
class Tokenizer:

    def __init__(self, max_length: int, unknown_token: str='<unk>', pad_token: str='<pad>'):
        self.unknown_token_id = None
        self.token_id_to_token_map = None
        self.vocab_map = None
        self.unknown_token = unknown_token
        self.pad_token = pad_token
        self.tokenized_char

In [37]:
from haystack import Pipeline
from haystack import component
from haystack.utils import Secret
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
from haystack.components.generators import OpenAIGenerator
from IPython.display import Markdown

In [38]:
@component
class RedisRetreiver:
  """Pipeline component that wraps draft_prompt and exposes its result as `context`."""

  @component.output_types(context=str)
  def run(self, query:str, chat_history:str):
    """
    Build the prompt context for one question.

    Args:
        query: The user question forwarded to draft_prompt.
        chat_history: Conversation so far, forwarded to draft_prompt unchanged.

    Returns:
        A dict with a single "context" key holding the draft_prompt result.
    """
    context = draft_prompt(query, chat_history)
    return {"context": context}


In [39]:
# Generator used by the pipeline: the 'gemini-1.5-pro' model, with the API key taken
# from the GEMINI_API_KEY environment variable.
llm = GoogleAIGeminiGenerator(api_key=Secret.from_env_var("GEMINI_API_KEY"), model='gemini-1.5-pro')
# Alternative backend, currently disabled:
# llm = OpenAIGenerator()

In [40]:
# System instructions sent to the LLM. {{context}} is the only placeholder and is
# filled with the prompt section produced by the retriever component.
template = """
You are a helpful agent optimized to resolve GitHub issues for your organization's libraries. Users will ask questions when they encounter problems with the code repository.
You have access to all the necessary code for addressing these issues.
First, you should understand the user's question and identify the relevant code blocks.
Then, craft a precise and targeted response that allows the user to find an exact solution to their problem.
You must provide code snippets rather than just opinions.
You should always assume the user has installed this Python package on their system and raised the question while using the library.

In addition to the above tasks, you are free to:
 * Greet the user.
 * [ONLY IF THE QUESTION IS INSUFFICIENT] Request additional clarity.
 * Politely decline irrelevant queries.
 * Inform the user if their query cannot be processed or accomplished.

Under no circumstances should you:
 * Ask or recommend that the user switch to a different library, or provide code snippets from other similar libraries.
 * Provide inaccurate explanations.
 * Provide suggestions without code examples.

{{context}}
"""

# Builder that renders `template`; its {{context}} placeholder is fed by the
# retriever's "context" output once the pipeline is wired up.
prompt_builder = PromptBuilder(template=template)

In [41]:
# Assemble the pipeline: retriever -> prompt_builder -> llm.
pipeline = Pipeline()
# Components must be registered before they can be connected.
pipeline.add_component(name="retriever", instance=RedisRetreiver())
pipeline.add_component("prompt_builder", prompt_builder)
pipeline.add_component("llm", llm)
# The retriever's "context" output feeds the prompt builder; the receiving input is
# not named here, so it is left to the pipeline to match it up.
pipeline.connect("retriever.context", "prompt_builder")
# The rendered prompt goes to the generator (sockets again resolved by the pipeline).
pipeline.connect("prompt_builder", "llm")
# Write a diagram of the pipeline graph to pipeline.png in the working directory.
pipeline.draw(path='pipeline.png')

In [43]:
# Display the question currently held in `query`.
query

'Training new tokenizer takes a lot time to complete. Also memory consumption seems pretty high'

In [44]:
# End-to-end run: the retriever receives the question and an empty chat history
# (an empty dict), and the first LLM reply is rendered as Markdown.
query = 'Training new tokenizer takes a lot time to complete. Also memory consumption seems pretty high'
response = pipeline.run({"retriever": {"query": query, "chat_history": {}}})
Markdown(response["llm"]["replies"][0])

Hello! I see that you are facing issues with training a new tokenizer that takes a lot of time to complete and results in high memory consumption. To address this, you can optimize the training process by specifying a smaller corpus for training.

You can try the following code snippet in your training process to reduce the memory consumption:

```python
from sinlib import Tokenizer

# Load the tokenizer
tokenizer = Tokenizer(max_length=MAX_LENGTH)

# Train the tokenizer on a subset of the corpus
corpus_subset = [...]  # Provide a smaller subset of your corpus
tokenizer.train(corpus_subset)

# Encode/decode using the trained tokenizer
encoded_text = tokenizer("Your input text here")
decoded_text = tokenizer.decode(encoded_text)
```

By training the tokenizer on a smaller corpus, you can reduce the time taken for training and lower the memory consumption. Let me know if you need further assistance!

In [45]:
# Second scenario: a how-to question. Same call shape as the previous run,
# with an empty chat history; only the first reply is displayed.
question = "Give me example of training Tokenizer for new dataset. My dataset contains text lines. It's name is corpus.txt"
response = pipeline.run({"retriever": {"query": question, "chat_history": {}}})
Markdown(response["llm"]["replies"][0])

Hello! To train a Tokenizer for a new dataset, you need to follow these steps:

1. Load the Tokenizer:
```python
from sinlib import Tokenizer

tokenizer = Tokenizer()
```

2. Read and preprocess the text from your dataset (corpus.txt):
```python
with open('corpus.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

corpus = [line.strip() for line in lines]
```

3. Train the Tokenizer on this corpus:
```python
tokenizer.train(corpus)
```

4. Save the trained Tokenizer for future use:
```python
save_path = "path_to_save"
tokenizer.save_tokenizer(save_path)
```

By following these steps, your Tokenizer will be trained on the new dataset from corpus.txt. Let me know if you need any more assistance!

In [46]:
# Third scenario: a terse performance complaint. include_outputs_from asks the pipeline
# to also return the retriever's output in `response`; this cell only displays the
# first LLM reply.
question = "Training new tokenizer take forever to finish."
response = pipeline.run({"retriever": {"query": question, "chat_history": {}}}, include_outputs_from=['retriever'])
Markdown(response["llm"]["replies"][0])

Hello! It seems like your training process for the tokenizer is taking a lot of time. One potential reason for this could be the character level tokenization process that might get slow with large amounts of text data. 

To optimize this process, you can consider the following modifications:

1. **Reduce Text Size**: If possible, try training the tokenizer on a smaller subset of the text data to see if the training time improves.

2. **Multithreading**: The tokenizer training process is already set up to run using multiple threads for faster processing. You can further optimize this by adjusting the number of threads based on your system's capabilities.

3. **Batch Processing**: Instead of processing the entire text corpus at once, consider breaking it down into smaller batches for training.

Here's a sample code snippet on how you can adjust the number of threads for multithreading in the `Tokenizer` class:

```python
import concurrent.futures

...

def train(self, text_list) -> None:
    """
    Train the tokenizer on a list of text strings.

    Parameters
    ----------
    text_list : list of str
        List of text strings to be used for training the tokenizer.

    Examples
    --------
    >>> from sinlib import Tokenizer
    >>> corpus = [...]
    >>> tokenizer = Tokenizer()
    >>> tokenizer.train(corpus)
    """
    max_threads = 4  # Adjust the number of threads as per your system
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        results = list(executor.map(self.__process_text, text_list))
        self.tokenized_chars = [char for sublist in results for char in sublist]
    self.unique_chars = set(self.tokenized_chars)
    self.vocab_map = dict(zip(self.unique_chars, range(len(self.unique_chars))))
    self.vocab_map[self.unknown_token] = len(self.vocab_map)
    self.vocab_map[self.pad_token] = len(self.vocab_map)
    self.unknown_token_id = self.vocab_map[self.unknown_token]
    self.pad_token_id = self.vocab_map[self.pad_token]
    self.token_id_to_token_map = {value: key for key, value in self.vocab_map.items()}
```

Try adjusting the `max_threads` value based on your system's capacity. This should help optimize the training process and potentially reduce the time it takes to train the new tokenizer. Let me know if you need further assistance!

In [47]:
# Fourth scenario: a question about training a model for a new language. The
# retriever's output is again requested alongside the reply but not displayed here.
question = "How to train new transilator model for new language?"
response = pipeline.run({"retriever": {"query": question, "chat_history": {}}}, include_outputs_from=['retriever'])
Markdown(response["llm"]["replies"][0])

Hello! To train a new transliterator model for a new language, you can follow these steps:

Step 1: Create a new Tokenizer for the new language
```python
new_language_tokenizer = Tokenizer(max_length=MAX_LENGTH)
new_language_tokenizer.train(training_corpus)
```

Step 2: Update the load_transliterator_model function to load the new model
```python
def load_transliterator_model():
    tokenizer = new_language_tokenizer
    input_size = len(tokenizer)
    output_size = len(tokenizer)
    hidden_size = HIDDEN_SIZE
    filepath = Path(MODELS_PATH) / NEW_CHECKPOINT_NAME
    device = detect_device()
    model = BiLSTMTranslator(input_size, hidden_size, output_size).to(device)
    checkpoint = torch.load(filepath, map_location=device)
    model.load_state_dict(checkpoint)
    return model
```

Step 3: Train the new transliterator model for the new language
```python
transliterator = Transliterator()
new_language_corpus = get_training_corpus_for_new_language()
new_language_tokenizer.train(new_language_corpus)
new_language_transliterator_model = train_new_model(new_language_tokenizer)  # Function not provided, should be implemented based on your training methodology
```

By following these steps, you should be able to train a new transliterator model for a new language. Let me know if you need further assistance!