In [12]:
import os
from typing import List
import numpy as np
import redis
import google.generativeai as genai
from dotenv import load_dotenv
from tqdm import tqdm

from redis.commands.search.field import (
    TagField,
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

In [13]:
# Load environment variables via python-dotenv; the cells below read
# REDIS_HOST, REDIS_PASSWORD and GEMINI_API_KEY from os.environ.
load_dotenv()

True

In [14]:
# Redis connection shared by every cell below. Host and password are required
# environment variables; the port may be overridden with REDIS_PORT and falls
# back to the previously hard-coded 12305 when that variable is not set.
client = redis.Redis(
    host=os.environ['REDIS_HOST'],
    port=int(os.environ.get('REDIS_PORT', 12305)),
    password=os.environ['REDIS_PASSWORD'],
)

In [15]:
# Connectivity check against the Redis server configured above.
client.ping()

True

## Loading data from GitHub

In [5]:
from sourcegraph import Sourcegraph

In [21]:
# Authenticate the Gemini SDK with the key taken from the environment.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Sampling / output settings applied to every generation request made by `model`.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Model used to turn a code snippet into a prose description. The system
# instruction is split over several literals purely for readability; the
# concatenated text is unchanged.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
    system_instruction=(
        "You are optimized to generate accurate descriptions for given Python codes. "
        "When the user inputs the code, you must return the description according to its goal and functionality.  "
        "You are not allowed to generate additional details. "
        "The user expects at least 5 sentence-long descriptions."
    ),
)

In [22]:
def fetch_data(url):
    """Parse a repository and enrich each code node with a description and its dependencies.

    Args:
        url: Repository URL handed to ``Sourcegraph``.

    Returns:
        A dict built from ``Sourcegraph(url).node_data`` in which every value
        additionally carries:
          * ``'description'`` - text generated by the Gemini ``model`` from the
            node's ``'definition'``;
          * ``'uses'`` - the node's dependencies joined with ``", "``.

    Note:
        ``dict(...)`` is a shallow copy, so the per-node values are the same
        objects held by ``node_data`` and are updated in place.
    """
    def get_description(code):
        # Send the code as the actual user message. Previously the code was
        # only placed in the chat history and the literal template placeholder
        # "INSERT_INPUT_HERE" was sent as the prompt.
        chat_session = model.start_chat()
        response = chat_session.send_message(f"Code: {code}")
        return response.text

    github_repository = Sourcegraph(url)
    github_repository.run()
    data = dict(github_repository.node_data)
    # Only existing values are updated (no keys added/removed), so iterating
    # over data.items() while writing is safe.
    for key, value in tqdm(data.items()):
        value['description'] = get_description(value['definition'])
        value['uses'] = ", ".join(github_repository.get_dependencies(key))
    return data

In [8]:
# Build the enriched node dictionary for the sinlib repository; each node
# triggers one description request to the model (see the progress bar below).
data = fetch_data("https://github.com/Ransaka/sinlib.git")

I0000 00:00:1723182195.471295 1337262 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
100%|██████████| 15/15 [00:24<00:00,  1.61s/it]


In [9]:
# Inspect a single entry to see the fields available per node.
data['Romanizer']

{'type': 'class',
 'name': 'Romanizer',
 'definition': "class Romanizer:\n\n    def __init__(self, char_mapper_fp: str, tokenizer_path: str):\n        if char_mapper_fp is None:\n            char_mapper_fp = CHAR_MAPPER_FP\n        if tokenizer_path is None:\n            tokenizer_path = DEFAULT_VOCAB_MAP_FP\n        self.char_mapper = load_char_mapper(char_mapper_fp)\n        self.tokenizer = Tokenizer(max_length=None)\n        self.tokenizer.load_from_pretrained(tokenizer_path)\n\n    def __call__(self, text):\n        return self.__romanize(text)\n\n    def __romanize(self, text: str):\n        text = remove_non_printable(text)\n        chars = np.array(list(text))\n        sinhala_mask = [True if ch in ALL_SINHALA_CHARACTERS + list(NUBERS_AND_PUNKTS) + [' '] else False for ch in chars]\n        sinhala_text = ''.join(chars[sinhala_mask]).strip()\n        encodings = self.tokenizer(sinhala_text, truncate_and_pad=False)\n        decoded_sinhala_chars = [self.tokenizer.token_id_to_tok

In [23]:
# Print the name and description of every available model that supports the
# 'embedContent' generation method.
embedding_capable_models = [
    candidate
    for candidate in genai.list_models()
    if 'embedContent' in candidate.supported_generation_methods
]
for model_details in embedding_capable_models:
    print(model_details.name,model_details.description)

I0000 00:00:1723202734.831485 1337262 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


models/embedding-001 Obtain a distributed representation of a text.
models/text-embedding-004 Obtain a distributed representation of a text.


In [24]:
def get_embeddings(content: List):
    """Embed ``content`` with the ``models/text-embedding-004`` model.

    Args:
        content: The texts to embed, forwarded unchanged to
            ``genai.embed_content``.

    Returns:
        The ``'embedding'`` entry of the ``genai.embed_content`` response.
    """
    response = genai.embed_content(
        content=content,
        model='models/text-embedding-004',
    )
    return response['embedding']

In [16]:
# Name of the search index created by ingest_data() and queried by the search cells.
INDEX_NAME = "idx:codes_vss"

In [17]:
def ingest_data(data):
    """Store code metadata in Redis as JSON documents and index them for vector search.

    Each value of ``data`` is written under the key ``code:NNN`` (1-based,
    zero-padded to three digits, in iteration order) together with an
    ``embeddings`` field computed from its ``definition`` and ``description``.
    A search index named ``INDEX_NAME`` covering the ``code:`` prefix is
    created if it does not exist yet, and a short indexing summary is printed.

    Args:
        data: Mapping of node name to a JSON-serialisable metadata dict that
            contains at least ``'definition'`` and ``'description'``.

    Raises:
        ValueError: If ``data`` is empty (there would be nothing to embed and
            no vector dimension to build the index with).
    """
    if not data:
        raise ValueError("ingest_data() received no code metadata to ingest")

    records = list(data.values())
    redis_keys = [f"code:{i:03}" for i in range(1, len(records) + 1)]

    # Build the embedding inputs straight from the in-memory records instead of
    # reading them back with KEYS + JSON.MGET: KEYS walks the entire keyspace
    # and would also pick up stale "code:*" documents left by earlier runs.
    embed_inputs = [
        f"""{record['definition']}\n\n{record['description']}"""
        for record in records
    ]
    embeddings = get_embeddings(embed_inputs)
    VECTOR_DIMENSION = len(embeddings[0])

    # Write each document once, already carrying its vector, so a failure in
    # the embedding call never leaves documents without an "embeddings" field.
    pipeline = client.pipeline()
    for redis_key, record, embedding in zip(redis_keys, records, embeddings):
        pipeline.json().set(redis_key, "$", {**record, "embeddings": embedding})
    pipeline.execute()

    schema = (
        TextField("$.name", no_stem=True, as_name="name"),
        TagField("$.type", as_name="type"),
        TextField("$.definition", no_stem=True, as_name="definition"),
        TextField("$.file_name", no_stem=True, as_name="file_name"),
        TextField("$.description", no_stem=True, as_name="description"),
        TextField("$.uses", no_stem=True, as_name="uses"),
        VectorField(
            "$.embeddings",
            "HNSW",
            {
                "TYPE": "FLOAT32",
                "DIM": VECTOR_DIMENSION,
                "DISTANCE_METRIC": "COSINE",
            },
            as_name="vector",
        ),
    )
    definition = IndexDefinition(prefix=["code:"], index_type=IndexType.JSON)

    index = client.ft(INDEX_NAME)
    try:
        # info() raises ResponseError when the index is unknown.
        index.info()
    except redis.ResponseError:
        index.create_index(fields=schema, definition=definition)
    # NOTE(review): an already existing index is reused as-is; if the embedding
    # dimension ever changes it has to be dropped manually first.

    # NOTE(review): the counts are read right after index creation and may lag
    # behind if the server is still indexing - confirm if exact numbers matter.
    info = index.info()
    num_docs = info["num_docs"]
    indexing_failures = info["hash_indexing_failures"]
    print(f"{num_docs} documents indexed with {indexing_failures} failures")

In [18]:
# Spot-check one stored JSON document by its key.
client.json().get("code:010")

{'type': 'function',
 'name': 'load_default_vocab_map',
 'definition': "def load_default_vocab_map():\n    with open(Path(DEFAULT_VOCAB_MAP_FP) / 'vocab.json', 'r') as f:\n        vocab_map = json.load(f)\n    return vocab_map",
 'file_name': 'preprocessing.py',
 'docstring': '',
 'description': 'The `load_default_vocab_map()` function is responsible for loading a vocabulary map from a JSON file. It first opens the file located at `DEFAULT_VOCAB_MAP_FP/vocab.json` in read mode. Then, it uses the `json.load()` function to parse the JSON data from the file and store it in the `vocab_map` variable. Finally, the function returns the `vocab_map`, which is a dictionary containing the vocabulary mapping. This function is likely used in a natural language processing or machine learning application where a predefined vocabulary is required for processing text data. By loading the vocabulary map from a file, the application can ensure consistency and avoid hardcoding the vocabulary within the co

In [36]:
# Free-text question(s) to look up in the code index.
queries = ["Tokenizer takes lot time to complete train"]

In [37]:
# Embed the queries with the same helper (and therefore the same model) used in ingest_data().
encoded_queries = get_embeddings(queries)

In [38]:
# Range query: matches documents whose vector lies within $range of
# $query_vector, exposes the distance as `score` and sorts by it.
range_query_text = "@vector:[VECTOR_RANGE $range $query_vector]=>{$YIELD_DISTANCE_AS: score}"
range_return_fields = ('score', 'id', 'name', 'definition', 'file_name', 'type', 'uses')
vector_search_query_with_range = (
    Query(range_query_text)
    .sort_by('score')
    .return_fields(*range_return_fields)
    .dialect(2)
)

In [39]:
# KNN query: the 3 nearest neighbours of $query_vector over all documents,
# with the distance exposed as `vector_score` and used as the sort key.
knn_query_text = '*=>[KNN 3 @vector $query_vector AS vector_score]'
knn_return_fields = ('vector_score', 'id', 'name', 'definition', 'file_name', 'type', 'uses')
vector_search_query = (
    Query(knn_query_text)
    .sort_by('vector_score')
    .return_fields(*knn_return_fields)
    .dialect(2)
)

In [40]:
# Run the KNN query for the first encoded query. The vector is sent as raw
# float32 bytes, matching the FLOAT32 type declared for the index.
knn_params = {
    'query_vector': np.array(encoded_queries[0], dtype=np.float32).tobytes(),
}
knn_results = client.ft(INDEX_NAME).search(vector_search_query, knn_params)
knn_results.docs

 Document {'id': 'code:004', 'payload': None, 'vector_score': '0.467481791973', 'name': 'load_tokenizer', 'definition': 'def load_tokenizer():\n    tokenizer = Tokenizer(max_length=MAX_LENGTH)\n    tokenizer.load_from_pretrained(DUMMY_FILE_NAME)\n    return tokenizer', 'file_name': 'dataset_utils.py', 'type': 'function', 'uses': 'Tokenizer, process_text, load_default_vocab_map'},
 Document {'id': 'code:006', 'payload': None, 'vector_score': '0.49214309454', 'name': 'load_transliterator_model', 'definition': 'def load_transliterator_model():\n    tokenizer = load_tokenizer()\n    input_size = len(tokenizer)\n    output_size = len(tokenizer)\n    hidden_size = HIDDEN_SIZE\n    filepath = Path(MODELS_PATH) / CHECKPOINT_NAME\n    device = detect_device()\n    model = BiLSTMTranslator(input_size, hidden_size, output_size).to(device)\n    checkpoint = torch.load(filepath, map_location=device)\n    model.load_state_dict(checkpoint)\n    return model', 'file_name': 'model_utils.py', 'type': 'f

In [52]:
# Run the range query for the first encoded query, keeping every document
# whose distance to the query vector is at most 1.0.
range_params = {
    'query_vector': np.array(encoded_queries[0], dtype=np.float32).tobytes(),
    'range': 1.0,
}
range_results = client.ft(INDEX_NAME).search(vector_search_query_with_range, range_params)
range_results.docs

 Document {'id': 'code:004', 'payload': None, 'score': '0.467481791973', 'name': 'load_tokenizer', 'definition': 'def load_tokenizer():\n    tokenizer = Tokenizer(max_length=MAX_LENGTH)\n    tokenizer.load_from_pretrained(DUMMY_FILE_NAME)\n    return tokenizer', 'file_name': 'dataset_utils.py', 'type': 'function', 'uses': 'Tokenizer, process_text, load_default_vocab_map'},
 Document {'id': 'code:006', 'payload': None, 'score': '0.49214309454', 'name': 'load_transliterator_model', 'definition': 'def load_transliterator_model():\n    tokenizer = load_tokenizer()\n    input_size = len(tokenizer)\n    output_size = len(tokenizer)\n    hidden_size = HIDDEN_SIZE\n    filepath = Path(MODELS_PATH) / CHECKPOINT_NAME\n    device = detect_device()\n    model = BiLSTMTranslator(input_size, hidden_size, output_size).to(device)\n    checkpoint = torch.load(filepath, map_location=device)\n    model.load_state_dict(checkpoint)\n    return model', 'file_name': 'model_utils.py', 'type': 'function', 'use