# Ingesting Data into Qdrant
This notebook showcases how to use the `gef-ml` package to create the initial vector database with documents from `/data`.

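This assumes the data has already been downloaded into `/data/dumps/`. Note that the ingestion cells below read from `../data/to_ingest/`, so the documents to ingest must be available there as well.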

In [None]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Connect to the Qdrant server at the hard-coded local URL below.
qdrant_client = QdrantClient(
    location="http://localhost:6333",
)
# Vector store wrapper over that client, bound to the "temp_collection" collection.
# NOTE(review): a later cell rebinds `vector_store` via
# get_qdrant_vectorstore(collection_name="temp"), so on a top-to-bottom run this
# instance is only consumed by get_pipeline in the next cell.
vector_store = QdrantVectorStore(client=qdrant_client, collection_name="temp_collection")

In [None]:
from gef_ml.ingestion import get_pipeline

# Build the gef_ml ingestion pipeline around the vector store created in the previous cell.
# NOTE(review): `ingest_pipeline` is not referenced again in the cells visible here.
ingest_pipeline = get_pipeline(vector_store)

## Run the pipeline on a single document

In [None]:
from llama_index.readers.file import PDFReader

In [1]:
from gef_ml.ingestion import StreamingIngestion
from gef_ml.utils import get_qdrant_vectorstore
import aiohttp

In [2]:
# Rebinds `vector_store` (previously the "temp_collection" store) to the "temp" collection.
# NOTE(review): the recorded log output shows a DELETE followed by a PUT on
# /collections/temp, so this helper appears to drop and recreate the collection
# on every run -- confirm in gef_ml.utils before pointing it at real data.
vector_store = get_qdrant_vectorstore(collection_name="temp")

# Ingestion manager that reads from ../data/to_ingest/ (relative to the notebook's
# working directory) and targets the vector store above.
ingest_manager = StreamingIngestion(
    directory="../data/to_ingest/", vector_store=vector_store
)

2024-03-04 12:49:06,091 - httpx - INFO - HTTP Request: DELETE http://localhost:6333/collections/temp "HTTP/1.1 200 OK"
2024-03-04 12:49:06,385 - httpx - INFO - HTTP Request: PUT http://localhost:6333/collections/temp "HTTP/1.1 200 OK"
2024-03-04 12:49:06,388 - httpx - INFO - HTTP Request: GET http://localhost:6333/collections/temp "HTTP/1.1 200 OK"
2024-03-04 12:49:06,392 - gef_ml.ingestion - INFO - Initializing StreamingIngestion for directory: ../data/to_ingest/


In [3]:
# Ingest the documents of project "6930"; the recorded log shows them being read
# from ../data/to_ingest/6930.
# NOTE(review): `_ingest_project_id` is underscore-prefixed (private by convention);
# prefer a public StreamingIngestion entry point if one exists.
nodes = ingest_manager._ingest_project_id("6930")

2024-03-04 12:49:07,152 - gef_ml.ingestion - INFO - Ingesting documents for project ID: 6930 from ../data/to_ingest/6930
2024-03-04 12:49:08,169 - gef_ml.ingestion - INFO - Loaded 15 documents for project 6930.
  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 15/15 [00:00<00:00, 1096.15it/s]
2024-03-04 12:49:08,191 - gef_ml.ingestion - INFO - Processed 22 documents for project 6930.


In [4]:
# HTTP session shared by the embedding request below.
# NOTE(review): no cell visible here closes this session; call
# `await session.close()` once the requests are done to release its connections.
session = aiohttp.ClientSession()

In [5]:
# Request an embedding for the first node only. The call is NOT awaited here:
# a later cell does `await embedding`, so despite its name this variable holds
# an awaitable, not the embedding vector itself.
embedding = ingest_manager.fetch_embedding(session=session, node=nodes[0], model="togethercomputer/m2-bert-80M-32k-retrieval")

In [28]:
from collections.abc import Collection
from typing import Any

def print_types_simplified_corrected(obj, indent=0):
    """
    Recursively print the type structure of an object.

    Dictionaries are expanded entry by entry, showing the key type and then the
    value's structure. Other non-string collections are summarised in one line
    when every item is the same primitive type; otherwise each item is expanded.
    Empty collections are reported as empty instead of "of mixed types".

    Note: the "of mixed types" label is also used for a collection whose items
    all share a single non-primitive type (e.g. a list of dicts).

    Args:
    - obj: The object whose type information is to be printed.
    - indent (int): The current indentation level for pretty printing.

    Returns:
    - None. The description is written to standard output.
    """
    primitive_names = ("int", "float", "str", "bool", "NoneType")
    prefix = " " * indent
    type_name = type(obj).__name__

    if isinstance(obj, dict):
        print(f"{prefix}{type_name} containing:")
        for key, value in obj.items():
            # end="" keeps the value description on the same line; the recursive
            # call still emits its own indent before the type name.
            print(f"{prefix}  Key type: {type(key).__name__}, Value type: ", end="")
            print_types_simplified_corrected(value, indent + 4)
    elif isinstance(obj, Collection) and not isinstance(obj, str):
        # An empty container has no item types at all; report that explicitly
        # rather than falling through to the "mixed types" branch.
        if len(obj) == 0:
            print(f"{prefix}{type_name} (empty)")
            return
        item_types = {type(item).__name__ for item in obj}
        if len(item_types) == 1:
            (item_type,) = item_types
            if item_type in primitive_names:
                print(f"{prefix}{type_name} of {item_type} items")
                return
        print(f"{prefix}{type_name} of mixed types")
        for item in obj:
            print_types_simplified_corrected(item, indent + 4)
    else:
        print(f"{prefix}{type_name}")

In [30]:
def print_types_with_key_names(obj, key_name=None, indent=0):
    """
    Recursively print the type structure of an object, labelling dictionary
    values with their key names.

    Dictionaries are expanded entry by entry. Other non-string collections are
    summarised in one line when every item is the same primitive type;
    otherwise each item is expanded on its own line. Empty collections are
    reported as empty instead of "of mixed types".

    Note: the "of mixed types" label is also used for a collection whose items
    all share a single non-primitive type (e.g. a list of dicts).

    Args:
    - obj: The object whose type information is to be printed.
    - key_name: The name of the key (if any) associated with the current object.
      ``None`` means "no key"; any other value, including falsy keys such as
      ``0`` or ``""``, is shown.
    - indent (int): The current indentation level for pretty printing.

    Returns:
    - None. The description is written to standard output.
    """
    primitive_names = ("int", "float", "str", "bool", "NoneType")
    prefix = " " * indent
    type_name = type(obj).__name__
    # Compare against None rather than relying on truthiness, so that falsy
    # keys (0, "", False) are not silently dropped from the output.
    key_str = f"'{key_name}' " if key_name is not None else ""

    if isinstance(obj, dict):
        print(f"{prefix}{key_str}{type_name} containing:")
        for key, value in obj.items():
            print_types_with_key_names(value, key_name=key, indent=indent + 4)
    elif isinstance(obj, Collection) and not isinstance(obj, str):
        # An empty container has no item types at all; report that explicitly
        # rather than falling through to the "mixed types" branch.
        if len(obj) == 0:
            print(f"{prefix}{key_str}{type_name} (empty)")
            return
        item_types = {type(item).__name__ for item in obj}
        if len(item_types) == 1:
            (item_type,) = item_types
            if item_type in primitive_names:
                print(f"{prefix}{key_str}{type_name} of {item_type} items")
                return
        print(f"{prefix}{key_str}{type_name} of mixed types")
        for item in obj:
            print_types_with_key_names(item, indent=indent + 4)
    else:
        print(f"{prefix}{key_str}{type_name}")

In [None]:
# Resolve the pending embedding request created in the earlier cell
# (top-level `await` works inside a notebook cell).
# NOTE(review): if `embedding` is a coroutine object it can only be awaited once,
# so re-running this cell without re-running the cell that created it would
# fail -- confirm what fetch_embedding returns.
response = await embedding
# Bare expression: displays the whole response payload as the cell output.
response

In [31]:
# Print the nested type structure of the response instead of its raw values.
print_types_with_key_names(response)

dict containing:
    'object' str
    'data' list of mixed types
        dict containing:
            'object' str
            'embedding' list of float items
            'index' int
    'model' str


In [16]:
# Show the embedding vector of the first entry in the response's 'data' list.
# NOTE(review): this displays the entire vector; slice it (e.g. `[:10]`) to keep
# the saved output short.
response['data'][0]['embedding']

[-0.043873984,
 -0.07827735,
 0.006439118,
 -0.031071128,
 -0.00095829525,
 0.023510134,
 0.042881183,
 -0.01731634,
 -0.08306804,
 -0.055455036,
 0.023329835,
 0.005277718,
 -0.043901924,
 -0.0040224544,
 -0.0054063494,
 0.06583597,
 0.010401386,
 0.0060811187,
 -0.122982234,
 0.002388037,
 0.019085478,
 0.0059112795,
 0.01901772,
 -0.06311566,
 0.0803122,
 0.013560534,
 -0.08629656,
 0.014837368,
 0.08834745,
 -0.005133883,
 0.0063822707,
 0.02127239,
 0.0014071139,
 -0.019836212,
 -0.093763664,
 -0.042706307,
 -0.017153494,
 0.05616766,
 0.009276444,
 0.03801713,
 0.05802761,
 -0.010630067,
 -0.016273387,
 -0.050216015,
 -0.038751382,
 0.0021800126,
 0.00801691,
 0.04555009,
 -0.014232045,
 -0.028971352,
 -0.037945908,
 -0.039426643,
 0.025607381,
 -0.026736364,
 0.025604453,
 0.06608859,
 -0.0077481824,
 0.03613252,
 -0.07653167,
 -0.09156937,
 -0.011590372,
 0.06858938,
 0.028361814,
 -0.004184971,
 0.031571668,
 -0.01252438,
 -0.011770919,
 0.039808378,
 0.026196163,
 -0.04422525

In [None]:
# Generate embeddings for every node produced by the project ingestion above.
# NOTE(review): the result is not awaited here; if generate_embeddings_rest is a
# coroutine function, `embeddings` would hold a coroutine rather than the
# vectors -- confirm against gef_ml.ingestion.
embeddings = ingest_manager.generate_embeddings_rest(nodes)