# Ingesting Data into Qdrant
This notebook showcases how to use the `gef-ml` package to create the initial vector database with documents from `/data`.

This assumes data has already been downloaded into `/data/dumps/`

In [None]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

qdrant_client = QdrantClient(
    location="http://localhost:6333",
)
vector_store = QdrantVectorStore(client=qdrant_client, collection_name="temp_collection")

In [None]:
from gef_ml.ingestion import get_pipeline

ingest_pipeline = get_pipeline(vector_store)

## Run the pipeline on a single document

In [None]:
from llama_index.readers.file import PDFReader

# Testing

In [1]:
from gef_ml.ingestion import StreamingIngestion
from gef_ml.utils import get_qdrant_vectorstore
import aiohttp

In [2]:
vector_store = get_qdrant_vectorstore(collection_name="temp")

ingest_manager = StreamingIngestion(
    directory="../data/to_ingest/", vector_store=vector_store
)

2024-03-04 13:21:14,309 - httpx - INFO - HTTP Request: DELETE http://localhost:6333/collections/temp "HTTP/1.1 200 OK"


2024-03-04 13:21:14,601 - httpx - INFO - HTTP Request: PUT http://localhost:6333/collections/temp "HTTP/1.1 200 OK"
2024-03-04 13:21:14,606 - httpx - INFO - HTTP Request: GET http://localhost:6333/collections/temp "HTTP/1.1 200 OK"
2024-03-04 13:21:14,610 - gef_ml.ingestion - INFO - Initializing StreamingIngestion for directory: ../data/to_ingest/


In [3]:
nodes = ingest_manager._ingest_project_id("6930")

2024-03-04 13:21:14,868 - gef_ml.ingestion - INFO - Ingesting documents for project ID: 6930 from ../data/to_ingest/6930
2024-03-04 13:21:15,909 - gef_ml.ingestion - INFO - Loaded 15 documents for project 6930.
  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 15/15 [00:00<00:00, 1054.99it/s]
2024-03-04 13:21:15,931 - gef_ml.ingestion - INFO - Processed 22 documents for project 6930.


In [4]:
from collections.abc import Collection
from typing import Any

def print_types_simplified_corrected(obj, indent=0):
    """
    Recursively prints the types of any object, including nested structures such as dictionaries,
    lists, sets, and other collections, simplifying the output for containers of primitives
    by not listing individual primitives.

    Args:
    - obj: The object whose type information is to be printed.
    - indent (int): The current indentation level for pretty printing.
    """
    prefix = " " * indent
    if isinstance(obj, dict):
        print(f"{prefix}{type(obj).__name__} containing:")
        for key, value in obj.items():
            print(f"{prefix}  Key type: {type(key).__name__}, Value type: ", end="")
            print_types_simplified_corrected(value, indent + 4)
    elif isinstance(obj, Collection) and not isinstance(obj, str):
        item_types = {type(item).__name__ for item in obj}
        if len(item_types) == 1:
            item_type = next(iter(item_types))  # Use next(iter()) to safely access the first item
            if item_type in ["int", "float", "str", "bool", "NoneType"]:
                print(f"{prefix}{type(obj).__name__} of {item_type} items")
                return
        print(f"{prefix}{type(obj).__name__} of mixed types")
        for item in obj:
            print_types_simplified_corrected(item, indent + 4)
    else:
        print(f"{prefix}{type(obj).__name__}")

In [5]:
def print_types_with_key_names(obj, key_name=None, indent=0):
    """
    Recursively prints the types of any object, including nested structures such as dictionaries,
    lists, sets, and other collections, simplifying the output for containers of primitives
    by not listing individual primitives. It includes the key names for dictionary entries.

    Args:
    - obj: The object whose type information is to be printed.
    - key_name: The name of the key (if any) associated with the current object.
    - indent (int): The current indentation level for pretty printing.
    """
    prefix = " " * indent
    key_str = f"'{key_name}' " if key_name else ""
    if isinstance(obj, dict):
        print(f"{prefix}{key_str}{type(obj).__name__} containing:")
        for key, value in obj.items():
            print_types_with_key_names(value, key_name=key, indent=indent + 4)
    elif isinstance(obj, Collection) and not isinstance(obj, str):
        item_types = {type(item).__name__ for item in obj}
        if len(item_types) == 1:
            item_type = next(iter(item_types))  # Use next(iter()) to safely access the first item
            if item_type in ["int", "float", "str", "bool", "NoneType"]:
                print(f"{prefix}{key_str}{type(obj).__name__} of {item_type} items")
                return
        print(f"{prefix}{key_str}{type(obj).__name__} of mixed types")
        for item in obj:
            print_types_with_key_names(item, indent=indent + 4)
    else:
        print(f"{prefix}{key_str}{type(obj).__name__}")

In [6]:
session = aiohttp.ClientSession()

In [7]:
embedding = ingest_manager.fetch_embedding(session=session, node=nodes[0], model="togethercomputer/m2-bert-80M-32k-retrieval")

In [None]:
response = await embedding
response

In [9]:
print_types_with_key_names(response)

list of float items


In [None]:
response['data'][0]['embedding']

In [10]:
embeddings = ingest_manager.generate_embeddings_rest(nodes)

In [11]:
resp = await embeddings

2024-03-04 13:21:44,995 - gef_ml.ingestion - INFO - Generating embeddings for 22 nodes using model togethercomputer/m2-bert-80M-32k-retrieval
Creating tasks: 100%|██████████| 22/22 [00:00<00:00, 98689.51it/s]


In [13]:
print_types_with_key_names(resp)

list of mixed types
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items
    list of float items


In [15]:
from llama_index.core.schema import MetadataMode
nodes[0].get_content(metadata_mode=MetadataMode.EMBED)

'page_label: 1\nfilename: p6930_doc2.pdf\nextension: .pdf\nproject_id: 6930\ndoc_id: 2\n\n_______________________________________________________________________________________ _____________________________  \n \nGEF ID:  6930  \nCountry/Region:  China  \nProject Title:  Energy Efficiency Improvement in Public Sector Buildings   \nGEF Agency:  UNDP  GEF Agency Project ID:  5395 (UNDP)  \nType of Trust Fund:  GEF Trust Fund  GEF Focal Area (s):  Climate Change  \nGEF -5 Focal Area/ LDCF/SCCF Objective (s):   \nAnticipated Financing  PPG:  $200,000  Project Grant:  $8,932,420  \nCo-financing:  $62,500,000  Total Project Cost:  $71,632,420  \nPIF Approval:   Council Approval/Expected:  October 01, 2015  \nCEO Endorsement/Approval   Expected Project Start Date:   \nProgram Manager:  Ming Yang  Agency Contact Person:  Manuel L. Soriano  \n \nReview Criteria  Questions  Secretariat Comment at PIF (PFD)/Work \nProgram Inclusion 1 Secretariat Comment At CEO \nEndorsement(FSP)/Approval (MSP)  