In [135]:
import os
from typing import Optional, List, Dict

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, scan, streaming_bulk

In [None]:
type_map = {
    "int":"integer",
    "float":"float",
    "double":"double",
    "str": "text",
    "bool": "boolean",
    "datetime": "date",
    "list[int]":"integer",
    "list[str]":"text",
    "list[float]": "float",
    "list[double]": "double",
    "torch.tensor": "dense_vector",
    "numpy.ndarray": "dense_vector"
}

In [244]:
def traverse_map (map_dict):
    original_map_dict = dict(map_dict)
    for k, v in map_dict.items():
        if isinstance(v, dict):
            traverse_map(v)
        else:
            try:
                map_dict[k] = {"type":TYPE_MAP[v.lower()]}
            except Exception as e:
                print(f'{e.__class__.__name__}: Key {v} not found in TYPE_MAP. Mapping not updated')
                return original_map_dict
    return map_dict


In [7]:
TYPE_MAP =  {
    "int":"integer",
    "float":"float",
    "double":"double",
    "str": "text",
    "bool": "boolean",
    "datetime": "date",
    "list[int]":"integer",
    "list[str]":"text",
    "list[float]": "float",
    "list[double]": "double",
    "torch.tensor": "dense_vector",
    "numpy.ndarray": "dense_vector"
}

In [34]:
user_map = {
    "name":"str",
    "age":"int",
    "education":{
        "primary":{
            "school":"str"
        },
        "secondary":"str",
        "tertiary":"str"
    }
}

In [247]:
final_map = {"mappings":{"properties":{}}}

In [248]:
updated_map = traverse_map(user_map)

KeyError: Key meow not found in TYPE_MAP. Mapping not updated


In [249]:
updated_map

{'name': {'type': 'text'},
 'age': {'type': 'integer'},
 'education': {'primary': {'school': 'meow'},
  'secondary': {'type': 'text'},
  'tertiary': {'type': 'text'}}}

In [8]:
TYPE_MAP =  {
    "int":"integer",
    "float":"float",
    "double":"double",
    "str": "text",
    "bool": "boolean",
    "datetime": "date",
    "list[int]":"integer",
    "list[str]":"text",
    "list[float]": "float",
    "list[double]": "double",
    "torch.tensor": "dense_vector",
    "numpy.ndarray": "dense_vector"
}

In [194]:
user_map = {
    "name":"str",
    "age":"int",
    "education":{
        "primary":{
            "school":"str"
        },
        "secondary":"str",
        "tertiary":"str"
    }
}

user_map = {
    "owner_name": "str",
    "owner_id": "str",
    "cats": "int",
    "dogs": "int"
}

In [221]:
MAX_BULK_SIZE=10000
class DocMgr():
    
    
    def __init__(self):
        self.url = f"https://{os.environ.get('ELASTICSEARCH_HOST')}:{os.environ.get('ELASTICSEARCH_C_PORT')}"
        self.username = os.environ.get('ELASTIC_USERNAME')
        self.password = os.environ.get('ELASTIC_PASSWORD')
        self.client = Elasticsearch(self.url, 
                                    verify_certs=False, 
                                    basic_auth=(self.username, self.password))
        
        self.consolidated_actions=[]

        
    def _check_valid_values(self, map_dict:dict) -> int:
        """
        Traverse mapping dictionary to ensure that all types are valid types within TYPE_MAP

        Args:
            map_dict (dict): Mapping to be checked

        Returns:
            int: 0 if there is invalid types, 1 otherwise

        """
        ret_val = 1
        for k, v in map_dict.items():
            if isinstance(v, dict):
                ret_val = self._check_valid_values(v)
            else:
                if not v in TYPE_MAP:
                    print(f"'{v}' type for '{k}' NOT FOUND")
                    return 0

        return ret_val * 1
    
    def _traverse_map (self, map_dict:Dict) -> Dict:
        """
        Traverse mapping dictionary to convert data type into framework specific type

        Args:
            map_dict (dict): Mapping to be used to create ES index

        Returns:
            dict: updated mapping dictionary

        """
        dictionary ={"properties":dict()}
        for k, v in map_dict.items():
            if isinstance(v, dict):
                dictionary['properties'][k]= self._traverse_map(v)
            else:
                dictionary['properties'][k]={"type":TYPE_MAP[v]}       
        return dictionary
    
    def _flush(self):
        errors = []
        list_of_es_ids = []
        for ok, item in streaming_bulk(self.client, self.consolidated_actions):
            if not ok:
                errors.append(item)
            else:
                list_of_es_ids.append(item['index']['_id'])
        print("List of faulty documents:", errors)
        self.consolidated_actions=[] # Reset List
        return list_of_es_ids
        
    
    def create_collection(self, collection_name: str, schema: Dict) -> Dict:
        """
        Create the index on ElasticSearch

        Args:
            collection_name (str): Index name of ES
            schema (dict): Mapping to be used to create ES index

        Returns:
            dict: response of error, or 200 if no errors caught
            
        """
        try:
            assert type(schema)==dict
        except Exception as e:
            return {"response":f"{e.__class__.__name__}: Type of 'schema' is not dict"}
        try:
            assert type(collection_name)==str
        except Exception as e:
            return {"response":f"{e.__class__.__name__}: Type of 'collection_name' is not str"}

        mapping_validity = self._check_valid_values(schema)
        if not mapping_validity:
            return {"response": "KeyError: data type not found in TYPE_MAP"}
        updated_mapping = self._traverse_map(schema)
        try:
            self.client.indices.create(index=collection_name, mappings=updated_mapping)
        except Exception as e:
            return {"response":f"{e}"}
        return {"response":"200"}
    
    def delete_collection(self, collection_name: str) -> dict:
        """
        Create the index on ElasticSearch

        Args:
            collection_name (str): Index name of ES
            schema (dict): Mapping to be used to create ES index

        Returns:
            dict: response of error, or 200 if no errors caught

        """
        try:
            self.client.indices.delete(index=collection_name)
        except Exception as e:
            return {"response": f"{e}"}
        return {"response":"200"}
    
    def create_document(self, collection_name: str, documents: List[Dict], id_field: str=None) -> dict:
        """
        Upload document(s) in the specified index within ElasticSearch

        Args:
            collection_name (str): Index name of ES
            documents (list): A list of document objects to be ingested
            id_field (str): specify the key amongst the document object to be the id field. If not specified, id will be generated by ES. 

        Returns:
            dict: response of error along with the faulty document, or 200 along with the ids of ingested document if no errors caught

        """
        try:
            assert type(documents)==list
        except Exception as e:
            return {"response":f"{e.__class__.__name__}: Type of 'documents' is not dict"}
        try:
            assert type(collection_name)==str
        except Exception as e:
            return {"response":f"{e.__class__.__name__}: Type of 'collection_name' is not str"}
        if not id_field is None:
            try:
                assert type(id_field)==str
            except Exception as e:
                return {"response":f"{e.__class__.__name__}: Type of 'id_field' is not str"}
        
        # If id_field is specified, verify that all documents possess the id_field. 
        if id_field != None:
            for doc in documents:
                if not id_field in doc.keys():
                    print("Fix document, or set 'id_field' to None. No documents uploaded.")
                    return {"response": "Fix document, or set 'id_field' to None. No documents uploaded.",
                           "error_doc": doc}
                try:
                    doc[id_field] = str(doc[id_field])
                except Exception as e:
                    return {"response": f"{e.__class__.__name__}: id cannot be casted to String type. No documents uploaded.",
                           "error_doc": doc}
        all_id = []
        for doc in documents:
            action_dict={}
            action_dict['_op_type']= 'index'
            action_dict['_index']=collection_name
            if id_field != None:
                action_dict['_id']=doc[id_field]
                doc.pop(id_field)
            action_dict['_source']=doc
            self.consolidated_actions.append(action_dict)
            if len(self.consolidated_actions) == MAX_BULK_SIZE:
                all_id = all_id+self._flush()
        
        all_id = all_id+self._flush()
        
        return {"response":"200", "ids": all_id}
    
    def delete_document(self, collection_name: str, doc_id:str) -> dict:
        """
        Delete document from index based on the specified document id. 

        Args:
            collection_name (str): Index name of ES
            doc_id (str): id of doc to be deleted

        Returns:
            dict: response of error along with the faulty document, or 200 along with elastic API response

        """
        try:
            assert type(collection_name)==str
        except Exception as e:
            return {"response":f"{e.__class__.__name__}: Type of 'collection_name' is not str"}
        try:
            assert type(doc_id)==str
        except Exception as e:
            return {"response":f"{e.__class__.__name__}: Type of 'doc_id' is not str"}
        
        # Check for document's existence     
        search_result = self.client.search(index="meow", query={"match":{"_id":doc_id}})
        result_count = search_result['hits']['total']['value']
        
        if result_count == 0:
            return {"response": f"Document '{doc_id}' not found!"}
        
        try:
            resp = self.client.delete(index="meow", id=doc_id)
        except Exception as e:
            return {"response":f"{e.__class__.__name__}. Document Deletion failed"}
        
        return {"response":"200", "api_resp": resp}
    

In [222]:
es_mgr = DocMgr()

In [201]:
create_res = es_mgr.create_collection(collection_name = "meow", schema = user_map)
print(create_res)

{'response': '200'}




In [200]:
del_res = es_mgr.delete_collection("meow")
print(del_res)

{'response': '200'}




In [139]:
def create_document(collection_name: str, properties: list, id_name:str=None) -> dict:
    print(collection_name)
    print(properties)
    print(id_name)

In [204]:
import random
import string

# printing lowercase

rand_data = []
user_map = {
    "owner_name": "str",
    "owner_id": "str",
    "cats": "int",
    "dogs": "int"
}

letters = string.ascii_lowercase
for i in range(1000):
    fake_data = {}
    fake_data["owner_name"] = ''.join(random.choice(letters) for i in range(10))
    fake_data['owner_id'] = i
    fake_data['cats'] = random.randint(0,5)
    fake_data['dogs'] = random.randint(0,5)
    rand_data.append(fake_data)

In [205]:
rand_data

[{'owner_name': 'kblgshxteh', 'owner_id': 0, 'cats': 0, 'dogs': 2},
 {'owner_name': 'aaxvguzxkc', 'owner_id': 1, 'cats': 5, 'dogs': 0},
 {'owner_name': 'gkusonhnei', 'owner_id': 2, 'cats': 0, 'dogs': 5},
 {'owner_name': 'yrpipvshmb', 'owner_id': 3, 'cats': 1, 'dogs': 1},
 {'owner_name': 'pxoenthvef', 'owner_id': 4, 'cats': 2, 'dogs': 0},
 {'owner_name': 'zcztimhdfc', 'owner_id': 5, 'cats': 4, 'dogs': 4},
 {'owner_name': 'xudyjrzhyq', 'owner_id': 6, 'cats': 3, 'dogs': 1},
 {'owner_name': 'yqlbdeinhs', 'owner_id': 7, 'cats': 5, 'dogs': 0},
 {'owner_name': 'iougvaeudd', 'owner_id': 8, 'cats': 0, 'dogs': 2},
 {'owner_name': 'btwiuyhitw', 'owner_id': 9, 'cats': 1, 'dogs': 1},
 {'owner_name': 'zvztpvacpn', 'owner_id': 10, 'cats': 0, 'dogs': 2},
 {'owner_name': 'dakorqhbzt', 'owner_id': 11, 'cats': 5, 'dogs': 5},
 {'owner_name': 'fbithftyym', 'owner_id': 12, 'cats': 5, 'dogs': 2},
 {'owner_name': 'vdwfhtjkxu', 'owner_id': 13, 'cats': 2, 'dogs': 1},
 {'owner_name': 'vqnzbthqyj', 'owner_id': 14

In [208]:

upload_res = es_mgr.create_document(collection_name = "meow", documents=rand_data)

List of faulty documents: []




In [209]:
upload_res

{'response': '200',
 'ids': ['YTy8E4UBimE7927zEki1',
  'Yjy8E4UBimE7927zEki1',
  'Yzy8E4UBimE7927zEki1',
  'ZDy8E4UBimE7927zEki1',
  'ZTy8E4UBimE7927zEki1',
  'Zjy8E4UBimE7927zEki1',
  'Zzy8E4UBimE7927zEki1',
  'aDy8E4UBimE7927zEki1',
  'aTy8E4UBimE7927zEki1',
  'ajy8E4UBimE7927zEki1',
  'azy8E4UBimE7927zEki1',
  'bDy8E4UBimE7927zEki1',
  'bTy8E4UBimE7927zEki1',
  'bjy8E4UBimE7927zEki1',
  'bzy8E4UBimE7927zEki1',
  'cDy8E4UBimE7927zEki1',
  'cTy8E4UBimE7927zEki1',
  'cjy8E4UBimE7927zEki1',
  'czy8E4UBimE7927zEki1',
  'dDy8E4UBimE7927zEki1',
  'dTy8E4UBimE7927zEki1',
  'djy8E4UBimE7927zEki1',
  'dzy8E4UBimE7927zEki1',
  'eDy8E4UBimE7927zEki1',
  'eTy8E4UBimE7927zEki1',
  'ejy8E4UBimE7927zEki1',
  'ezy8E4UBimE7927zEki1',
  'fDy8E4UBimE7927zEki1',
  'fTy8E4UBimE7927zEki1',
  'fjy8E4UBimE7927zEki1',
  'fzy8E4UBimE7927zEki1',
  'gDy8E4UBimE7927zEki1',
  'gTy8E4UBimE7927zEki1',
  'gjy8E4UBimE7927zEki2',
  'gzy8E4UBimE7927zEki2',
  'hDy8E4UBimE7927zEki2',
  'hTy8E4UBimE7927zEki2',
  'hjy8E4UB

In [225]:

delete_res = es_mgr.delete_document(collection_name = "meow", doc_id='Yzy8E4UBimE7927zEki1')



In [226]:
delete_res

{'response': '200',
 'api_resp': ObjectApiResponse({'_index': 'meow', '_id': 'Yzy8E4UBimE7927zEki1', '_version': 2, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1002, '_primary_term': 1})}