In [5]:
from langchain_chroma import Chroma
from langchain.indexes import SQLRecordManager, index
import os
import glob
from utils.extract_knowledge import extract_text_from_pdf, split_text_into_chunks

from tqdm import tqdm
tqdm()

def get_records_manager(database, namespace):
    """Return a ``SQLRecordManager`` backed by the SQLite file ``database``.

    Args:
        database: Path of the SQLite file, interpolated into a
            ``sqlite:///`` URL.
        namespace: Namespace passed through to ``SQLRecordManager``.

    Returns:
        The record manager. If ``database`` did not exist when this function
        was called, ``create_schema()`` is invoked on the manager before it
        is returned.
    """
    # Check for the file *before* building the manager so the decision to
    # create the schema reflects the state on entry.
    database_exists = os.path.exists(database)

    record_manager = SQLRecordManager(
        namespace, db_url=f"sqlite:///{database}"
    )
    if not database_exists:
        record_manager.create_schema()

    # Previously the "file does not exist" path fell off the end of the
    # function and implicitly returned None.
    return record_manager

def get_huggingface_model(model_name):
    """Build a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    The model is configured to run on the CPU and embeddings are not
    normalized.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    from langchain_community.embeddings import HuggingFaceEmbeddings

    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )
    

# Make sure the local cache directory exists; it holds the record-manager
# database and the Chroma persistence directory used below.
os.makedirs("./.cache", exist_ok=True)

namespace = "rags/docs"

model_name = "BAAI/bge-base-en-v1.5"
print(f"Loading Hugging Face model {model_name}")
hf = get_huggingface_model(model_name)

print(f"Getting records manager for namespace {namespace} in db")
record_manager = get_records_manager("./.cache/record_manager_cache.sql", namespace)


# Vector store for the "docs" collection, embedded with the model loaded above.
chroma = Chroma("docs", embedding_function=hf, persist_directory="./.cache/chroma/docs")

# Keep the extraction result under its own name so `docs` is only ever the
# list of chunks that later cells consume.
extracted = extract_text_from_pdf("data/unilever.pdf")
docs = split_text_into_chunks(extracted)

print(f"Indexing {len(docs)} splits of Ollama docs")


0it [00:00, ?it/s]


Loading Hugging Face model BAAI/bge-base-en-v1.5


  from .autonotebook import tqdm as notebook_tqdm


Getting records manager for namespace rags/docs in db
Indexing 275 splits of Ollama docs


In [None]:

# Index the chunks into the Chroma store, tracking them through the record
# manager with incremental cleanup, then print whatever index() returns.
# NOTE(review): source_id_key="key" presumably requires every document to
# carry a "key" metadata entry - confirm against split_text_into_chunks.
indexing = index(
    docs,
    record_manager,
    chroma,
    cleanup='incremental',
    source_id_key="key",
)
print(indexing)

In [6]:
# Load model directly
# NOTE(review): `tokenizer` and `model` are not referenced by any later cell
# visible here; the pipeline cell loads the same checkpoint by name.
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("mdarhri00/named-entity-recognition")
model = AutoModelForTokenClassification.from_pretrained("mdarhri00/named-entity-recognition")

In [9]:
# Build the token-classification pipeline that the entity-collection loop
# calls as `pipe`.
from transformers import pipeline

pipe = pipeline("token-classification", model="mdarhri00/named-entity-recognition")



In [10]:
# Run the NER pipeline over every chunk and collect all token-level
# predictions into one flat list.
# NOTE(review): 'start'/'end' offsets are presumably relative to each
# individual chunk, so offsets from different chunks are mixed in `entities`
# - confirm before merging tokens by offset.
entities = []
for doc in docs:
    result = pipe(doc)
    # Prints the full prediction list for every chunk (very verbose output).
    print(result)
    entities.extend(result)

[{'entity': 'Organization_Name', 'score': 0.98519844, 'index': 11, 'word': 'Uni', 'start': 40, 'end': 43}, {'entity': 'Organization_Name', 'score': 0.9830253, 'index': 12, 'word': '##lever', 'start': 43, 'end': 48}, {'entity': 'date_time', 'score': 0.75316703, 'index': 18, 'word': '202', 'start': 76, 'end': 79}, {'entity': 'Organization_Name', 'score': 0.98144054, 'index': 31, 'word': 'Uni', 'start': 139, 'end': 142}, {'entity': 'Organization_Name', 'score': 0.97671074, 'index': 32, 'word': '##lever', 'start': 142, 'end': 147}, {'entity': 'date_time', 'score': 0.6393542, 'index': 44, 'word': '202', 'start': 196, 'end': 199}, {'entity': 'Organization_Name', 'score': 0.65053266, 'index': 49, 'word': 'the', 'start': 216, 'end': 219}, {'entity': 'Organization_Name', 'score': 0.9793289, 'index': 50, 'word': 'National', 'start': 220, 'end': 228}, {'entity': 'Organization_Name', 'score': 0.4973104, 'index': 57, 'word': 'the', 'start': 252, 'end': 255}, {'entity': 'Organization_Name', 'score':

KeyboardInterrupt: 

In [13]:
def combine_tokens(entities):
    """Merge character-contiguous tokens of the same label into single entities.

    A token extends the entity currently being built only when it starts
    exactly where that entity ends *and* carries the same ``'entity'`` label.
    Any other token closes the current entity and opens a new one.

    Args:
        entities: Iterable of token dicts with the keys ``'entity'``,
            ``'score'``, ``'word'``, ``'start'`` and ``'end'``.

    Returns:
        A list of dicts with the keys ``'entity'``, ``'score'``, ``'word'``,
        ``'start'`` and ``'end'``. ``'score'`` is the minimum score of the
        merged tokens and ``'word'`` is their concatenation with every
        ``'##'`` marker removed.
    """
    def new_entity(token):
        # Copy only the fields kept in the result ('index' is dropped).
        return {
            'entity': token['entity'],
            'score': token['score'],
            'word': token['word'].replace('##', ''),
            'start': token['start'],
            'end': token['end']
        }

    combined_entities = []
    current_entity = None

    for token in entities:
        if current_entity is None:
            current_entity = new_entity(token)
        elif (token['start'] == current_entity['end']
                and token['entity'] == current_entity['entity']):
            # Continuation of the current entity. The label check prevents
            # two differently-labelled tokens that merely touch from being
            # fused under the first token's label.
            current_entity['word'] += token['word'].replace('##', '')
            current_entity['end'] = token['end']
            current_entity['score'] = min(current_entity['score'], token['score'])
        else:
            # End of current entity and start of a new entity
            combined_entities.append(current_entity)
            current_entity = new_entity(token)

    # Add the last entity if it exists
    if current_entity is not None:
        combined_entities.append(current_entity)

    return combined_entities

def deduplicate_entities(entities):
    """Drop repeated entities, keeping the first occurrence of each.

    Two entities are duplicates when their ``'entity'``, ``'word'``,
    ``'start'`` and ``'end'`` values all match; ``'score'`` is ignored.

    Args:
        entities: Iterable of entity dicts.

    Returns:
        A list of the unique entity dicts in their original order.
    """
    fingerprints = set()
    unique = []

    for item in entities:
        fingerprint = (item['entity'], item['word'], item['start'], item['end'])
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        unique.append(item)

    return unique


In [14]:
# Merge the token-level predictions and print one merged entity per line.
combined_entities = combine_tokens(entities)
for entity in combined_entities:
    print(entity)

{'entity': 'Organization_Name', 'score': 0.9830253, 'word': 'Unilever', 'start': 40, 'end': 48}
{'entity': 'date_time', 'score': 0.75316703, 'word': '202', 'start': 76, 'end': 79}
{'entity': 'Organization_Name', 'score': 0.97671074, 'word': 'Unilever', 'start': 139, 'end': 147}
{'entity': 'date_time', 'score': 0.6393542, 'word': '202', 'start': 196, 'end': 199}
{'entity': 'Organization_Name', 'score': 0.65053266, 'word': 'the', 'start': 216, 'end': 219}
{'entity': 'Organization_Name', 'score': 0.9793289, 'word': 'National', 'start': 220, 'end': 228}
{'entity': 'Organization_Name', 'score': 0.4973104, 'word': 'the', 'start': 252, 'end': 255}
{'entity': 'Organization_Name', 'score': 0.9239107, 'word': 'Dutch', 'start': 256, 'end': 261}
{'entity': 'Organization_Name', 'score': 0.9581023, 'word': 'Authority', 'start': 262, 'end': 271}
{'entity': 'date_time', 'score': 0.52528656, 'word': '202', 'start': 413, 'end': 416}
{'entity': 'date_time', 'score': 0.5774464, 'word': '202', 'start': 468

In [16]:
# Merge the token-level predictions, drop exact repeats, and print what is left.
combined_entities = combine_tokens(entities)
deduplicated_entities = deduplicate_entities(combined_entities)

for entity in deduplicated_entities:
    print(entity)

{'entity': 'Organization_Name', 'score': 0.9830253, 'word': 'Unilever', 'start': 40, 'end': 48}
{'entity': 'date_time', 'score': 0.75316703, 'word': '202', 'start': 76, 'end': 79}
{'entity': 'Organization_Name', 'score': 0.97671074, 'word': 'Unilever', 'start': 139, 'end': 147}
{'entity': 'date_time', 'score': 0.6393542, 'word': '202', 'start': 196, 'end': 199}
{'entity': 'Organization_Name', 'score': 0.65053266, 'word': 'the', 'start': 216, 'end': 219}
{'entity': 'Organization_Name', 'score': 0.9793289, 'word': 'National', 'start': 220, 'end': 228}
{'entity': 'Organization_Name', 'score': 0.4973104, 'word': 'the', 'start': 252, 'end': 255}
{'entity': 'Organization_Name', 'score': 0.9239107, 'word': 'Dutch', 'start': 256, 'end': 261}
{'entity': 'Organization_Name', 'score': 0.9581023, 'word': 'Authority', 'start': 262, 'end': 271}
{'entity': 'date_time', 'score': 0.52528656, 'word': '202', 'start': 413, 'end': 416}
{'entity': 'date_time', 'score': 0.5774464, 'word': '202', 'start': 468

In [11]:
def combine_tokens(entities):
    """Merge ``B-``/``I-`` tagged tokens into whole entities.

    * A ``B-X`` token always starts a new entity of type ``X``.
    * An ``I-X`` token extends the open entity when that entity is also of
      type ``X``. Otherwise it starts a new entity of type ``X`` itself, so
      spans that the model begins with an ``I-`` tag are kept instead of
      being dropped.
    * Any token whose label has neither prefix closes the open entity and is
      itself discarded.

    Args:
        entities: Iterable of token dicts with the keys ``'entity'``,
            ``'score'``, ``'word'``, ``'start'`` and ``'end'``.

    Returns:
        A list of dicts with the keys ``'entity'`` (label without its
        two-character prefix), ``'score'`` (minimum over the merged tokens),
        ``'word'``, ``'start'`` and ``'end'``. Word pieces starting with
        ``'##'`` are glued on directly; other continuation tokens are joined
        with a single space.
    """
    def new_entity(token):
        return {
            'entity': token['entity'][2:],  # Remove the 'B-' / 'I-' prefix
            'score': token['score'],
            'word': token['word'].replace('##', ''),
            'start': token['start'],
            'end': token['end']
        }

    combined_entities = []
    current_entity = None

    for token in entities:
        label = token['entity']
        is_begin = label.startswith('B-')
        is_inside = label.startswith('I-')

        if is_inside and current_entity and label[2:] == current_entity['entity']:
            # Continuation of the current entity
            if token['word'].startswith('##'):
                current_entity['word'] += token['word'].replace('##', '')
            else:
                current_entity['word'] += ' ' + token['word']
            current_entity['end'] = token['end']
            current_entity['score'] = min(current_entity['score'], token['score'])
            continue

        # Every other token closes whatever entity is currently open.
        if current_entity:
            combined_entities.append(current_entity)

        # 'B-' starts a new entity; so does an 'I-' token that has nothing
        # compatible to attach to. Unprefixed labels start nothing.
        current_entity = new_entity(token) if (is_begin or is_inside) else None

    # Add the last entity if it exists
    if current_entity:
        combined_entities.append(current_entity)

    return combined_entities


In [12]:
# Merge the token-level predictions and print one merged entity per line.
# NOTE(review): this cell recorded no output - presumably because `entities`
# held labels without a 'B-'/'I-' prefix when it ran; confirm.
combined_entities = combine_tokens(entities)
for entity in combined_entities:
    print(entity)

In [2]:

def deduplicate_entities(entities):
    """Group entities by their ``'word'`` value.

    Args:
        entities: Iterable of entity dicts, each with a ``'word'`` key.

    Returns:
        A dict mapping every distinct word to a tuple of all entity dicts
        that carry it, in the order they were encountered.
    """
    grouped = {}

    for item in entities:
        surface = item['word']
        # Append to the existing tuple, or start one for a word not seen yet.
        grouped[surface] = grouped.get(surface, ()) + (item,)

    return grouped

In [3]:
# Rebinds `combined_entities` to the result of deduplicate_entities and shows it.
# NOTE(review): the cell feeds its own output name back in as input, so it
# only works once per kernel state and needs `combined_entities` from an
# earlier cell (the recorded run failed with a NameError).
combined_entities = deduplicate_entities(combined_entities)
combined_entities

NameError: name 'combined_entities' is not defined

In [4]:

# Sort the (word, occurrences) pairs by number of occurrences, most frequent
# first. Relies on `combined_entities` being a dict at this point.
sorted_entities = sorted(combined_entities.items(), key=lambda x: len(x[1]), reverse=True) 
sorted_entities

NameError: name 'combined_entities' is not defined

In [63]:
# Use a pipeline as a high-level helper
# Rebinds `pipe` to a token-classification pipeline for dslim/bert-large-NER.
from transformers import pipeline
pipe = pipeline("token-classification", model="dslim/bert-large-NER")

: 

In [None]:
# Use a pipeline as a high-level helper
# NOTE(review): same statements as the preceding cell; one of the two looks
# redundant.
from transformers import pipeline

pipe = pipeline("token-classification", model="dslim/bert-large-NER")

In [62]:
entities

[{'entity': 'B-ORG',
  'score': 0.99615484,
  'index': 11,
  'word': 'Un',
  'start': 40,
  'end': 42},
 {'entity': 'I-ORG',
  'score': 0.99241465,
  'index': 12,
  'word': '##ile',
  'start': 42,
  'end': 45},
 {'entity': 'I-ORG',
  'score': 0.9873697,
  'index': 13,
  'word': '##ver',
  'start': 45,
  'end': 48},
 {'entity': 'I-MISC',
  'score': 0.9377953,
  'index': 15,
  'word': 'Report',
  'start': 56,
  'end': 62},
 {'entity': 'I-MISC',
  'score': 0.95633817,
  'index': 17,
  'word': 'A',
  'start': 67,
  'end': 68},
 {'entity': 'I-MISC',
  'score': 0.8033139,
  'index': 18,
  'word': '##cco',
  'start': 68,
  'end': 71},
 {'entity': 'I-MISC',
  'score': 0.71900713,
  'index': 19,
  'word': '##unts',
  'start': 71,
  'end': 75},
 {'entity': 'B-ORG',
  'score': 0.99852175,
  'index': 33,
  'word': 'Un',
  'start': 139,
  'end': 141},
 {'entity': 'I-ORG',
  'score': 0.9931908,
  'index': 34,
  'word': '##ile',
  'start': 141,
  'end': 144},
 {'entity': 'I-ORG',
  'score': 0.9805885