In [5]:
from langchain_chroma import Chroma
from langchain.indexes import SQLRecordManager, index
import os
import glob



In [2]:

from utils.extract_knowledge import extract_text_from_pdf, split_text_into_chunks


In [None]:

def get_records_manager(database, namespace):

    # if the file exists, load the record manager from the file
    if os.path.exists(database):
        record_manager = SQLRecordManager(
            namespace, db_url=f"sqlite:///{database}"
        )
        return record_manager
    else:
        record_manager = SQLRecordManager(
            namespace, db_url=f"sqlite:///{database}"
        )
        record_manager.create_schema()

def get_huggingface_model(model_name):
    from langchain_community.embeddings import HuggingFaceEmbeddings
    model_kwargs =  {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': False}
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    return hf
    

if not os.path.exists("./.cache"):
    os.makedirs("./.cache")

namespace = f"rags/docs"

model_name = "BAAI/bge-base-en-v1.5"
print(f"Loading Hugging Face model {model_name}")
hf = get_huggingface_model(model_name)

print(f"Getting records manager for namespace {namespace} in db")
record_manager = get_records_manager("./.cache/record_manager_cache.sql", namespace)


chroma = Chroma("docs",  embedding_function=hf, persist_directory="./.cache/chroma/docs")

docs = extract_text_from_pdf("data/unilever.pdf")
docs = split_text_into_chunks(docs)

print(f"Indexing {len(docs)} splits of Ollama docs")


In [None]:

indexing = index(
    docs,
    record_manager,
    chroma,
    cleanup='incremental',
    source_id_key="key",
)
print(indexing)

In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)

KeyboardInterrupt: 

In [3]:
docs = extract_text_from_pdf("./data/unilever.pdf")
len(docs[:50000])

50000

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
    attn_implementation="eager"
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:

generation_args = {
    "max_new_tokens": 500,
}
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
pipe(docs[:50000],**generation_args)

You are not running the flash-attention implementation, expect numerical differences.


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.42 GiB. GPU 