# RecDP LLM - RAG

# Get started

## 1. Install pyrecdp and dependencies

In [None]:
! DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre
! pip install -q pyrecdp --pre
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. RAG Workflow

### Setup And Run RAG Indexer Pipeline

In [2]:
from pyrecdp.primitives.operations import UrlLoader,RAGTextFix,CustomerDocumentSplit,TextCustomerFilter,DocumentIngestion, GlobalDeduplicate
from pyrecdp.LLM import TextPipeline

urls = ['https://app.cnvrg.io/docs/', 
        'https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html',
        'https://app.cnvrg.io/docs/cli_v2/cnvrgv2_cli.html',
        'https://app.cnvrg.io/docs/collections/tutorials.html']

def custom_filter(text):
    from nltk.tokenize import word_tokenize
    ret_txt = None
    if len(word_tokenize(text)) >10:
        if text.split(' ')[0].lower()!='version':
            ret_txt = text
    return ret_txt != None

def chunk_doc(text,max_num_of_words):
    from nltk.tokenize import word_tokenize,sent_tokenize
    text= text.strip()
    if len(word_tokenize(text)) <= max_num_of_words:
        return [text]
    else:
        chunks = []
        # split by sentence
        sentences = sent_tokenize(text)
        # print('number of sentences: ', len(sentences))
        words_count = 0
        temp_chunk = ""
        for s in sentences:
            temp_chunk+=(s+" ")
            words_count += len(word_tokenize(s))
            if len(word_tokenize(temp_chunk))> max_num_of_words:
                chunks.append(temp_chunk)
                words_count = 0
                temp_chunk = ""
                
        return chunks

pipeline = TextPipeline()
ops = [
    UrlLoader(urls, max_depth=2),
    RAGTextFix(str_to_replace={'\n###': '', '\n##': '', '\n#': ''}, remove_extra_whitespace=True),
    CustomerDocumentSplit(func=lambda text: text.split('# ')[1:]),
    TextCustomerFilter(custom_filter),
    CustomerDocumentSplit(func=chunk_doc, max_num_of_words=50),
    GlobalDeduplicate(),
    DocumentIngestion(
        vector_store='elasticsearch',
        vector_store_args={'host': 'localhost', 'port': 9200}
    )
]
pipeline.add_operations(ops)
ds = pipeline.execute()
display(ds.toPandas())



[32m2023-12-21 14:52:59.406[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['bs4', 'langchain'][0m
[32m2023-12-21 14:52:59.409[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['ftfy', 'selectolax'][0m
[32m2023-12-21 14:52:59.411[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['ftfy'][0m
[32m2023-12-21 14:52:59.413[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['langchain'][0m
[32m2023-12-21 14:52:59.470[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['bs4', 'langchain'][0m
[

2023-12-21 14:53:31.086 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0
2023-12-21 14:53:31.208 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0
2023-12-21 14:53:31.271 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0
2023-12-21 14:53:31.279 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0
2023-12-21 14:53:31.315 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0
2023-12-21 14:53:31.368 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0
2023-12-21 14:53:31.371 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0

  Generate Global Hash took 25.209684751927853 sec
Generate Global indexing based on hash started ...
  Generate Global indexing based on hash took 0.4185355678200722 sec
Generate global duplication list started ...
  Generate global duplication list took 0.34351083636283875 sec
reduce input file based on detected duplication started ...
  reduce input file based on detected duplication took 0.3780368007719517 sec
DocumentIngestion
[32m2023-12-21 14:53:51.410[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['farm-haystack', 'farm-haystack[elasticsearch7]'][0m


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




execute with spark took 89.4477195031941 sec


Unnamed: 0,global_id,hash,metadata,text
0,global_id@17179869184,27cc3e8011a124ca3bf75a47fef72ff39721220bb75ce2...,{'description': 'Documentation website for cnv...,Tutorials and Examples To help you get started...
1,global_id@17179869185,218474edcc8258de97665e684b6a91d8b3df965808e563...,{'description': 'Documentation website for cnv...,Example Projects Build and Deploy an IMDB NLP ...
2,global_id@17179869186,e4443b6da5f362bda0c17d8d807f02b09101dc5d68d7b7...,{'description': 'Documentation website for cnv...,"Workspaces, Experiments and IDEs Run an Experi..."
3,global_id@17179869187,2f57da125db67ff3f1f08cc99bb7ccde9cc7d4d00b0a9f...,{'description': 'Documentation website for cnv...,Flows and Serving Processing your Dataset with...
4,global_id@17179869188,231cc3aab439c2431eece36e98e787cdded39f5ed2552a...,{'description': 'Documentation website for cnv...,Other Setup Slack Integration for Experiments ...
...,...,...,...,...
876,global_id@403726925875,f5ec39ffd11609e32239b918f86d8d84c71692546e46c1...,{'description': 'Documentation website for cnv...,AND/OR operators To run queries with and/or lo...
877,global_id@403726925876,00e3f9ded7c9a1a91e760679de138e56b2f0d012eb4d20...,{'description': 'Documentation website for cnv...,Saving queries Once you have searched using a ...
878,global_id@403726925877,2f94364a718ea68a3dc362bc486885ce19762492c8bd05...,{'description': 'Documentation website for cnv...,Collaborators Making collaboration simpler is ...
879,global_id@403726925878,07c3441c5dc9adaf1a6ebf6ff315fd08a13c7f32e0b8f6...,{'description': 'Documentation website for cnv...,Add a collaborator Complete the following step...
