### Welcome to the Vectonic Evaluation !

here, we're evaluating the various configurations of the `Unstructured.io` `Vectara-Cli` and `Together.ai` pipeline using `tonicai` to see how the available enhancements improve retrieval. Although we're using the vectara-cli these advanced RAG techniques are transferable to other models and other techniques.

In [8]:
import subprocess
import sys

# Function to handle package installations
def install(packages):
    for package in packages.split():
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Preemptively install essential packages
install("spacy ipywidgets")

# Install vector retrieval and evaluation frameworks
packages = "tonic-validate vectara-cli vectara-cli[rebel_span] llama-index-core llama-index-readers-file " \
           "llama-index-llms-together llama-parse python-magic-bin==0.4.14"
install(packages)

# Function to safely attempt imports and install if packages are missing
def try_import(module_name, from_list):
    try:
        module = __import__(module_name, fromlist=from_list)
        return module
    except ImportError:
        install(module_name)
        module = __import__(module_name, fromlist=from_list)
        return module

# Importing and initializing Vectara CLI modules safely
vectara_cli = try_import("vectara_cli", ["core", "rebel_span.noncommercial.nerdspan", "rebel_span.commercial.enterprise"])
VectaraClient = getattr(vectara_cli.core, "VectaraClient")
Span = getattr(vectara_cli.rebel_span.noncommercial.nerdspan, "Span")
EnterpriseSpan = getattr(vectara_cli.rebel_span.commercial.enterprise, "EnterpriseSpan")

try:
    import llama_index
    from llama_index.core import SimpleDirectoryReader
    from llama_index.readers.file import UnstructuredReader
    from llama_index.llms.together import TogetherLLM
except ImportError:
    install("llama-index-core")
    install("llama-index-readers-file")
    install("llama-index-llms-together")
    from llama_index.core import SimpleDirectoryReader
    from llama_index.readers.file import UnstructuredReader
    from llama_index.llms.together import TogetherLLM
# Importing validation and benchmarking tools from tonic-validate
tonic_validate = try_import("tonic_validate", ["ValidateScorer", "Benchmark"])
ValidateScorer = getattr(tonic_validate, "ValidateScorer")
Benchmark = getattr(tonic_validate, "Benchmark")

# Note to check if all modules are imported correctly
print("All modules have been successfully imported and are ready for use.")

All modules have been successfully imported and are ready for use.


In [19]:
import subprocess
import sys

# Function to handle package installations
def install(packages):
    for package in packages.split():
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

# Installing all required dependencies as per the project requirements
install(
    'spacy ipywidgets vectara-cli vectara-cli[span_marker] llama-index-core ' \
    'llama-index-readers-file llama-index-llms-together' \
    # 'llama-index-readers-file-epub llama-index-readers-file-flat llama-index-readers-file-html ' \
    # 'llama-index-readers-file-image llama-index-readers-file-image-caption ' \
    # 'llama-index-readers-file-image-deplot llama-index-readers-file-image-vision-llm ' \
    # 'llama-index-readers-file-ipynb llama-index-readers-file-markdown llama-index-readers-file-mbo ' \
    # 'llama-index-readers-file-paged_csv llama-index-readers-file-pymu_pdf ' \
    # 'llama-index-readers-file-slides llama-index-readers-file-tabular ' \
    # 'llama-index-readers-file-unstructured llama-index-readers-file-xml llama-index-readers-file-rtf ' \
    ' python-magic-bin==0.4.14 gradio'
)

# Importing libraries after installation to check if all are correctly installed
try:
    import spacy, ipywidgets, vectara_cli, llama_index, gradio
    from vectara_cli.core import VectaraClient
    from vectara_cli.span_marker.noncommercial.nerdspan import Span
    from vectara_cli.span_marker.commercial.enterprise import EnterpriseSpan
    # from llama_index import core as llama_index_core, readers, llms
    print('All modules have been successfully imported and are ready for use.')
except Exception as e:
    print('There was an error importing the modules:', str(e))


2024-04-18 15:44:27,133 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-04-18 15:44:27,144 - DEBUG - load_verify_locations cafile='C:\\Users\\MeMyself\\AppData\\Roaming\\Python\\Python312\\site-packages\\certifi\\cacert.pem'
2024-04-18 15:44:27,171 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-04-18 15:44:27,173 - DEBUG - load_verify_locations cafile='C:\\Users\\MeMyself\\AppData\\Roaming\\Python\\Python312\\site-packages\\certifi\\cacert.pem'
2024-04-18 15:44:27,629 - DEBUG - connect_tcp.started host='api.gradio.app' port=443 local_address=None timeout=3 socket_options=None
2024-04-18 15:44:27,785 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-04-18 15:44:27,787 - DEBUG - load_verify_locations cafile='C:\\Users\\MeMyself\\AppData\\Roaming\\Python\\Python312\\site-packages\\certifi\\cacert.pem'
2024-04-18 15:44:27,828 - DEBUG - connect_tcp.complete return_value=<httpcore._backends.sync

There was an error importing the modules: No module named 'vectara_cli.span_marker'


### Import the Data Loader


In [21]:
import requests

# Correct URL to the raw content of the Python file on GitHub
url = 'https://raw.githubusercontent.com/TeamTonic/Vectonic/devbranch/src/dataloader.py?token=GHSAT0AAAAAACKMIGKSE4A5FEDT6UYXGCC6ZRBFM6A'

# Make a GET request to fetch the raw content of the Python file
response = requests.get(url)
if response.status_code == 200:
    content = response.text
    # Save the Python code to a local file
    with open('dataloader.py', 'w') as file:
        file.write(content)
    # After saving it locally, import the necessary classes
    from dataloader import DataProcessor, DocumentLoader

    # Initialize and use the classes
    data_processor = DataProcessor()
    document_loader = DocumentLoader()
    print("Classes DataProcessor and DocumentLoader have been successfully imported and are ready for use.")
else:
    raise Exception(f"Failed to download the file: status code {response.status_code}")

2024-04-18 15:44:53,590 - DEBUG - Starting new HTTPS connection (1): raw.githubusercontent.com:443
2024-04-18 15:44:54,027 - DEBUG - https://raw.githubusercontent.com:443 "GET /TeamTonic/Vectonic/devbranch/src/dataloader.py?token=GHSAT0AAAAAACKMIGKSE4A5FEDT6UYXGCC6ZRBFM6A HTTP/1.1" 200 2146


ModuleNotFoundError: No module named 'llama_index.readers.web'

In [25]:
from dataloader import DataProcessor , DocumentLoader
data_processor = DataProcessor()
document_loader = DocumentLoader()

ModuleNotFoundError: No module named 'llama_index.readers.web'

### Let's Make a Simple RAG Pipeline using the enhanced and non enhanced texts !