In [8]:
from getpass import getpass
import os
from dotenv import load_dotenv
import ipywidgets as widgets
from IPython.display import display, clear_output
import time

# Load environment variables from the local `.env` file before anything reads
# them. Change `dotenv_path` if the file lives outside the working directory.
load_dotenv(dotenv_path='.env')  # Default is '.env'

# NOTE(review): this import sits after load_dotenv(); presumably so that any
# Hugging Face settings defined in `.env` are already in the environment when
# the library is imported -- confirm before hoisting it into the import group.
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the HF_API_TOKEN environment
# variable; os.environ.get() yields None when the variable is not set.
login(token=os.environ.get("HF_API_TOKEN"))

In [9]:
import torch
# Ask PyTorch to release cached, currently unused GPU memory before the models
# defined in the following cells are loaded (helps when re-running the
# notebook inside a long-lived kernel).
torch.cuda.empty_cache()

In [10]:
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack.utils import ComponentDevice, Device

# Haystack device descriptor parsed from the string "cuda".
# NOTE(review): `device` is not referenced by any other cell visible here --
# confirm whether a component should receive it or whether it can be dropped.
device = ComponentDevice.from_str("cuda")

# Document store shared by the indexing pipeline (DocumentWriter) and the
# retriever created in embedding_doc(). It is constructed with default
# arguments. NOTE(review): presumably that means a non-persistent collection,
# so indexed documents are lost on kernel restart -- confirm.
document_store = ChromaDocumentStore()


In [11]:
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.components.builders.prompt_builder import PromptBuilder

from haystack_integrations.components.embedders.jina import JinaTextEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder

import torch
from transformers import BitsAndBytesConfig
from haystack.components.generators import HuggingFaceLocalGenerator

def load_model(retriever):
    """Build the RAG query pipeline and publish it as the global ``query_pipeline``.

    The pipeline embeds the question text, passes the embedding to
    ``retriever``, renders the retrieved documents into an instruction prompt
    and generates the answer with a locally loaded, 4-bit quantized
    ``mistralai/Mistral-7B-Instruct-v0.2`` model.

    Args:
        retriever: Haystack retriever component. It is registered under the
            name ``"retriever"``, receives the text embedder's ``embedding``
            output and must expose a ``documents`` output.

    Returns:
        The assembled pipeline. The same object is also stored in the
        module-level ``query_pipeline`` variable, which the UI cell reads.
    """
    global query_pipeline
    # Imported locally: the notebook only imports Pipeline in a later cell, so
    # this keeps the function independent of cell execution order.
    from haystack import Pipeline

    cache_dir = "embed_cache"

    generator = HuggingFaceLocalGenerator(
        "mistralai/Mistral-7B-Instruct-v0.2",
        huggingface_pipeline_kwargs={
            "device_map": "auto",
            # bitsandbytes NF4 quantization settings forwarded to the model loader.
            "model_kwargs": {
                "load_in_4bit": True,
                "bnb_4bit_use_double_quant": True,
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_compute_dtype": torch.bfloat16,
            },
        },
        generation_kwargs={
            "max_new_tokens": 400,  # Controls response length
            "do_sample": True,
            "temperature": 0.3,
            "top_p": 0.9,
        },
    )
    # Load the model weights now so the first question does not pay the cost.
    generator.warm_up()

    # Mistral instruct format is "<s>[INST] ... [/INST]"; the end-of-sequence
    # token "</s>" must not appear before the closing [/INST] tag.
    # NOTE(review): the tokenizer may already prepend "<s>" on its own --
    # confirm whether the literal "<s>" below yields a duplicated BOS token.
    # The template assumes every retrieved document has a `page_number` meta
    # field -- TODO confirm for all supported converters.
    prompt_template = """
    <s>[INST]Please answer the following question using only the information in the provided context.
    
    - If the answer is explicitly contained within the context, provide a complete response and clearly state the page number(s) referenced.
    - If the answer cannot be determined from the context, respond with: "The answer cannot be determined from the given context."
    
    Context:
    
    {% for doc in documents %}
    Page {{ doc.meta['page_number'] }}:
    {{ doc.content }}
    {% endfor %}
    
    Question: {{ question }}
    
    [/INST]
    """

    # Same embedding model as the document embedder used at indexing time, so
    # query vectors and document vectors live in the same space.
    text_embedder = FastembedTextEmbedder(
        model="jinaai/jina-embeddings-v2-base-en",
        cache_dir=cache_dir,
    )

    prompt_builder = PromptBuilder(template=prompt_template)

    query_pipeline = Pipeline()
    query_pipeline.add_component("text_embedder", text_embedder)
    query_pipeline.add_component(instance=prompt_builder, name="prompt_builder")
    query_pipeline.add_component("retriever", retriever)
    query_pipeline.add_component("generator", generator)

    # question -> embedding -> retrieved documents -> prompt -> answer
    query_pipeline.connect("text_embedder.embedding", "retriever")
    query_pipeline.connect("retriever.documents", "prompt_builder.documents")
    query_pipeline.connect("prompt_builder.prompt", "generator.prompt")

    return query_pipeline


In [12]:
from pathlib import Path
from haystack import Pipeline

from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import PyPDFToDocument,DOCXToDocument
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder,FastembedTextEmbedder
from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder,FastembedDocumentEmbedder


def embedding_doc(file_path):
    """Index one PDF or DOCX file into ``document_store`` and rebuild the query pipeline.

    The file is converted to documents, cleaned, split into 500-word chunks,
    embedded and written to the module-level ``document_store``. Afterwards
    ``load_model`` is called with a retriever bound to that store.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the file to index.
            Only ``.pdf`` and ``.docx`` extensions are accepted
            (case-insensitive).

    Returns:
        None.

    Raises:
        ValueError: If ``file_path`` is empty or has an unsupported extension.
    """
    # Validate first so an unusable path fails fast, before the embedding
    # model is loaded, and so `converter` can never be left unbound.
    if not file_path:
        raise ValueError("No file path provided. Please provide a PDF or Word document.")

    lowered_name = str(file_path).lower()
    if lowered_name.endswith(".doc"):
        # DOCXToDocument reads the .docx format only; a legacy binary .doc
        # file would not be converted.
        raise ValueError(
            "Legacy .doc files are not supported. Please convert the file to .docx or PDF."
        )
    if not lowered_name.endswith((".pdf", ".docx")):
        raise ValueError("Unsupported file type. Please provide a PDF or Word document.")

    converter = PyPDFToDocument() if lowered_name.endswith(".pdf") else DOCXToDocument()

    cache_dir = "embed_cache"
    document_embedder = FastembedDocumentEmbedder(
        model="jinaai/jina-embeddings-v2-base-en",
        cache_dir=cache_dir,
        threads=2,
    )
    document_embedder.warm_up()

    # remove repeated substrings to get rid of headers/footers
    cleaner = DocumentCleaner(remove_repeated_substrings=True)

    # Since jina-v2 can handle 8192 tokens, 500 words seems like a safe chunk size
    splitter = DocumentSplitter(split_by="word", split_length=500)

    # DuplicatePolicy.SKIP is optional but helps avoid errors if you want to re-run the pipeline
    writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)

    retriever = ChromaEmbeddingRetriever(document_store=document_store)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component(instance=converter, name="converter")
    indexing_pipeline.add_component(instance=cleaner, name="cleaner")
    indexing_pipeline.add_component(instance=splitter, name="splitter")
    indexing_pipeline.add_component(instance=document_embedder, name="embedder")
    indexing_pipeline.add_component(instance=writer, name="writer")

    # converter -> cleaner -> splitter -> embedder -> writer
    indexing_pipeline.connect("converter.documents", "cleaner.documents")
    indexing_pipeline.connect("cleaner.documents", "splitter.documents")
    indexing_pipeline.connect("splitter.documents", "embedder.documents")
    indexing_pipeline.connect("embedder.documents", "writer.documents")

    indexing_pipeline.run({"converter": {"sources": [Path(file_path)]}})

    # Build the query pipeline against the freshly populated store.
    load_model(retriever)

In [13]:
# Directory for uploaded files
save_directory = "llama_test/voila_uploaded_files"
os.makedirs(save_directory, exist_ok=True)

# Widgets
upload_widget = widgets.FileUpload(accept='', multiple=False)
# Dedicated output area for upload messages. The handler below refers to this
# name because a later cell rebinds the global `output_widget` for the Q&A UI,
# which would otherwise redirect upload messages into the wrong widget.
upload_output_widget = widgets.Output()
# Alias kept so code that still refers to `output_widget` keeps working.
output_widget = upload_output_widget

# Variables to store state
uploaded_file_path = None


# Step 1: File Upload Handler
def on_file_upload(change):
    """Save the uploaded file to ``save_directory`` and index it.

    Registered as an observer of ``upload_widget.value``. For every uploaded
    entry the content is written to disk, the resulting path is stored in the
    global ``uploaded_file_path`` and ``embedding_doc`` is called on it.

    Args:
        change: Change notification supplied by the widget. It is not
            inspected; the current ``upload_widget.value`` is read instead.
    """
    global uploaded_file_path
    with upload_output_widget:
        clear_output()
        # Assumes the ipywidgets 8 value format: a tuple of dicts exposing
        # 'name' and 'content' -- TODO confirm against the installed version.
        for file_info in upload_widget.value:
            # The name comes from the client: keep only its final component so
            # it cannot point outside save_directory ("../x", "C:\\x", ...).
            file_name = os.path.basename(file_info['name'].replace("\\", "/"))
            if file_name in ("", ".", ".."):
                print("The uploaded file has no usable name and was skipped.")
                continue
            content = file_info['content']  # File content as bytes

            # Save file to disk
            uploaded_file_path = os.path.join(save_directory, file_name)
            with open(uploaded_file_path, 'wb') as f:
                f.write(content)

            print(f"Uploaded file '{file_name}' has been saved at {uploaded_file_path}.")

            # Process the file (e.g., create embeddings)
            try:
                embedding_doc(uploaded_file_path)
            except ValueError as error:
                # Raised for unsupported file types: show the message instead
                # of a traceback and leave the previous pipeline untouched.
                print(f"Could not process '{file_name}': {error}")
                continue
            print("File Loaded, Model Ready for Use!")

# Link handler to widget
upload_widget.observe(on_file_upload, names='value')

# Display widgets
display(upload_widget, upload_output_widget)


FileUpload(value=(), description='Upload')

Output()

In [None]:
from IPython.display import display
import ipywidgets as widgets
from IPython.display import HTML
import logging
import sys


# Stylesheet for the Q&A panel. It is injected into the page further down with
# display(HTML(custom_css)); the rules take effect on widgets that carry the
# matching CSS class name (for example `custom-header` on the title element).
custom_css = """
<style>
    .custom-container {
        border: 2px solid #4CAF50;
        border-radius: 15px;
        padding: 20px;
        background-color: #f9f9f9;
        box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1);
    }
    .custom-header {
        font-family: 'Verdana';
        font-size: 20px;
        color: #4CAF50;
        text-align: center;
        margin-bottom: 15px;
    }
    .custom-input {
        margin-top: 10px;
    }
    .custom-output {
        padding: 10px;
        border-radius: 10px;
        margin-top: 15px;
        background-color: #ffffff;
    }
    .custom-button {
        font-family: 'Arial';
        font-size: 14px;
        background-color: #4CAF50;
        color: white;
        border: none;
        padding: 8px 16px;
        border-radius: 10px;
        cursor: pointer;
    }
    .custom-button:hover {
        background-color: #45a049;
    }
</style>
"""

# Building blocks of the Q&A panel: a text box for the question, an output
# area for the rendered answer, and the button that triggers the query.
question_widget = widgets.Text(
    placeholder="Type your question here...",
    description="Question:",
    layout=widgets.Layout(width="90%"),
)

output_widget = widgets.Output(layout=widgets.Layout(width="90%", padding="10px"))

submit_button = widgets.Button(
    description="Submit Question",
    layout=widgets.Layout(width="30%"),
    button_style="",
    style=dict(button_color="#4CAF50", font_weight="bold"),
)

# Function to handle question submission
def on_question_submit(button):
    """Run the typed question through ``query_pipeline`` and render the answer.

    Output is written to the global ``output_widget``. The question is read
    from ``question_widget``. All text inserted into the HTML (question, model
    answer, error message) is escaped, because it originates from the user or
    from the uploaded document.

    Args:
        button: The clicked button instance passed by ipywidgets (unused).
    """
    from html import escape  # stdlib; local import keeps the cell self-contained

    with output_widget:
        output_widget.clear_output()  # Clear previous output

        question = question_widget.value.strip()
        if not question:
            display(widgets.HTML("<p style='color: red;'>Please type a question first.</p>"))
            return

        # `query_pipeline` only exists after a document has been uploaded and
        # load_model() has run; look it up defensively instead of failing
        # with a NameError.
        pipeline = globals().get("query_pipeline")
        if pipeline is None:
            display(widgets.HTML(
                "<p style='color: red;'>Please upload a document before asking a question.</p>"
            ))
            return

        try:
            # Embed the question, retrieve context and generate the answer.
            result = pipeline.run(data={
                "text_embedder": {"text": question},
                "retriever": {"top_k": 20},
                "prompt_builder": {"question": question},
                "generator": {"generation_kwargs": {"max_new_tokens": 600}}
            })
            answer = result['generator']['replies'][0]

            # Escape before interpolating; keep the answer's line breaks visible.
            safe_question = escape(question)
            safe_answer = escape(answer).replace("\n", "<br>")

            # Display question and answer with custom formatting
            display(widgets.HTML(f"""
            <div style="font-family: 'Arial'; font-size: 18px; color: blue; margin-bottom: 10px;">
                <strong>Q:</strong> {safe_question}
            </div>
            <div style="font-family: 'Courier New'; font-size: 16px; color: darkgreen; margin-top: 5px;">
                <strong>A:</strong> {safe_answer}
            </div>
            """))
        except Exception as e:
            # UI boundary: show a readable message instead of a traceback.
            display(widgets.HTML(f"<p style='color: red;'>An error occurred: {escape(str(e))}</p>"))

# Attach the button click event
submit_button.on_click(on_question_submit)

# sys.stdout is deliberately left untouched: no visible cell redirects it, and
# replacing the kernel's stream with sys.__stdout__ can send print() output
# (for example the upload status messages) away from the notebook.

# Display the UI
display(HTML(custom_css))  # Inject custom styles
container = widgets.VBox(
    [
        widgets.HTML("<div class='custom-header'>Document Q&A System</div>"),
        question_widget,
        submit_button,
        output_widget
    ],
    layout=widgets.Layout(
        align_items="center",
        width="100%",
        padding="20px",
        border="2px solid #4CAF50"
    )
)
# Layout has no border_radius / box_shadow traits; the rounded corners and the
# shadow come from the `.custom-container` rule in custom_css instead.
container.add_class("custom-container")
display(container)


In [19]:
# questions = [
#     # "What constitutes force majeure or excused delay?",
#     # "Are there any stipulations or restrictions on the use of owner equipment?",
#     # "Are there any daily or weekly or other recurring meetings?",
#     # "Are there reporting requirements?",
#     # "What are the schedule dates by which the contractor must perform?",
#     # "What are the contractor’s warranty obligations?",
#     # "How long is the contractor’s warranty?",
#     # "When does the contractor have to provide notice?",
#     # "What type of insurance is the contractor obligated to have in place or obtain?",
#     # "When is the contractor allowed an adjustment or increase to the contract price or schedule?",
#     # "The project schedule to be delivered by the contractor.",
#     # "The type of warranty that the contractor provides for the work it is furnishing.",
#     # "The time period in which the contractor is to provide notice of a change to schedule or price.",
#     # "The information the contractor is to provide with a notice of delay.",
#     # "Is there more than one notice to be provided regarding a delay?",
#     # "The type of insurance the contractor is to provide.",
#     # "The method of notice that must be used by the contractor when sending notice to its customer.",
#     # "The required procedure to resolve a dispute under the contract.",
#     # "The safety responsibilities of the contractor for the site.",
#     "What are all the testing requirements, explain"
#     # "The safety responsibilities of the contractor for the site."
# ]

# for question in questions:
#     result = query_pipeline.run(data={
#         "text_embedder": {"text": question},
#         "retriever": {"top_k": 20},
#         "prompt_builder": {"question": question},
#         "generator": {"generation_kwargs": {"max_new_tokens": 600}}
#     })
    
#     print(f"Question: {question}")
#     print(result['generator']['replies'][0])
