<a href="https://colab.research.google.com/github/UdithWeerasinghe/IntelliScript_phase02_BIG/blob/main/EX2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# through ordinary filesystem paths by the cells below.
# This cell is Colab-specific: it relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Required Installation
%%capture
!pip install -q -U pandas matplotlib seaborn requests scikit-learn
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U trl
!pip install -q -U auto-gptq
!pip install -q -U optimum
!pip install -q -U einops
!pip install -q -U bitsandbytes
!pip install -q -U sentence-transformers
!pip install llama-index llama-parse

In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex
from llama_parse import LlamaParse
from llama_index.core.node_parser import MarkdownElementNodeParser
from IPython.display import Markdown
import nest_asyncio

In [None]:
# Hugging Face authentication
# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable when set, otherwise it is prompted for
# without echoing. Any token that was previously hard-coded in this cell must
# be considered leaked and should be revoked in the Hugging Face settings.
import os
from getpass import getpass

from huggingface_hub import login

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
from getpass import getpass

# Enable async code execution in Colab: nest_asyncio patches the already
# running notebook event loop so that it can be re-entered.
nest_asyncio.apply()

# Load the LLaMA 3 model
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
device = "cuda"

# 4-bit NF4 quantisation with float16 compute; device_map="auto" lets the
# loader decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
    ),
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# LlamaParse Setup
# SECURITY: the API key is no longer hard-coded. It is taken from the
# LLAMA_CLOUD_API_KEY environment variable when set, otherwise prompted for
# without echoing. A key that was previously committed in this cell must be
# considered leaked and should be rotated.
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY") or getpass("LlamaCloud API key: ")
parser = LlamaParse(
    api_key=llama_cloud_api_key,
    result_type="markdown",
)


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [None]:

# # HuggingFace Model Setup
# model_name = "meta-llama/Llama-2-7b-hf"  # Replace with your desired HuggingFace model

# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     torch_dtype=torch.float16,
#     quantization_config={"bits": 8},
# )
# tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:

# Function to Query HuggingFace Model
def query_huggingface_model(prompt, max_length=200):
    """
    Generates a sampled completion for ``prompt`` with the globally loaded model.

    Parameters:
        prompt (str): Text fed to the model.
        max_length (int): Maximum number of tokens to generate *in addition to*
            the prompt. It is forwarded as ``max_new_tokens``; passing it as
            ``max_length`` would count the prompt tokens too, so a long
            retrieval-augmented prompt would use up the whole budget before
            any answer could be produced.

    Returns:
        str: The decoded first returned sequence with special tokens removed.
        For a causal LM this is the prompt followed by the generated text.
    """
    # Place the inputs on the device the model was loaded onto instead of
    # relying on a separately hard-coded device string.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        num_return_sequences=1,
        do_sample=True,  # sampling: repeated calls may give different text
        temperature=0.7,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Recursive Retrieval Workflow
# Ensure LLMs are explicitly set to None
def _find_excel_files(data_folder):
    """Return the sorted paths of all Excel workbooks found below ``data_folder``."""
    excel_files = []
    for root, _, files in os.walk(data_folder):
        for file in files:
            # Match the extension case-insensitively (e.g. "DATA.XLSX") and
            # skip "~$..." files, which are Office lock files, not workbooks.
            if file.lower().endswith(('.xlsx', '.xls')) and not file.startswith('~$'):
                excel_files.append(os.path.join(root, file))
    # os.walk order depends on the filesystem; sort for reproducible runs.
    return sorted(excel_files)


def recursive_retrieval_workflow(data_folder):
    """
    Processes Excel files in the provided folder, parses them with LlamaParse,
    and builds a vector index over the resulting nodes.

    Parameters:
        data_folder (str): Path to the folder (searched recursively) that
            contains the Excel files.

    Returns:
        VectorStoreIndex: Index built from the parsed base nodes and objects.
        Note that this is the index itself, not a query engine.

    Raises:
        ValueError: If ``data_folder`` is not an existing directory or holds
            no Excel files.
    """
    # os.walk silently yields nothing for a missing path (e.g. Drive not
    # mounted), so report that case explicitly instead of "no files found".
    if not os.path.isdir(data_folder):
        raise ValueError(f"Data folder not found: {data_folder}")

    # Find Excel files in the directory
    excel_files = _find_excel_files(data_folder)

    if not excel_files:
        raise ValueError("No Excel files found in the directory.")

    # Parse documents with LlamaParse
    documents = []
    for file_path in excel_files:
        documents.extend(parser.load_data(file_path))

    # Parse the documents into nodes
    # NOTE(review): llm=None does not appear to disable LLM usage -- the
    # recorded run of this notebook ended in an OpenAI 401 error, which
    # suggests a default LLM/embedding model is still resolved. Confirm and
    # configure the intended models explicitly.
    node_parser = MarkdownElementNodeParser(llm=None, num_workers=4)
    nodes = node_parser.get_nodes_from_documents(documents)
    base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

    # Build Recursive Retrieval Index
    recursive_index = VectorStoreIndex(nodes=base_nodes + objects, llm=None)

    return recursive_index


# Query Execution
def execute_query(index, query):
    """
    Answers a query by retrieving context from the index and prompting the
    HuggingFace model with it.

    Parameters:
        index (VectorStoreIndex): Index to retrieve from.
        query (str): User query.

    Returns:
        response (str): Text returned by query_huggingface_model for the
        assembled context-plus-question prompt.
    """
    # Retrieve relevant nodes. A VectorStoreIndex exposes no query() method;
    # retrieval goes through a retriever created from the index.
    retriever = index.as_retriever(similarity_top_k=5)
    retrieved_nodes = retriever.retrieve(query)

    # Combine node contents and query the HuggingFace model
    context = "\n".join(node.get_content() for node in retrieved_nodes)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    return query_huggingface_model(prompt)



In [None]:
# Main Execution
# Build the retrieval index from the Excel files found under the Drive folder.
# NOTE: despite its plural "engines" name, `query_engines` holds the single
# object returned by recursive_retrieval_workflow.
data_folder = '/content/drive/MyDrive/CBSL-data'
query_engines = recursive_retrieval_workflow(data_folder)

Started parsing the file under job_id eab247da-0802-4649-954d-2ba7b8239f67
Started parsing the file under job_id 3d3f58bd-e745-4353-a6fe-a952de7a142f
Started parsing the file under job_id bbf723cb-d0eb-4ed7-a609-a728ae8996b1
Started parsing the file under job_id ef6c9d2e-db0d-48e3-903e-1ceb20356dbd
Started parsing the file under job_id 094acb62-bee5-44ea-b58f-49e390420530
Started parsing the file under job_id 743d29c1-cca1-43e6-b83c-39416986cfb8
Started parsing the file under job_id eeed6bd8-2c19-491e-9130-21cd74d0eac8
Started parsing the file under job_id 208c1bfb-2502-445d-8ca8-09736575dd0b
Started parsing the file under job_id 22d7c854-8d00-4986-b589-20bb15f4f718
Started parsing the file under job_id 91f7f1ba-0467-4c38-a338-52cb6d7c399f
Started parsing the file under job_id 8ce5adc4-77f3-4ce0-aa0d-6aa52edd54b6
Started parsing the file under job_id 2bc36938-69ce-4296-9f2b-5a9df59d1ad9
Started parsing the file under job_id 677d7c9c-9395-42e7-83e1-fa15465a12b3
Started parsing the file 

1it [00:00, 1486.81it/s]


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: b08022d0********************025d. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
# Example Query
# Run one question against the object built in the main-execution cell;
# `responses` receives whatever execute_query returns.
query = "Give date and value pairs for tea export data from 2007 to 2023?"
responses = execute_query(query_engines, query)

In [None]:
# Display Responses
# execute_query returns a single string, which has no .items(); iterating it
# as a mapping raised AttributeError. A mapping of engine name -> response is
# still rendered per engine, while a plain string is rendered directly.
if isinstance(responses, dict):
    for engine_name, response in responses.items():
        print(f"----------------------RESPONSE WITH {engine_name.upper()}----------------------")
        display(Markdown(f"{response}"))
else:
    display(Markdown(f"{responses}"))