In [None]:
!pip install sentence-transformers



In [None]:
!pip install lancedb vllm

Collecting lancedb
  Downloading lancedb-0.15.0-cp38-abi3-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting vllm
  Downloading vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (10 kB)
Collecting deprecation (from lancedb)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting pylance==0.19.1 (from lancedb)
  Downloading pylance-0.19.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.4 kB)
Collecting overrides>=0.7 (from lancedb)
  Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting transformers>=4.45.2 (from vllm)
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting uvicorn[standard] (from vllm)
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.0.0-py3-non

In [None]:
# Block 0: Mount Google Drive
# The model snapshot and the LanceDB data used by later cells are both stored
# under /content/drive/MyDrive, so Drive must be mounted before they run.
from google.colab import drive
# force_remount=True asks Colab to mount again even if Drive is already mounted
# in this session.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Block 1: Imports
import transformers
import re
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM
from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams
import torch
import json
import os
import shutil
import requests
import lancedb
import pandas as pd

In [None]:
# Hugging Face repository of the model and the Drive folder it is cached in.
model_name = "PleIAs/RAG-1B"
local_model_path = "/content/drive/MyDrive/RAG-1B"

# Download the snapshot only when the Drive folder does not exist yet;
# otherwise reuse whatever an earlier session stored there.
if os.path.exists(local_model_path):
    print(f"Model already exists at {local_model_path}")
else:
    print(f"Downloading {model_name} to {local_model_path}...")
    snapshot_download(
        repo_id=model_name,
        local_dir=local_model_path,
        # File patterns excluded from the download.
        ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.feather"],
    )
    print("Download complete!")


Model already exists at /content/drive/MyDrive/RAG-1B


In [None]:
# Database paths
# Location on the mounted Drive that lancedb.connect() is pointed at.
db_path = "/content/drive/MyDrive/rag_irene/lancedb_data"
# Name of the table opened from that database for retrieval.
table_name = "test"


In [None]:
# Model parameters
# Generation settings read by predict() when it builds its SamplingParams.
# Sampling temperature.
temperature = 0.7
# Upper bound on generated tokens per answer (passed to vLLM as max_tokens).
max_new_tokens = 3000
# Nucleus-sampling (top-p) threshold.
top_p = 0.95
# Penalty value predict() forwards to the sampler to discourage repeated tokens.
repetition_penalty = 1.2

In [None]:
# Initialize vLLM
# Builds the engine once from the snapshot stored at local_model_path; predict()
# reuses this module-level `llm` for every query.
llm = LLM(
    model=local_model_path,
    # Maximum sequence length the engine is configured for.
    max_model_len=8192,
    dtype="float16",  # Explicitly set float16 for T4 GPU compatibility
    gpu_memory_utilization=0.8  # Added to help with memory management
)

INFO 11-05 13:03:06 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='/content/drive/MyDrive/RAG-1B', speculative_config=None, tokenizer='/content/drive/MyDrive/RAG-1B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/content/drive/MyDrive/RAG-1B, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 11-05 13:03:09 model_runner.py:1056] Starting to load model /content/drive/MyDrive/RAG-1B...
INFO 11-05 13:03:09 selector.py:224] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 11-05 13:03:09 selector.py:115] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 11-05 13:03:49 model_runner.py:1067] Loading model weights took 2.3185 GB
INFO 11-05 13:03:51 gpu_executor.py:122] # GPU blocks: 15123, # CPU blocks: 8192
INFO 11-05 13:03:51 gpu_executor.py:126] Maximum concurrency for 8192 tokens per request: 29.54x
INFO 11-05 13:03:55 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-05 13:03:55 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-05 13:04:23 model_runner.py:1523] Graph capturing finished in 29 secs.


In [None]:
# Connect to the LanceDB database
# Opens the database stored at db_path on the mounted Drive.
db = lancedb.connect(db_path)
# Module-level table handle that hybrid_search() queries.
table = db.open_table(table_name)

In [None]:
# Block 4: Search Function
def hybrid_search(text, limit=4):
    """Run a hybrid search on the module-level ``table`` and format the hits.

    Args:
        text: Query string handed to ``table.search(..., query_type="hybrid")``.
        limit: Maximum number of rows to retrieve. Defaults to 4, the value
            that was previously hard-coded.

    Returns:
        A single string with one entry per retrieved row, entries separated by
        a blank line. Each entry is ``**<hash>**``, the row's ``section`` and
        the row's ``text`` on consecutive lines. Returns an empty string when
        the search yields no rows.
    """
    results = table.search(text, query_type="hybrid").limit(limit).to_pandas()

    # One "**hash**\nsection\ntext" entry per hit, in the order returned.
    documents = [
        f"**{row['hash']!s}**\n{row['section']}\n{row['text']}"
        for _, row in results.iterrows()
    ]

    return "\n\n".join(documents)

In [None]:
# Block 5: Reference Formatting Function
def _escape_html_attr(value):
    """Escape ``value`` so it can be placed inside a double-quoted HTML attribute."""
    return (
        value.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
    )


def format_references(text):
    """Replace ``<ref text="QUOTE">ID</ref>`` markers with numbered tooltip links.

    Each complete marker becomes
    ``<span class="tooltip" ...><a href="#ID">[n]</a></span>`` where ``n``
    counts references from 1 in order of appearance. Newlines inside the quoted
    text are collapsed to spaces, and both the quote and the id are escaped
    before being written into HTML attributes.

    Args:
        text: Text that may contain reference markers.

    Returns:
        ``text`` with every complete marker replaced. Text outside markers is
        passed through unchanged. If a marker is incomplete (no closing ``">``
        or no ``</ref>``, e.g. because generation was cut off), everything from
        that marker onward is kept verbatim instead of being dropped.
    """
    ref_start_marker = '<ref text="'
    ref_text_end_marker = '">'
    ref_end_marker = '</ref>'
    parts = []
    current_pos = 0
    ref_number = 1

    while True:
        start_pos = text.find(ref_start_marker, current_pos)
        if start_pos == -1:
            # No more markers: keep the remaining tail as is.
            parts.append(text[current_pos:])
            break

        parts.append(text[current_pos:start_pos])

        # Look for the end of the quoted text only after the opening marker, so
        # the marker's own trailing quote can never be taken for the closing '">'.
        text_start = start_pos + len(ref_start_marker)
        end_pos = text.find(ref_text_end_marker, text_start)
        ref_end_pos = text.find(ref_end_marker, end_pos) if end_pos != -1 else -1

        if ref_end_pos == -1:
            # Incomplete marker: preserve the raw remainder rather than losing it.
            parts.append(text[start_pos:])
            break

        ref_text = text[text_start:end_pos].replace('\n', ' ').strip()
        ref_id = text[end_pos + len(ref_text_end_marker):ref_end_pos].strip()

        ref_text_encoded = _escape_html_attr(ref_text)
        ref_id_encoded = _escape_html_attr(ref_id)
        tooltip_html = f'<span class="tooltip" data-refid="{ref_id_encoded}" data-text="{ref_id_encoded}: {ref_text_encoded}"><a href="#{ref_id_encoded}">[{ref_number}]</a></span>'

        parts.append(tooltip_html)
        current_pos = ref_end_pos + len(ref_end_marker)
        ref_number += 1

    return ''.join(parts)

In [None]:
# Block 6: Main Prediction Function
def predict(user_message):
    """Answer ``user_message`` with retrieval-augmented generation.

    Retrieves sources with ``hybrid_search``, builds the
    ``### Query ### / ### Source ### / ### Analysis ###`` prompt, generates a
    completion with the module-level ``llm`` and post-processes it with
    ``format_references``.

    Args:
        user_message: The user's question.

    Returns:
        dict with keys ``"query"`` (the input message), ``"sources"`` (the
        formatted source block placed in the prompt) and ``"response"`` (the
        generated text after reference formatting).
    """
    # Get relevant documents
    sources = hybrid_search(user_message)

    # Setup sampling parameters
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_new_tokens,
        # The notebook-level `repetition_penalty` setting is forwarded to the
        # sampler argument of the same name. It was previously passed as
        # `presence_penalty`, which is a different penalty with different
        # semantics in vLLM.
        repetition_penalty=repetition_penalty,
        # Generation is cut at this marker.
        stop=["#END#"]
    )

    # Create prompt
    prompt = f"""### Query ###\n{user_message}\n\n### Source ###\n{sources}\n\n### Analysis ###\n"""

    # Generate response: one prompt in, so take the first completion of the
    # first request output.
    outputs = llm.generate([prompt], sampling_params, use_tqdm=False)
    generated_text = outputs[0].outputs[0].text

    # Format response with references
    formatted_response = format_references(generated_text)

    return {
        "query": user_message,
        "sources": sources,
        "response": formatted_response
    }

In [None]:
!pip install tantivy

Collecting tantivy
  Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/4.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3.4/4.5 MB[0m [31m48.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.5/4.5 MB[0m [31m56.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tantivy
Successfully installed tantivy-0.22.0


In [None]:
# Block 7: Usage

# Run the full retrieve -> generate -> format pipeline on a sample question.
query = "What are the main types of risks associated with Large Language Models (LLMs)?"
result = predict(query)

# Print results: the question, the retrieved sources and the formatted answer,
# each preceded by a blank line.
print(
    f"\nQuery: {result['query']}\n"
    f"\nSources: {result['sources']}\n"
    f"\nAnalysis: {result['response']}"
)



Query: What are the main types of risks associated with Large Language Models (LLMs)?

Sources: **4e2d3c7a186d08a4**
I. INTRODUCTION
Large language models (LLMs) [1]–[5] that own mas-
sive model parameters pre-trained on extensive corpora, have
catalyzed a revolution in the fields of Natural Language
Processing (NLP). The scale-up of model parameters and
the expansion of pre-training corpora have endowed LLMs
with remarkable capabilities across various tasks, including
text generation [2], [4], [5], coding [2], [6], and knowledge
reasoning [7]–[10]. Furthermore, alignment techniques (e.g.,
supervised fine-tuning and reinforcement learning from human
feedback [4], [11]) are proposed to encourage LLMs to align
their behaviors with human preferences, thereby enhancing the
usability of LLMs. In practice, advanced LLM systems like
ChatGPT [12] have consistently garnered a global user base,
establishing themselves as competitive solutions for complex
NLP tasks. i
To mitigate the risks of LL