# Libraries

In [1]:
%pip install llama-index
%pip install transformers
%pip install torch
%pip install llama-index-llms-groq
%pip install sentence-transformers
%pip install "llama-index-embeddings-huggingface"
%pip install kdbai-client
%pip install llama-index-vector-stores-kdbai
%pip install kdbai_client

Collecting llama-index
  Downloading llama_index-0.12.5-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.0-py3-none-any.whl.metadata (726 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.5 (from llama-index)
  Downloading llama_index_core-0.12.5-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.3-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_legacy-0.9.48.post4-py3-none-any.whl.metadata (8.5 kB)
Collecting 

In [2]:
import pandas as pd
from typing import List, Dict
from llama_index.core import VectorStoreIndex, ServiceContext, Document
from llama_index.core.node_parser import SentenceSplitter, MarkdownNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.core.llms import ChatMessage
import kdbai_client as kdbai

import time
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Data Loading

In [3]:
def load_data(csv_path: str, text_col: str, metadata_cols: List[str]) -> List[Document]:
  """
  Read a CSV file and wrap every row in a ``Document``.

  Args:
      csv_path: Path of the CSV file to read.
      text_col: Name of the single CSV column that holds the document text.
      metadata_cols: CSV column names to copy into each document's metadata.
          They are matched *by position* to the metadata keys
          ``document_id``, ``class``, ``issuing_authority``, ``title``,
          ``issue_date`` and ``reference_number``; passing fewer columns
          simply leaves the trailing keys out.

  Returns:
      One ``Document`` per CSV row, in file order.

  Raises:
      ValueError: If more metadata columns are given than there are
          metadata keys to map them to.
  """
  metadata_keys = ['document_id', 'class', 'issuing_authority', 'title', 'issue_date', 'reference_number']
  if len(metadata_cols) > len(metadata_keys):
      # Fail early with a clear message instead of an IndexError on the first row.
      raise ValueError(
          f"load_data supports at most {len(metadata_keys)} metadata columns "
          f"({metadata_keys}); got {len(metadata_cols)}: {list(metadata_cols)}"
      )
  # (metadata key, CSV column) pairs; built once because it is the same for every row.
  key_col_pairs = list(zip(metadata_keys, metadata_cols))

  df = pd.read_csv(csv_path)
  documents = []
  for _, row in df.iterrows():
      # NOTE(review): a missing text cell is stringified to "nan" here — confirm
      # whether such rows should be skipped instead.
      text = str(row[text_col])
      doc = Document(
          text=text,
          metadata={key: row[col] for key, col in key_col_pairs}
      )
      documents.append(doc)
  return documents

# Absolute path of the source CSV under /content/drive/MyDrive
# (presumably requires Google Drive to be mounted in Colab — confirm).
DATA_PATH = "/content/drive/MyDrive/Omdena/Regulatory RAG (SL Chapter)/code/model dev/data/2024_11_28 v0_LK_tea_dataset.csv"
# CSV column holding each document's text.
text_col = 'markdown_content'
# CSV columns copied into document metadata; load_data maps them by position to its own metadata key names.
metadata_cols = ['id', 'class', 'issuing_authority', 'llama_title', 'llama_issue_date', 'llama_reference_number']

all_documents = load_data(DATA_PATH, text_col, metadata_cols)
len(all_documents)

167

In [4]:
# Keep only the documents whose metadata class is 'circular'
circulars_docs = [
    candidate
    for candidate in all_documents
    if candidate.metadata['class'] == 'circular'
]
len(circulars_docs)

107

In [5]:
# Distinct issuing authorities found among the circulars
{circular.metadata['issuing_authority'] for circular in circulars_docs}

{'Tea Board', 'Tea Board Analytical Lab', 'Tea Research Institute'}

In [6]:
# Circulars issued by the Tea Research Institute only
tri_circulars_docs = [
    candidate
    for candidate in all_documents
    if candidate.metadata['class'] == 'circular'
    and candidate.metadata['issuing_authority'] == 'Tea Research Institute'
]
len(tri_circulars_docs)

50

Edge case: when a circular lists two issue dates, only the first one is used. TODO: confirm how this should be handled.<br>
For example:

```nodes[568].metadata['issue_date']```
> January 1996 and July 2000 (two dates available)

In [7]:
date_list = []

def convert_to_datetime64(docs):
  """
  Add a parsed ``issue_date_ts`` timestamp to every document's metadata.

  ``issue_date`` values are parsed with the format "%B %Y" (e.g.
  "February 2024"). When a value holds more than one date (e.g.
  "January 1996 and July 2000") only its first two whitespace-separated
  tokens are parsed. A missing date is passed to ``pd.to_datetime`` unchanged.

  Each parsed timestamp is also appended to the module-level ``date_list``.

  Args:
      docs: Documents to update in place; each must expose
          ``metadata['issue_date']``.

  Returns:
      The same ``docs`` object, with ``metadata['issue_date_ts']`` set on
      every element.
  """
  # Iterate over the argument, not the notebook-global list, so the function
  # works for whatever collection the caller passes in.
  for doc in tqdm(docs):
    doc_date = doc.metadata['issue_date']
    if not pd.isna(doc_date):
      # pick first date if multiple available
      doc_date = " ".join(str(doc_date).split()[0:2])
    doc.metadata['issue_date_ts'] = pd.to_datetime(doc_date, format="%B %Y")
    date_list.append(doc.metadata['issue_date_ts'])
  return docs

tri_circulars_docs = convert_to_datetime64(tri_circulars_docs)

  0%|          | 0/50 [00:00<?, ?it/s]

In [8]:
tri_circulars_docs[0].metadata['issue_date']

'February 2024'

In [9]:
tri_circulars_docs[0].metadata['issue_date_ts']

Timestamp('2024-02-01 00:00:00')

In [10]:
pd.Series(date_list).value_counts()

Unnamed: 0,count
2024-02-01,20
2003-09-01,4
2000-07-01,4
2003-03-01,3
2009-05-01,3
2013-06-01,2
1996-01-01,2
2002-10-01,1
2001-02-01,1
2011-01-01,1


# Chunking

In [11]:
# Parse the TRI circulars into nodes (chunks) with MarkdownNodeParser and count them
node_parser = MarkdownNodeParser()
nodes = node_parser.get_nodes_from_documents(tri_circulars_docs)
len(nodes)

725

In [12]:
# Whitespace-delimited word count of every node, summarised to inspect the chunk-size spread
chunk_word_counts = pd.Series(
    list(map(lambda chunk: len(chunk.text.split()), nodes))
)
chunk_word_counts.describe()

Unnamed: 0,0
count,725.0
mean,71.195862
std,77.791443
min,2.0
25%,17.0
50%,48.0
75%,96.0
max,833.0


# Embedding Model

In [13]:
def setup_embedding_model(
    model_name: str = 'BAAI/bge-small-en-v1.5',
    cache_folder: str = "/content/drive/MyDrive/Omdena/Regulatory RAG (SL Chapter)/code/model dev/cached_models/",
    trust_remote_code: bool = False,
):
    """
    Setup HuggingFace embedding model

    Args:
        model_name: Hugging Face model identifier to load.
        cache_folder: Directory passed to ``HuggingFaceEmbedding`` as its
            model cache location.
        trust_remote_code: Whether code shipped inside the model repository
            may be executed while loading. Off by default; enable it only
            for a model that requires it and whose repository you trust.

    Returns:
        The configured ``HuggingFaceEmbedding`` instance.
    """
    return HuggingFaceEmbedding(
        model_name=model_name,
        trust_remote_code=trust_remote_code,
        cache_folder=cache_folder,
    )

embed_model = setup_embedding_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Groq + KDBAI API Setup

In [14]:
from google.colab import userdata
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

In [15]:
def setup_groq_llm():
    """
    Create the Groq LLM client (model "llama-3.1-8b-instant", temperature 0.0).

    The API key is read from the module-level ``GROQ_API_KEY``.

    Raises:
        ValueError: If ``GROQ_API_KEY`` is empty or otherwise falsy.
    """
    key = GROQ_API_KEY
    if key:
        return Groq(api_key=key, model="llama-3.1-8b-instant", temperature=0.0)
    raise ValueError("Please set GROQ_API_KEY environment variable")

llm = setup_groq_llm()

# KDBAI API + Session Setup

In [16]:
KDBAI_API_KEY = userdata.get('KDBAI_API_KEY')
KDBAI_SESSION_ENDPOINT = userdata.get('KDBAI_SESSION_ENDPOINT')

In [17]:
def setup_kdbai_api():
  """
  Open a KDB.AI session from the module-level connection settings.

  ``KDBAI_SESSION_ENDPOINT`` is appended to "https://cloud.kdb.ai/instance/"
  to form the endpoint URL, and ``KDBAI_API_KEY`` is used as the API key.

  Raises:
      ValueError: If either setting is empty or otherwise falsy
          (the endpoint is checked first).
  """
  instance_id = KDBAI_SESSION_ENDPOINT
  if not instance_id:
    raise ValueError("Please set KDBAI_SESSION_ENDPOINT environment variable")

  secret = KDBAI_API_KEY
  if not secret:
    raise ValueError("Please set KDBAI_API_KEY environment variable")

  endpoint_url = f"https://cloud.kdb.ai/instance/{instance_id}"
  return kdbai.Session(endpoint=endpoint_url, api_key=f"{secret}")

session = setup_kdbai_api()

# KDBAI Vector Store Setup

## Session Database

In [18]:
session.databases()

[KDBAI database "default", KDBAI database "srilanka_tri_circulars"]

In [19]:
# Single source of truth for the database name, so the drop and the create
# below can never target different databases.
DATABASE_NAME = "srilanka_tri_circulars"

# Ensure no database with this name exists, so the cell can be re-run from scratch
try:
    session.database(DATABASE_NAME).drop()
except kdbai.KDBAIException:
    # Best-effort cleanup: presumably raised when the database does not exist yet.
    pass

# Create the database
db = session.create_database(DATABASE_NAME)
session.databases()

[KDBAI database "default", KDBAI database "srilanka_tri_circulars"]

## Table Schema + Creation

In [20]:
# List all of the tables in the db
db.tables

[]

In [21]:
# Name of the table and the layout of its columns
table_name = "rag_baseline"

table_schema = [
    {"name": column, "type": dtype}
    for column, dtype in (
        ("document_id", "bytes"),
        ("text", "bytes"),
        ("embeddings", "float32s"),
        ("issue_date_ts", "datetime64[ns]"),
    )
]

# Flat index over the "embeddings" column.
# For the similarity metric, choose from Euclidean Distance (L2),
# Dot Product (IP), or Cosine Similarity (CS).
indexFlat = dict(
    name="flat_index",
    type="flat",
    column="embeddings",
    params=dict(dims=384, metric='CS'),
)

In [22]:
# First ensure the table does not already exist, so the cell can be re-run.
# `table_name` is used for both the drop and the create so they always
# target the same table.
try:
    db.table(table_name).drop()
except kdbai.KDBAIException:
    # Best-effort cleanup: presumably raised when the table does not exist yet.
    pass

# Create table
table = db.create_table(table_name, table_schema, indexes=[indexFlat])
db.tables

[KDBAI table "rag_baseline"]

In [23]:
table.indexes

[{'name': 'flat_index',
  'type': 'flat',
  'column': 'embeddings',
  'params': {'metric': 'CS', 'dims': 384}}]

## Insert Data into Tables

In [24]:
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from llama_index.core import StorageContext, Settings
from llama_index.core.indices import VectorStoreIndex

In [25]:
Settings.llm = llm
Settings.embed_model = embed_model

In [26]:
%%time

# Vector Store: wrap the KDB.AI table in a llama-index vector store.
# NOTE(review): the table's index was created with the name "flat_index", while
# "circular_baseline_index" is passed here — confirm this keyword is actually used.
vector_store = KDBAIVectorStore(
    table=table,
    index_name="circular_baseline_index"
    )

# Build the index on top of the KDB.AI-backed storage context; the documents are
# split with MarkdownNodeParser before being indexed.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    tri_circulars_docs,
    storage_context=storage_context,
    transformations=[MarkdownNodeParser()]
)

CPU times: user 4min 15s, sys: 8.51 s, total: 4min 23s
Wall time: 4min 31s


In [27]:
table.query()

Unnamed: 0,document_id,text,embeddings,issue_date_ts
0,b'd29a0ec5-9c75-4ec4-8054-20abab376233',b'# ADVISORY CIRCULAR',"[-0.007562597, -0.026492892, 0.03072106, 0.018...",2024-02-01
1,b'4113e76a-d680-45f3-9365-e22a81df5743',b'# No.DM JHL 925VynvT\r\n\r\nIssued in: Febru...,"[-0.0121435, -0.023042029, 0.02277239, 0.01008...",2024-02-01
2,b'3273148b-968b-49a2-b7c7-7b8f5eb2a874',b'# PROTECTION OF TEA FROM BLISTER BLIGHT\r\n\...,"[-0.009545566, -0.013665088, 0.028926225, 0.02...",2024-02-01
3,b'490ebb5c-b664-446a-869d-447516b38965',b'# 1. Introduction\r\n\r\nBlister blight dise...,"[0.01621599, -0.022643894, 0.051311307, 0.0504...",2024-02-01
4,b'47d43a34-7ba3-40d1-8ebc-3c975d6ab4de',b'# 2. Disease Management\r\n\r\nIntegrated di...,"[-0.011557567, -0.017213918, 0.047385782, 0.03...",2024-02-01
...,...,...,...,...
720,b'115946be-f7ec-4ce3-9938-08324b29e36e',b'# 3.4 Cultural Ecological Weed Control Metho...,"[0.03683722, 0.023794655, 0.018897794, 0.05579...",2024-02-01
721,b'301a5e1d-4585-4dd8-898e-c5ebc75d7125',b'# 3.5 Manual Weeding\r\n\r\nManual weeding c...,"[-0.008825304, -0.06506193, 0.017658412, 0.038...",2024-02-01
722,b'41c86f16-a0ec-4b54-a5bc-a01e700cfcf5',b'# 3.6 Mechanical Weeding\r\n\r\nSlash weedin...,"[-0.0036664123, -0.043851368, 0.032723363, 0.0...",2024-02-01
723,b'1f8da13f-f6c5-4b89-a0f8-5faa13d661dc',b'# 3.7 Chemical Weed Control\r\n\r\nChemical ...,"[0.021501746, -0.0641808, 0.016451132, 0.04455...",2024-02-01


## Setting up Query Engine

In [29]:
%%time

# Number of most-similar nodes retrieved per query.
# With llama-3.1-8b-instant, the 128k-token context size can take about 100 pages.
K = 15

# Only the table index to search is specified; no metadata filter or sort is applied.
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=K,
    vector_store_kwargs=dict(index="flat_index"),
)

CPU times: user 227 ms, sys: 2.95 ms, total: 230 ms
Wall time: 291 ms


## Querying Vector Store with Questions

In [36]:
%%time

input_query = "What are the basic requirements needed to be fulfilled to be a Tea Exporter?"

result = query_engine.query(input_query)
print(result.response)

To be a successful Tea Exporter, one must have a thorough understanding of the global tea market, including the various types of tea, their production processes, and the different regions that produce high-quality tea. 

They should also be familiar with the export regulations and requirements of the countries they plan to export to, including any necessary certifications, licenses, and documentation.

In addition, a Tea Exporter should have strong relationships with tea producers, suppliers, and other industry stakeholders to ensure a consistent and high-quality supply of tea.

They should also be knowledgeable about the packaging, storage, and transportation of tea to ensure that it is handled and shipped safely and efficiently.

Furthermore, a Tea Exporter should have a strong marketing and sales strategy to effectively promote and sell their tea products to customers in different markets.

Lastly, they should be committed to maintaining high standards of quality, sustainability, an

In [31]:
%%time

input_query = "List some things to keep in mind while working with tea export business outside Sri Lanka."

result = query_engine.query(input_query)
print(result.response)

**Rewrite**

When engaging in a tea export business outside Sri Lanka, several factors should be taken into consideration to ensure a successful venture, especially given the recent past and current challenges in the industry. Here are some key points to keep in mind:

1. **Market research and understanding**: Familiarize yourself with the local market trends, consumer preferences, and regulations in the target countries, taking into account the impact of recent weather patterns and drought conditions on tea production.

2. **Compliance with international standards**: Ensure that your tea products meet the required standards and regulations of the importing countries, such as food safety and quality control measures, considering the potential effects of drought on tea quality.

3. **Certifications and labeling**: Obtain necessary certifications, such as Fairtrade, Organic, or Rainforest Alliance, and ensure accurate labeling to meet local requirements and consumer expectations, while a

In [35]:
len(result.source_nodes)

15

In [37]:
%%time

input_query = "What is the leavy amount that exporters should pay for each Kilo of tea being exported?"

result = query_engine.query(input_query)
print(result.response)

Unfortunately, the provided context information does not mention anything about the levies or charges that exporters should pay for each kilo of tea being exported. It appears to be focused on the fertilizers, soil, and plant nutrition for tea plantations in Sri Lanka. Therefore, I cannot provide a specific answer to your query based on the given context.
CPU times: user 120 ms, sys: 203 µs, total: 120 ms
Wall time: 1.1 s


In [38]:
%%time

input_query = "What are the 5 technical topics that are within the TRI circulars?"

result = query_engine.query(input_query)
print(result.response)

Based on the provided information, the 5 technical topics that are within the TRI circulars are:

1. Soil Analysis
2. Architecture (ARCH)
3. Tea Cultivation and Soil Forking
4. Establishment of Medium Shade
5. Compositions of Regional Specific Fertilizer Mixtures
CPU times: user 158 ms, sys: 936 µs, total: 159 ms
Wall time: 861 ms


In [39]:
%%time

input_query = "In terms of tea regulation, what is medium shade? What are the duties of a regulatory officer with respect to that?"

result = query_engine.query(input_query)
print(result.response)

In the context of tea cultivation, medium shade refers to a level of shade that is not too intense, allowing for some sunlight to reach the tea plants. It is typically achieved by planting shade trees at a certain distance from the tea plants, allowing them to provide partial shade.

As for the duties of a regulatory officer with respect to medium shade, their primary responsibility would be to ensure that tea plantations are adhering to the recommended shade management practices. This may involve:

* Conducting regular inspections to monitor the shade levels and ensure that they are within the recommended range
* Providing guidance and training to tea plantation owners and managers on the importance of medium shade and how to achieve it
* Enforcing regulations and standards related to shade management, such as ensuring that shade trees are planted at the correct distance from tea plants
* Collaborating with other stakeholders, such as researchers and extension agents, to stay up-to-da

Delete tables after use

In [40]:
table.drop()