In [1]:
! pip install -q google-cloud-discoveryengine
! pip install -q -U google-cloud-aiplatform
! pip install -q langchain-core
! pip install -q langchain
! pip install -q tiktoken
! pip install -q faiss-cpu
!pip install -q ragxplorer

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.12.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.
tensorflow 2.12.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.9.0 which is incompatible.
google-cloud-pubsublite 1.8.3 requires overrides<7.0.0,>=6.0.1, but you have overrides 7.7.0 which is incompatible.
kfp 2.4.0 requires kubernetes<27,>=8.0.0, but you have kubernetes 29.0.0 which is incompatible.[0m[31m
[0m

In [2]:
# Restart the kernel once the installs above have finished, so the new
# packages can be imported by the cells that follow.
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

In [1]:
import sys

# Colab detection: the `google.colab` package is only present in sys.modules
# when running on Colab. In that case, call Colab's authenticate_user();
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth as google_auth
    google_auth.authenticate_user()

In [2]:
from langchain.chat_models.vertexai import ChatVertexAI
from langchain.vectorstores import MatchingEngine
from langchain.embeddings import VertexAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, PromptTemplate, FewShotChatMessagePromptTemplate, ChatMessagePromptTemplate
from langchain.llms import VertexAI
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.schema import format_document
from langchain.text_splitter import CharacterTextSplitter


import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
from typing import List
from pydantic import BaseModel
import matplotlib.pyplot as plt
import time


# Project, region and bucket handed to MatchingEngine.from_components below.
PROJECT_ID = "engaged-domain-403109"
REGION = "asia-southeast1"
GCS_BUCKET = "engaged-domain-403109-me-bucket"

# Resource names of the Matching Engine index and of its index endpoint.
ME_INDEX_ID = "projects/510519063638/locations/asia-southeast1/indexes/4693366538231611392"
ME_ENDPOINT_ID = "projects/510519063638/locations/asia-southeast1/indexEndpoints/3617586769429528576"

# Presumably the CSV of example queries for adapter training (not referenced in
# the cells shown here) - TODO confirm.
QUERY_EXAMPLES_FILENAME = "data/embedding_adaptor_training/query_examples.csv"

In [3]:
# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    """Return a generator that paces its caller to ``max_per_minute`` steps per minute.

    Call ``next()`` on the returned generator once after each unit of work.
    The first ``next()`` prints "Waiting" and starts the clock. Every later
    ``next()`` sleeps for whatever remains of the ``60 / max_per_minute``
    second period since the previous ``next()`` returned, printing a "."
    each time it has to sleep.

    Args:
        max_per_minute: Maximum number of steps allowed per minute; must be > 0.

    Returns:
        A generator that yields ``None`` forever.

    Raises:
        ValueError: If ``max_per_minute`` is not positive. This is raised when
            ``rate_limit`` is called, not on the first ``next()``.
    """
    # Validate up front: zero would divide by zero, and a negative rate would
    # give a negative period that silently disables the limiter.
    if max_per_minute <= 0:
        raise ValueError(
            f"max_per_minute must be positive, got {max_per_minute!r}"
        )
    period = 60 / max_per_minute

    def _pace():
        # Generator body; kept separate so the validation above runs eagerly.
        print("Waiting")
        while True:
            # Monotonic clock: elapsed time must not be affected by wall-clock
            # adjustments (NTP, DST, manual changes).
            before = time.monotonic()
            yield
            elapsed = time.monotonic() - before
            sleep_time = max(0, period - elapsed)
            if sleep_time > 0:
                print(".", end="")
                time.sleep(sleep_time)

    return _pace()

class CustomVertexAIEmbeddings(VertexAIEmbeddings, BaseModel):
    """VertexAIEmbeddings whose ``embed_documents`` sends batched, rate-limited requests."""

    # Maximum number of embedding requests per minute (passed to rate_limit).
    requests_per_minute: int
    # Number of texts sent in a single get_embeddings call.
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        """Embed ``texts`` in batches, pacing the requests with ``rate_limit``.

        Args:
            texts: The documents to embed.

        Returns:
            One entry per input text, in input order: the ``values`` attribute
            of each result returned by ``self.client.get_embeddings``.

        Raises:
            ValueError: If ``num_instances_per_batch`` is smaller than 1. Such
                a batch size previously made the batching loop run forever.
        """
        batch_size = self.num_instances_per_batch
        if batch_size < 1:
            raise ValueError(
                f"num_instances_per_batch must be >= 1, got {batch_size!r}"
            )

        limiter = rate_limit(self.requests_per_minute)
        docs = list(texts)
        results = []

        # Working in batches because the API accepts maximum 5
        # documents per request to get embeddings
        for start in range(0, len(docs), batch_size):
            batch = docs[start : start + batch_size]
            results.extend(self.client.get_embeddings(batch))
            # Advance the limiter after every request; from the second call on
            # it sleeps as needed to respect requests_per_minute.
            next(limiter)

        return [r.values for r in results]

In [4]:
class CustomVertexAIEmbeddings_v2(VertexAIEmbeddings, BaseModel):
    """VertexAIEmbeddings whose query embeddings are multiplied by ``best_matrix``."""

    # Overriding embed_query method
    def embed_query(self, text: str):
        """Embed ``text`` and left-multiply the embedding by ``best_matrix``.

        NOTE(review): ``best_matrix`` is read from the notebook's global
        namespace and is not defined in any cell visible here. It must exist
        before this method is called - TODO confirm which cell creates it.

        Args:
            text: The query string to embed.

        Returns:
            The product ``best_matrix @ embedding`` converted with ``tolist()``.
        """
        raw_vector = self.client.get_embeddings([text])[0].values
        query_array = np.array(raw_vector)
        return np.matmul(best_matrix, query_array.T).tolist()

# Embeddings API integrated with langChain.
# The region is passed to the constructor (as the plain VertexAIEmbeddings cell
# does) instead of being assigned to `.location` afterwards. The client is
# created during construction, so a later attribute assignment presumably only
# changes the displayed field and not the region the client was initialised with.
embeddings_v2 = CustomVertexAIEmbeddings_v2(location=REGION)

# Show embeddings_v2 config
embeddings_v2

2024-02-05 02:53:09.742731: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


CustomVertexAIEmbeddings_v2(project=None, location='asia-southeast1', request_parallelism=5, max_retries=6, stop=None, model_name='textembedding-gecko', client=<vertexai.preview.language_models._PreviewTextEmbeddingModel object at 0x7fce5fa56c20>, temperature=0.0, max_output_tokens=128, top_p=0.95, top_k=40, credentials=None, n=1, streaming=False)

In [5]:
# Settings for the batched, rate-limited document embedder.
EMBEDDING_QPM = 100  # forwarded as requests_per_minute
EMBEDDING_NUM_BATCH = 5  # forwarded as num_instances_per_batch

embeddings = CustomVertexAIEmbeddings(
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
    requests_per_minute=EMBEDDING_QPM,
)

# Display the resulting configuration as the cell output.
embeddings

CustomVertexAIEmbeddings(project=None, location='us-central1', request_parallelism=5, max_retries=6, stop=None, model_name='textembedding-gecko', client=<vertexai.preview.language_models._PreviewTextEmbeddingModel object at 0x7fce6167f3d0>, temperature=0.0, max_output_tokens=128, top_p=0.95, top_k=40, credentials=None, n=1, streaming=False, requests_per_minute=100, num_instances_per_batch=5)

In [7]:
# Number of documents the retriever returns per query.
NUMBER_OF_RESULTS = 4

# Matching Engine vector store whose queries are embedded with embeddings_v2.
me_v2 = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=REGION,
    gcs_bucket_name=GCS_BUCKET,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_ENDPOINT_ID,
    embedding=embeddings_v2,
)

# Expose the vector store as a similarity-search retriever.
retriever_v2 = me_v2.as_retriever(
    search_type="similarity",
    search_kwargs={"k": NUMBER_OF_RESULTS},
)

In [8]:
# Sample query against retriever_v2; the returned documents are the cell output.
retriever_v2.get_relevant_documents("what is sped")

[Document(page_content='For parents whose child’s doctor has initiated a referral for enrolment in EIPIC, this EIPIC information pack contains some useful materials on early intervention and EIPIC, to guide you through the initial phase of your child’s enrolment.\n\nAs a new caregiver, you may have many questions on early intervention. To help address some of these questions, a Step One programme for new caregivers is also available. For more information and to sign up, click here.\n\nAs EIPIC is not a kindergarten or child care, caregivers may also want to consider pre-school or childcare arrangements. More information can be found on our Education page.\n\nList of EIPIC centres\n\nParent’s Guide: Introduction to Early Intervention\n\nParent’s Guide: Navigating EIPIC Application and Community Resources\n\nInclusive Support Programme (InSP) Pilot'),
 Document(page_content="Inclusive Support Programme (InSP) Pilot\n\nIn 2021, ECDA introduced a pilot programme in 7 selected childcare cen

In [9]:
# Baseline retriever: the same index and endpoint as me_v2, but queried with
# the stock VertexAIEmbeddings model instead of embeddings_v2.
# NOTE(review): this rebinds `embeddings`, replacing the
# CustomVertexAIEmbeddings instance created in an earlier cell.
embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@001", location=REGION)

# Number of documents the retriever returns per query.
NUMBER_OF_RESULTS = 4

me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=REGION,
    gcs_bucket_name=GCS_BUCKET,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_ENDPOINT_ID,
    embedding=embeddings,
)

retriever = me.as_retriever(
    search_type="similarity",
    search_kwargs={"k": NUMBER_OF_RESULTS},
)

# Same sample query as for retriever_v2; the results are the cell output.
retriever.get_relevant_documents("what is sped")

[Document(page_content='\uf0da Special Education Needs (SEN) Fund\n\n\uf0da Enhancement for Active Seniors (EASE)\n\nFinancial Planning and Others\n\n\uf0da Trusts\n\n\uf0da Insurance\n\n\uf0da Others\n\nSite Map\n\nTerms Of Use\n\nPrivacy Policy\n\neAdmin Singpass Login (for Individual users)\n\neAdmin Singpass Login (for Business users)\n\nSSNet – Enabling Services (for Business users)\n\nRate Our Website\n\n© 2023 Enabling Guide. All Rights Reserved.\n\nPowered by'),
 Document(page_content='3\n\nhttps://www.enablingguide.sg/docs/default-source/publications/education-pathways-for-children-with-sen-entering-sped-schools.pdf?sfvrsn=ea3b2a7e_4\n\n4\n\nhttps://www.moe.gov.sg/news/parliamentary-replies/20190211-allied-educators\n\n5\n\nhttps://www.moe.gov.sg/-/media/\x00les/special-education/parents-guide-children-special-educational-needs.ashx\n\n6\n\nhttps://www.moe.gov.sg/-/media/\x00les/special-education/parents-guide-children-special-educational-needs.ashx\n\n7\n\nhttps://www.moe.gov