In [1]:
# Scratch assignment; `x` is not referenced by any other cell visible here.
x = 1 + 2

In [4]:
!pip install pypdf2

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [5]:
from google.cloud import storage
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform

import PyPDF2

import re
import os
import random
import json
import uuid

In [20]:
# Google Cloud project and region handed to aiplatform.init.
project = "robust-habitat-439810-p4"
location = "us-central1"
# Model name handed to TextEmbeddingModel.from_pretrained.
MODEL_NAME = "text-embedding-preview-0815"

In [11]:
# Input document whose sentences get embedded.
pdf_path = "lakeside_handbook.pdf"
# Cloud Storage bucket that holds the embeddings for the vector index.
bucket_name = "sukrit_bucket_1"
# Local JSON-lines outputs: one {"id", "embedding"} / {"id", "sentence"} record per line.
embed_file_path = "lakeside_embeddings.json"
sentence_file_path = "lakeside_sentences.json"
# Display name for the index and endpoint; also used as the deployed index id.
index_name = "lakeside_index"

In [42]:
def extract_sentences_from_pdf(pdf_path):
    """Read a PDF and split its text into sentence-like chunks.

    The text of every page is concatenated (each page followed by a single
    space) and the result is split on the literal separator ``'. '``. Each
    chunk is stripped of surrounding whitespace and empty chunks are dropped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: The non-empty, stripped chunks in document order. The
        ``'. '`` separator is consumed by the split, so it is not part of
        the returned chunks.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Extract once per page and reuse the result; calling
            # extract_text() twice repeats the extraction work.
            page_text = page.extract_text()
            if page_text is not None:
                # The trailing space lets a sentence that ends a page still
                # be terminated by the '. ' separator.
                page_texts.append(page_text + " ")
    text = "".join(page_texts)
    stripped_chunks = (chunk.strip() for chunk in text.split('. '))
    return [chunk for chunk in stripped_chunks if chunk]

def generate_text_embeddings(sentences, batch_size=250) -> list:
    """Embed sentences with the Vertex AI text embedding model ``MODEL_NAME``.

    The sentences are sent to the model in consecutive batches instead of one
    single request, because the embeddings API caps the number of input texts
    per request.

    Args:
        sentences: Sequence of strings to embed.
        batch_size: Maximum number of texts sent per ``get_embeddings`` call.
            NOTE(review): 250 is the documented per-request cap for
            us-central1 at the time of writing; other regions have a lower
            cap and there is also a per-request token limit, so confirm this
            value if ``location`` or ``MODEL_NAME`` changes.

    Returns:
        list: One vector (``embedding.values``) per input sentence, in input
        order. An empty input yields an empty list without calling the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    aiplatform.init(project=project, location=location)

    sentences = list(sentences)
    if not sentences:
        return []

    # Change MODEL_NAME to a model you have access to
    # (e.g. "textembedding-gecko@001").
    model = TextEmbeddingModel.from_pretrained(MODEL_NAME)

    vectors = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        embeddings = model.get_embeddings(batch)
        vectors.extend(embedding.values for embedding in embeddings)
    return vectors

def generate_and_save_embeddings(pdf_path, sentence_file_path, embed_file_path):
    """Extract sentences from a PDF, embed them, and write two JSON-lines files.

    Each sentence is cleaned (bullet characters removed, whitespace collapsed)
    before it is embedded, so the text stored in the sentence file is exactly
    the text its vector was computed from. Sentences that are empty after
    cleaning are skipped. Every kept sentence gets a fresh UUID that links its
    record in both files.

    Args:
        pdf_path: Path of the PDF to read.
        sentence_file_path: Output path; one ``{"id", "sentence"}`` JSON
            object per line.
        embed_file_path: Output path; one ``{"id", "embedding"}`` JSON object
            per line.

    Returns:
        None. If the PDF yields no sentences, no file is created or touched.

    Raises:
        ValueError: If the number of embeddings returned does not match the
            number of sentences (writing would otherwise silently drop data).
    """
    def clean_text(text):
        cleaned_text = re.sub(r'\u2022', '', text)  # Remove bullet points
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra whitespaces and strip
        return cleaned_text

    # Clean first, then embed: the stored sentence and the embedded text must
    # be the same string. A chunk consisting only of bullets/whitespace
    # becomes empty here and is dropped.
    cleaned = (clean_text(raw) for raw in extract_sentences_from_pdf(pdf_path))
    sentences = [sentence for sentence in cleaned if sentence]
    if not sentences:
        return

    embeddings = generate_text_embeddings(sentences)
    if len(embeddings) != len(sentences):
        # zip() would silently truncate to the shorter sequence.
        raise ValueError(
            f"Expected {len(sentences)} embeddings but received {len(embeddings)}"
        )

    with open(embed_file_path, 'w') as embed_file, open(sentence_file_path, 'w') as sentence_file:
        for sentence, embedding in zip(sentences, embeddings):
            # Named record_id rather than id to avoid shadowing the builtin.
            record_id = str(uuid.uuid4())

            json.dump({"id": record_id, "sentence": sentence}, sentence_file)
            sentence_file.write('\n')
            json.dump({"id": record_id, "embedding": embedding}, embed_file)
            embed_file.write('\n')

# def upload_file(bucket_name,file_path):
#     storage_client = storage.Client()
#     bucket = storage_client.create_bucket(bucket_name,location=location)
#     blob = bucket.blob(file_path)
#     blob.upload_from_filename(file_path)

def upload_file(bucket_name, file_path, location="us-central1", prefix=""):
    """Upload a local file to a Cloud Storage bucket, creating the bucket if needed.

    Args:
        bucket_name: Name of the target bucket.
        file_path: Local path of the file to upload; also used as the object
            name inside the bucket.
        location: Location used only when the bucket has to be created.
        prefix: Optional folder inside the bucket. When non-empty the object
            is stored as ``<prefix>/<file_path>``; leading and trailing
            slashes in ``prefix`` are ignored. The default stores the object
            under ``file_path`` directly.

    Returns:
        str: The ``gs://`` URI of the uploaded object.
    """
    # Imported here because the notebook's import cell does not bring
    # NotFound into scope; without it the except clause below raises
    # NameError as soon as the bucket lookup fails.
    from google.cloud.exceptions import NotFound

    storage_client = storage.Client()

    # Check if the bucket already exists
    try:
        bucket = storage_client.get_bucket(bucket_name)
        print(f"Bucket {bucket_name} already exists.")
    except NotFound:
        bucket = storage_client.create_bucket(bucket_name, location=location)
        print(f"Bucket {bucket_name} created.")

    # Build the object name, honouring the optional folder prefix.
    folder = prefix.strip('/')
    blob_name = f"{folder}/{file_path}" if folder else file_path

    # Create a blob and upload the file
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(file_path)
    print(f"File {file_path} uploaded to {bucket_name}.")

    # Return the full URI of the uploaded file
    return f"gs://{bucket_name}/{blob_name}"


    
# def create_vector_index(bucket_name, index_name):
#     lakeside_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
#     display_name = index_name,
#     contents_delta_uri = "gs://"+bucket_name,
#     dimensions = 768,
#     approximate_neighbors_count = 10,
#     )
                  
#     lakeside_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
#     display_name = index_name,
#     public_endpoint_enabled = True
#     )                      

#     lakeside_index_endpoint.deploy_index(
#     index = lakeside_index, deployed_index_id = index_name
#     )

def create_vector_index(bucket_name, index_name, machine_type="n1-standard-16", min_replica_count=1, max_replica_count=1, dimensions=768, approximate_neighbors_count=10):
    """Create a Tree-AH vector index from a bucket and deploy it to a new public endpoint.

    Three resources/operations are created in order: the index, an index
    endpoint, and the deployment of the index on that endpoint.

    Args:
        bucket_name: Bucket whose root (``gs://<bucket_name>``) is passed as
            ``contents_delta_uri``.
        index_name: Display name for both the index and the endpoint; also
            used as the deployed index id.
        machine_type: Machine type for the deployment.
        min_replica_count: Minimum number of replicas for the deployment.
        max_replica_count: Maximum number of replicas for the deployment.
        dimensions: Length of the embedding vectors in the index.
            NOTE(review): must match the vectors written to the embeddings
            file; 768 is presumably the output size of MODEL_NAME - confirm
            if the model changes.
        approximate_neighbors_count: Passed through to
            ``create_tree_ah_index``.

    Returns:
        None.
    """
    # Initialise here as well so this function works on a fresh kernel and
    # does not depend on another function having called aiplatform.init first.
    aiplatform.init(project=project, location=location)

    # Create the Matching Engine Index
    lakeside_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=index_name,
        # This is a Cloud Storage *directory* (here the bucket root), not the
        # URI of a single embeddings file.
        contents_delta_uri="gs://" + bucket_name,
        dimensions=dimensions,
        approximate_neighbors_count=approximate_neighbors_count,
    )

    # Create the Matching Engine Index Endpoint
    lakeside_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=index_name,
        public_endpoint_enabled=True
    )

    # Deploy the index to the endpoint with compatible machine type and replica counts
    lakeside_index_endpoint.deploy_index(
        index=lakeside_index,
        deployed_index_id=index_name,
        machine_type=machine_type,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count
    )



In [41]:
# Pipeline driver. The first two steps are disabled (presumably already run in
# an earlier session - confirm before relying on the bucket contents);
# uncomment them to rebuild everything from the PDF.
# generate_and_save_embeddings(pdf_path,sentence_file_path,embed_file_path)
# upload_file(bucket_name,embed_file_path)
create_vector_index(bucket_name=bucket_name, index_name=index_name)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/1087248591320/locations/us-central1/indexes/5029654368610156544/operations/6680546510502887424
MatchingEngineIndex created. Resource name: projects/1087248591320/locations/us-central1/indexes/5029654368610156544
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/1087248591320/locations/us-central1/indexes/5029654368610156544')
Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/1087248591320/locations/us-central1/indexEndpoints/6361453220916625408/operations/4936246079826952192
MatchingEngineIndexEndpoint created. Resource name: projects/1087248591320/locations/us-central1/indexEndpoints/6361453220916625408
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1087248591320/locations/us-central1/indexEndpoints/6361453220916625408')
Deploying index 

In [24]:
# Working sample code for generating embeddings (disabled; kept for reference)

# from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# MODEL_NAME = "text-embedding-preview-0815"
# DIMENSIONALITY = 256


# def embed_text(
#     texts: list[str] = ["Retrieve a function that adds two numbers"],
#     task: str = "CODE_RETRIEVAL_QUERY",
#     model_name: str = "text-embedding-preview-0815",
#     dimensionality: int | None = 256,
# ) -> list[list[float]]:
#     """Embeds texts with a pre-trained, foundational model."""
#     model = TextEmbeddingModel.from_pretrained(model_name)
#     inputs = [TextEmbeddingInput(text, task) for text in texts]
#     kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
#     embeddings = model.get_embeddings(inputs, **kwargs)
#     # Example response:
#     # [[0.025890009477734566, -0.05553026497364044, 0.006374752148985863,...],
#     return [embedding.values for embedding in embeddings]

# texts = ["Retrieve a function that adds two numbers"]
# task = "CODE_RETRIEVAL_QUERY"
# code_block_embeddings = embed_text(
#     texts=texts, task=task, model_name=MODEL_NAME, dimensionality=DIMENSIONALITY
# )

# # Embeds code retrieval with a pre-trained, foundational model.
# # Using this function to calculate the embedding for query.
# texts = [
#     "def func(a, b): return a + b",
#     "def func(a, b): return a - b",
#     "def func(a, b): return (a ** 2 + b ** 2) ** 0.5",
# ]
# task = "RETRIEVAL_DOCUMENT"
# code_query_embeddings = embed_text(
#     texts=texts, task=task, model_name=MODEL_NAME, dimensionality=DIMENSIONALITY
# )
# print(code_query_embeddings)