In [3]:
!pip install pypdf2
!pip install google-cloud-aiplatform
!pip install google-cloud-storage

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m153.6/232.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [4]:
from google.cloud import storage
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform
import PyPDF2

import re
import os
import random
import json
import uuid

In [5]:
%ls

In [None]:
location = ""
pdf_path = "Deepseek.pdf"
bucket_name = "mlaverickpb_rag_demo"
embed_file_path = "rag_demo_embeddings.json"
sentence_file_path = "rag_demo_sentences.json"
index_name = "rag_demo_index"

In [7]:
def extract_sentences_from_pdf(pdf_path):
  with open(pdf_path, 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    text = ""
    for page in reader.pages:
      if page.extract_text() is not None:
        text += page.extract_text() + " "
  sentences = [sentence.strip() for sentence in text.split('. ') if sentence.strip()]
  return sentences

In [8]:
def generate_text_embeddings(sentences) -> list:
  # aiplatform.init(project=project, location-location)
    model = TextEmbeddingModel.from_pretrained("text-embedding-005")
    batch_size = 250
    vectors = []

    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        embeddings = model.get_embeddings(batch)
        vectors.extend([embedding.values for embedding in embeddings])

    return vectors

def generate_and_save_embeddings(pdf_path, sentence_file_path, embed_file_path):
  def clean_text(text):
    cleaned_text = re.sub(r'\u2022','', text)# Remove bullet points
    cleaned_text = re.sub(r'\s+',' ', cleaned_text).strip() # Remove extra space
    return cleaned_text

  sentences = extract_sentences_from_pdf(pdf_path)
  if sentences:
    embeddings = generate_text_embeddings(sentences)

    with open(embed_file_path, 'w') as embed_file, open(sentence_file_path, 'w') as sentence_file:
      for sentence, embedding in zip(sentences, embeddings):
        cleaned_sentence = clean_text(sentence)
        id = str(uuid.uuid4())

        embed_item = {"id": id, "embedding": embedding}
        sentence_item = {"id": id, "sentence": cleaned_sentence}

        json.dump(sentence_item, sentence_file)
        sentence_file.write('\n')
        json.dump(embed_item, embed_file)
        embed_file.write('\n')

In [9]:
def upload_file(bucket_name, file_path):
  storage_client = storage.Client()
  bucket = storage_client.create_bucket(bucket_name,location=location)
  blob = bucket.blob(os.basename(file_path))
  blob.upload_from_filename(file_path)
  print(f"Uploaded {file_path} to gs://{bucket_name}/{os.path.basename(file_path)}")

In [11]:
def create_vector_index(bucket_name, index_name):
  lakeside_index= aiplatform.MatchingEngineIndex.create_tree_ah_index(
      display_name = index_name,
      contents_delta_uri = f"gs://{bucket_name}/{embed_file_path}",
      dimensions = 768,
      approximate_neighbors_count = 10
  )

  lakeside_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
      display_name = index_name,
      public_endpoint_enabled = True
  )

  lakeside_index_endpoint.deploy_index(
      index = lakeside_index,
      deployed_index_id = index_name
  )

In [13]:
generate_and_save_embeddings(pdf_path, sentence_file_path, embed_file_path)
upload_file(bucket_name, sentence_file_path)



Uploaded rag_demo_sentences.json to gs://mlaverickpb_rag_demo/rag_demo_sentences.json
✅ Confirmed: rag_demo_sentences.json exists in bucket.
Uploaded rag_demo_embeddings.json to gs://mlaverickpb_rag_demo/rag_demo_embeddings.json
✅ Confirmed: rag_demo_embeddings.json exists in bucket.


In [15]:
index_name = "test1"

In [16]:
create_vector_index(bucket_name, index_name)

In [17]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part
import json
import os

In [None]:
location = ""
sentence_file_path = "rag_demo_sentences.json"
# index_name = "rag_demo_index"

In [19]:
model = GenerativeModel("gemini-2.5-pro")
lakeside_index_ep = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name = "4460923183049998336")



In [20]:
def generate_context(ids, data):
  concatenated_names = ''
  for id in ids:
    for entry in data:
      if entry['id'] == id:
        concatenated_names += entry['sentence'] + "\n"
  return concatenated_names.strip()

def load_file(sentence_file_path):
  data = []
  with open(sentence_file_path, 'r') as f:
    for line in f:
      entry = json.loads(line)
      data.append(entry)
  return data

In [21]:
data = load_file(sentence_file_path)
data

[{'id': 'b6f4386f-bd90-404e-80a4-a5d466d3a0f9',
  'sentence': 'DeepSeek-V3 Technical Report DeepSeek-AI research@deepseek.com Abstract We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token'},
 {'id': '8b1f8513-8301-4baa-b7c5-65e72711c0d5',
  'sentence': 'To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architec- tures, which were thoroughly validated in DeepSeek-V2'},
 {'id': 'bafdf013-0015-43c2-af9c-6cc98723aed8',
  'sentence': 'Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance'},
 {'id': '45da1598-dec8-4f42-b9b8-23f8e228f6b3',
  'sentence': 'We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities

In [22]:
query = ["Deepseek"]
qry_emb = generate_text_embeddings(query)



In [23]:
qry_emb

[[-0.002134046982973814,
  -0.05345364660024643,
  0.005462569184601307,
  -0.021416520699858665,
  0.039022296667099,
  -0.010105929337441921,
  -0.0040922705084085464,
  -0.03538521006703377,
  0.014840569347143173,
  -0.047400809824466705,
  -0.08851015567779541,
  -0.015626629814505577,
  0.009413429535925388,
  -0.015008246526122093,
  0.0014074756763875484,
  0.05496128275990486,
  0.03669889643788338,
  -0.017831269651651382,
  -0.08223424106836319,
  -0.01058909110724926,
  0.025668596848845482,
  -0.013018603436648846,
  -0.06587757170200348,
  -0.0443401075899601,
  0.020482724532485008,
  -0.03476598113775253,
  0.04201348498463631,
  0.022937197238206863,
  -0.07300527393817902,
  0.02627064287662506,
  0.06050751730799675,
  -0.00582152372226119,
  0.020229892805218697,
  -0.011805320158600807,
  -0.07134976238012314,
  0.038198646157979965,
  0.04638567566871643,
  -0.008120018988847733,
  0.04747822508215904,
  -0.043766677379608154,
  -0.023004930466413498,
  -0.0276194

In [27]:
response = lakeside_index_ep.find_neighbors(
    deployed_index_id = index_name,
    queries = [qry_emb[0]],
    num_neighbors = 10
)

In [28]:
response

[]

In [25]:
matching_ids = [neighbor.id for sublist in response for neighbor in sublist]

context = generate_context(matching_ids, data)
prompt = f"Based on the context delimited in backticks, answer the query. ```{context}``` Query:{query[0]}"

chat = model.start_chat(history=[])
gemini_response = chat.send_message(prompt)
print(gemini_response.text)

Based on the empty context provided, I cannot answer the query.

However, based on my general knowledge, **DeepSeek** is an AI research company that develops large language models (LLMs). They are known for creating powerful open-source models that often excel in areas like coding and mathematics, such as:

*   **DeepSeek LLM:** A family of general-purpose chat models.
*   **DeepSeek Coder:** A series of models specifically trained for code generation and completion.
