### **Imports go here!**

In [59]:
import os
import json
import numpy as np

from fastembed import TextEmbedding
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from qdrant_client.models import Distance, VectorParams

### **Read the Grading Doc JSON**

In [7]:
# Load the instruction dataset from ./rag/data, resolved against the current
# working directory, and parse it into `data`.
dataset_path = os.path.join(os.getcwd(), "rag", "data", "instruction_dataset.json")
with open(dataset_path, mode="r", encoding="utf-8") as file:
  data = json.load(file)

In [8]:
# Inspect the keys of the first record to see which fields each dataset entry carries.
data[0].keys()

dict_keys(['instruction', 'context', 'response'])

In [15]:
# Flatten every dataset record into a {"Question", "Answer"} pair.
# The question text is prefixed with the record's context when one is present.
unprocessed_data = []

for obj in data:
  # Truthiness (rather than `is not None`) also treats an empty-string context
  # as absent, so no dangling "Context: \nQuestion: ..." prompt gets embedded.
  # .get() additionally tolerates records that omit the "context" key.
  # NOTE(review): query-time prompt formatting should apply the same rule — confirm.
  context = obj.get("context")
  if context:
    Q = f"Context: {context}\nQuestion: {obj['instruction']}"
  else:
    Q = f"Question: {obj['instruction']}"
  A = obj['response']

  unprocessed_data.append({
    "Question": Q,
    "Answer": A
  })

### **Generate embeddings**

In [17]:
# Instantiate fastembed's TextEmbedding with no arguments, i.e. whatever model
# the library uses by default.
# NOTE(review): the default model (and therefore the vector size) may differ
# between fastembed versions — consider pinning `model_name`; TODO confirm.
embedding_model = TextEmbedding()

Fetching 5 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.10it/s]


In [48]:
# Embed all questions with a single embed() call so fastembed can batch the
# inputs internally, instead of invoking the model once per record.
questions = [ele["Question"] for ele in unprocessed_data]

# embed() is lazy (the original also had to wrap it in list()); materialise it
# so the vectors can be paired with their records below.
question_vectors = list(embedding_model.embed(questions))

# Guard against silent misalignment between vectors and answers.
assert len(question_vectors) == len(unprocessed_data), "embedding count does not match record count"

# Same shape as before: one dict per record, with the question replaced by its
# embedding vector and the answer kept verbatim.
processed_data = [
  {"Question": vector, "Answer": ele["Answer"]}
  for vector, ele in zip(question_vectors, unprocessed_data)
]

### **Connect to Qdrant**

In [51]:
# Resolve the Qdrant endpoint from the environment, falling back to
# localhost:6333, and build a client for it.
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
QDRANT_PORT = os.environ.get("QDRANT_PORT", "6333")

qdrant_url = "http://" + QDRANT_HOST + ":" + QDRANT_PORT
client = QdrantClient(url=qdrant_url)

### **Create Qdrant collection**

In [57]:
# Name of the target collection; overridable through the environment.
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "grading-doc-sep-2024")

# create_collection() raises when the collection already exists, which breaks
# re-running this cell (or the whole notebook). Only create it when missing.
# NOTE(review): an existing collection with a different vector size or
# distance is left untouched here and would make the later upsert fail.
if not client.collection_exists(QDRANT_COLLECTION):
  client.create_collection(
    collection_name=QDRANT_COLLECTION,
    # Vector size is taken from the first embedding produced above.
    vectors_config=VectorParams(size=processed_data[0]["Question"].shape[0], distance=Distance.DOT),
  )

# Final expression so the cell still displays whether the collection is available.
client.collection_exists(QDRANT_COLLECTION)

True

### **Add vectors**

In [67]:
# Build one point per record: ids are 1-based positions in processed_data, the
# vector is the question embedding, and the answer travels along as payload.
# The embedding is a numpy array (its .shape is read when sizing the
# collection), so convert it to a plain list of Python floats before handing
# it to PointStruct, as the Qdrant client examples do.
points = [
  PointStruct(id=idx, vector=ele["Question"].tolist(), payload={"Answer": ele["Answer"]})
  for idx, ele in enumerate(processed_data, start=1)
]

# wait=True blocks until the server reports the write as applied.
operation_info = client.upsert(
  collection_name=QDRANT_COLLECTION,
  wait=True,
  points=points
)

print(operation_info)

operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>
