# Load data into Weaviate

## Load dependencies

In [1]:
# Load the modules in the src folder
import sys
sys.path.append('../src/')

In [13]:
import os
import weaviate
from weaviate.util import generate_uuid5
import gdown
import pandas as pd
from utils.common import SettingsLoader
import json
from tqdm import tqdm
import numpy as np

In [3]:
# Load the weaviate app settings
APP_NAME="WEAVIATE"
DATA_PATH = "../data/"
options = SettingsLoader.load(
    APP_NAME,
    {},
)

In [4]:
# Establish a connection with Weaviate
client = weaviate.Client(
    url=options.get("url"),
    auth_client_secret=weaviate.AuthApiKey(
        api_key=options.get("api_key")
    ),
)

## Download and load data

In [None]:

# Download source data
gdown.download_folder(
    "https://drive.google.com/drive/folders/1ic2kX1Vd0xUfBtmbwqXLIAyzBhvSHg7f?usp=drive_link", 
    output=DATA_PATH,
    quiet=False,
)

In [5]:
messages_df = pd.read_csv(os.path.join(DATA_PATH, "./Vector DB & LLM Hackathon/messages.csv"))
messages_df.__Text = messages_df.__Text.astype(str)
message_embeddings_df = pd.read_csv(os.path.join(DATA_PATH, "./Vector DB & LLM Hackathon/messages-embeddings-ada-002.csv"))
len(message_embeddings_df)

62726

In [6]:
chats_df = pd.read_csv(os.path.join(DATA_PATH, "./Vector DB & LLM Hackathon/chats.csv"))
chats_embeddings_df = pd.read_csv(os.path.join(DATA_PATH, "./Vector DB & LLM Hackathon/chats-embeddings-ada-002.csv"))
len(chats_df.thread_id.unique())

9719

In [7]:
# Get your embeddings data together.
# Create a temp index of the chats

def join_data_embeddings(df, df_embeddings, df_index, df_embeddings_index, df_text_name):
# Link the chats and embeddings together
  embeddings = []
  VECTOR_SIZE = None
  COUNTER = 0
  for _, row in tqdm(df_embeddings.iterrows(), desc="Collecting chats and embeddings"):
    embedding = json.loads(row['embedding'])
    text = df[df[df_index] == row[df_embeddings_index]][df_text_name]
    text = "\n".join(text.values)
    embeddings.append({
      "thread_id": row[df_embeddings_index], 
      "embedding":  embedding, 
      "chat_text": text,
    })
    
    if not VECTOR_SIZE:
      VECTOR_SIZE = len(embedding)
    else:
      assert VECTOR_SIZE==len(embedding)
    
  return embeddings

In [8]:
embeddings_chats = join_data_embeddings(chats_df, chats_embeddings_df, "thread_id", "thread_id", "chat_text")

Collecting chats and embeddings: 9713it [00:09, 1018.29it/s]


In [9]:
embeddings_messages = join_data_embeddings(messages_df, message_embeddings_df, "Thread_Timstamp", "message_id", "__Text")

Collecting chats and embeddings: 62726it [03:30, 298.45it/s]


In [10]:
print(len(embeddings_chats), len(embeddings_messages))
concatenate_embeddings = embeddings_chats + embeddings_messages
print(len(concatenate_embeddings))

9713 62726
72439


## Create vector database

In [12]:
client.schema.delete_all()

#Create a class object for our chat conversations:
class_obj = {
    "class": "Message",
    "description": "MLOps Community Messages",
    "vectorizer": "none",
    "properties": [
        {
            "name": "messages",
            "dataType": ["text"],
            "description": "Text of Messages",
        },
    ]
}
client.schema.create_class(class_obj)

In [14]:
# bulk insert data

with client.batch(
    batch_size=100
) as batch:
    # Batch import all conversations
    for row in tqdm(concatenate_embeddings, desc="Importing conversations"):
        properties = {
            "message": row['chat_text'],
        }

        client.batch.add_data_object(
            properties,
            "Message",
            vector=row["embedding"],
            uuid=generate_uuid5(row["chat_text"])
        )

Importing conversations: 100%|██████████| 72439/72439 [01:50<00:00, 656.86it/s]


In [13]:
def search_index(embedding):
  custom_vector = np.array(embedding).astype(np.float32)
  response = (
      client.query
      .get("Conversations", ["thread_id"])
      .with_near_vector({"vector":custom_vector})
      .with_limit(3)
      .do()
  )

  return response["data"]["Get"]["Conversations"]

In [14]:
# test with an existing document, that that document is returned
row = embeddings[2539]
docs = search_index(row['embedding'])
assert row['thread_id']==docs[0]["thread_id"], "Document does not match"
