### **Embedding Generation**




In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install weaviate-client --upgrade

Collecting weaviate-client
  Downloading weaviate_client-4.11.1-py3-none-any.whl.metadata (3.6 kB)
Collecting validators==0.34.0 (from weaviate-client)
  Downloading validators-0.34.0-py3-none-any.whl.metadata (3.8 kB)
Collecting authlib<1.3.2,>=1.2.1 (from weaviate-client)
  Downloading Authlib-1.3.1-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting grpcio-tools<2.0.0,>=1.66.2 (from weaviate-client)
  Downloading grpcio_tools-1.70.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting grpcio-health-checking<2.0.0,>=1.66.2 (from weaviate-client)
  Downloading grpcio_health_checking-1.70.0-py3-none-any.whl.metadata (1.1 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-health-checking<2.0.0,>=1.66.2->weaviate-client)
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading weaviate_client-4.11.1-py3-none-any.whl (353 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m353.3/353.3 kB[0m [31m8.8

In [None]:
pip show weaviate-client

Name: weaviate-client
Version: 4.11.1
Summary: A python native Weaviate client
Home-page: https://github.com/weaviate/weaviate-python-client
Author: Weaviate
Author-email: hello@weaviate.io,
License: BSD 3-clause
Location: /usr/local/lib/python3.11/dist-packages
Requires: authlib, grpcio, grpcio-health-checking, grpcio-tools, httpx, pydantic, validators
Required-by: 


In [None]:
!curl "https://oxgymtteqrxaf03sexsa.c0.us-east1.gcp.weaviate.cloud/.well-known/ready"

{"code":404,"message":"path /.well-known/ready was not found"}

In [None]:
import os
os.environ['WEAVIATE_URL'] = 'https://oxgymtteqrxaf03sexsa.c0.us-east1.gcp.weaviate.cloud'
os.environ['WEAVIATE_API_KEY'] = 'OYuEkc5LrUA708D2vtrYJEl0NWUcK4ZMDwnP'


In [None]:
import os
import weaviate
from weaviate.classes.init import Auth

# Best practice: store your credentials in environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

print(client.is_ready())

True


In [None]:
import os

cleaned_data_path = "/content/drive/My Drive/ai_medical_assistant/cleaned_data/disease_symp_cleaned.csv"

if os.path.exists(cleaned_data_path):
    print(f"File exists at: {cleaned_data_path}")
else:
    print(f"File does NOT exist at: {cleaned_data_path}")

File exists at: /content/drive/My Drive/ai_medical_assistant/cleaned_data/disease_symp_cleaned.csv


In [None]:
import os
os.environ['WEAVIATE_URL'] = 'https://oxgymtteqrxaf03sexsa.c0.us-east1.gcp.weaviate.cloud'
os.environ['WEAVIATE_API_KEY'] = 'OYuEkc5LrUA708D2vtrYJEl0NWUcK4ZMDwnP'

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import os
import weaviate

# 1. Define Paths and Load Cleaned Data
cleaned_data_path = "/content/drive/My Drive/ai_medical_assistant/cleaned_data/disease_symp_cleaned.csv" # path to the cleaned CSV
chunk_size = 1000  # Adjust based on available memory

# 2. Load Quantized Model and Tokenizer
model_name = 'distilbert-base-uncased' #same name as before
tokenizer_save_path = "/content/drive/My Drive/ai_medical_assistant/models/distilbert_tokenizer"
model_save_path = "/content/drive/My Drive/ai_medical_assistant/models/quantized_distilbert/model.pth"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

#Load pre-trained model
model = AutoModel.from_pretrained(model_name)

#Quantize the model
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

quantized_model.load_state_dict(torch.load(model_save_path, map_location=torch.device('cpu'))) #load the model for specified cpu device

quantized_model.eval() # Put model in evaluation mode

# Determine the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # <--- ADD THIS LINE HERE - IMPORTANT
quantized_model.to(device)

# 3. Weaviate Client Setup (Replace with your credentials)
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]


client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,                                    # Replace with your Weaviate Cloud URL
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),             # Replace with your Weaviate Cloud key
)

# 4. Function to Generate Embeddings
def generate_embedding(text):
    """Generates an embedding for the given text using the quantized model."""
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the same device as the model

    # Generate the embedding
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = quantized_model(**inputs) #run the model on the input
        # Mean Pooling - Take attention mask into account for correct averaging
        input_mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(outputs[0].size()).float()
        sum_embeddings = torch.sum(outputs[0] * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        embedding = sum_embeddings / sum_mask
    return embedding.cpu().numpy().tolist()[0] #move to cpu and return

# 5. Load Data and Generate Embeddings
# Load the "disease_symp_cleaned.csv" data
df = pd.read_csv(cleaned_data_path)


# Initialize the batch process *inside* the loop, without num_workers
with client.batch(
    batch_size=100,  # Adjust based on your needs and Weaviate's limits
    timeout_retries=3, # retry failed requests
) as batch:
    # Iterate through each row and create the data object
    for index, row in df.iterrows():

        # Get data from the CSV
        disease_name = row["diseases"]
        disease_description = row["descriptions"]
        disease_id = str(index) #the diseaseID is the index

        # Generate the embedding for the disease description
        embedding = generate_embedding(disease_description) #generate the embedding using the function

        #Print the shape of the embedding
        print(f"the embedding shape is: {len(embedding)}")

        # Create the data object payload
        data_object = {
            "disease_id": disease_id,
            "name": disease_name,
            "description": disease_description
        }

        # Add the object to the batch
        try:
            batch.add_data_object(
                data_object=data_object,
                class_name="Disease",
                vector=embedding
            )
            print(f"Adding object to batch for disease: {disease_name}")
        except Exception as e:
            print(f"Error adding object to batch for disease {disease_name}: {e}")

#Free up memory
del model
del quantized_model
import gc
gc.collect()

print("Embedding generation and data import complete!")

  quantized_model.load_state_dict(torch.load(model_save_path, map_location=torch.device('cpu'))) #load the model for specified cpu device


TypeError: '_BatchClientWrapper' object is not callable

In [None]:
import weaviate
import torch
from transformers import AutoTokenizer, AutoModel
import weaviate.classes as wvc

# 1. Weaviate Client Setup (Replace with your credentials)
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]


client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,                                    # Replace with your Weaviate Cloud URL
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),             # Replace with your Weaviate Cloud key
)

# 2. Define a Simple Test Class
try:
    questions = client.collections.create(
        name="Question",
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),    # Set the vectorizer to "text2vec-openai" to use the OpenAI API for vector-related operations
        generative_config=wvc.config.Configure.Generative.cohere(),             # Set the generative module to "generative-cohere" to use the Cohere API for RAG
        properties=[
            wvc.config.Property(
                name="question",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="answer",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="category",
                data_type=wvc.config.DataType.TEXT,
            )
        ]
    )

    print(questions.config.get(simple=False))

finally:
    client.close()

# 3. Load Model and Tokenizer (Small Model)
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Put the model in evaluation mode
# 4. Generate a Single Embedding
test_text = "This is a test string."
def generate_embedding(text):
    """Generates an embedding for the given text using the model."""
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the same device as the model

    # Generate the embedding
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs) #run the model on the input
        # Mean Pooling - Take attention mask into account for correct averaging
        input_mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(outputs[0].size()).float()
        sum_embeddings = torch.sum(outputs[0] * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        embedding = sum_embeddings / sum_mask
    return embedding.cpu().numpy().tolist()[0] #move to cpu and return

test_embedding = generate_embedding(test_text)

# 5. Import a Single Object using Batch

try:
    with client.batch(batch_size=10) as batch:
        data_object = {
            "text": test_text
        }
        batch.add_data_object(
            data_object=data_object,
            class_name="TestClass",
            vector=test_embedding
        )
    print("Single object imported successfully.")

except Exception as e:
    print(f"Error importing single object: {e}")

finally:
  #Clean up memory
  del model
  import gc
  gc.collect()

_CollectionConfig(name='Question', description=None, generative_config=_GenerativeConfig(generative=<GenerativeSearches.COHERE: 'generative-cohere'>, model={}), inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False, auto_tenant_creation=False, auto_tenant_activation=False), properties=[_Property(name='question', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=True), vectorizer='text2vec-openai'), _Property(name='answer', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=Tr

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Error importing single object: '_BatchClientWrapper' object is not callable


In [None]:
import weaviate
import weaviate.classes as wvc
import os

# 1. Weaviate Client Setup (Replace with your credentials)
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]


client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,                                    # Replace with your Weaviate Cloud URL
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),             # Replace with your Weaviate Cloud key
)

# 2. Define a Simple Test Class
try:
    questions = client.collections.create(
        name="Question",
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),    # Set the vectorizer to "text2vec-openai" to use the OpenAI API for vector-related operations
        generative_config=wvc.config.Configure.Generative.cohere(),             # Set the generative module to "generative-cohere" to use the Cohere API for RAG
        properties=[
            wvc.config.Property(
                name="question",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="answer",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="category",
                data_type=wvc.config.DataType.TEXT,
            )
        ]
    )

    print(questions.config.get(simple=False))
except Exception as e:
    print(f"Error creating class: {e}")



# 3. Import a Single Object using the new Weaviate 4 client

try:
    collection = client.collections.get("Question")

    collection.data.insert(
    properties={
        "question": "What is the capital of France?",
        "answer": "Paris",
        "category": "Geography"
    }
    )
    print("Object imported successfully.")

except Exception as e:
    print(f"Error importing single object: {e}")

finally:
    client.close()

Error creating class: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name Question already exists'}]}.
Error importing single object: Object was not added! Unexpected status code: 500, with response body: {'error': [{'message': 'update vector: API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY'}]}.


In [None]:
import weaviate
import weaviate.classes as wvc
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# 1. Weaviate Client Setup (Replace with your credentials)
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,  # Replace with your Weaviate Cloud URL
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),  # Replace with your Weaviate Cloud key
)

# 2. Define the Disease Class
try:
    disease_class = client.collections.create(
        name="Disease",
        properties=[
            wvc.config.Property(
                name="disease_id",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="name",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="description",
                data_type=wvc.config.DataType.TEXT,
            ),
        ],
    )
    print("Disease class created successfully.")
except Exception as e:
    print(f"Error creating Disease class: {e}")

# 3. Load Model and Tokenizer (Small Model)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Put the model in evaluation mode


# 4. Define the Embedding Generation Function
def generate_embedding(text):
    """Generates an embedding for the given text using the model."""
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the same device as the model

    # Generate the embedding
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)  # run the model on the input
        # Mean Pooling - Take attention mask into account for correct averaging
        input_mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(outputs[0].size()).float()
        sum_embeddings = torch.sum(outputs[0] * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        embedding = sum_embeddings / sum_mask
    return embedding.cpu().numpy().tolist().flatten()  # move to cpu and return, flatten to 1D


# 5. Load Data and Generate Embeddings
cleaned_data_path = "/content/drive/My Drive/ai_medical_assistant/cleaned_data/disease_symp_cleaned.csv"
df = pd.read_csv(cleaned_data_path)


# 6. Batch Import with Embeddings
try:
    with client.batch(batch_size=100) as batch:
        for index, row in df.iterrows():
            disease_name = row["diseases"]
            disease_description = row["descriptions"]
            disease_id = str(index)

            embedding = generate_embedding(disease_description)  # generate the embedding
            data_object = {
                "disease_id": disease_id,
                "name": disease_name,
                "description": disease_description,
            }
            batch.add_data_object(  # add properties and vector
                data_object=data_object,
                class_name="Disease",
                vector=embedding
            )
            print(f"Added {disease_name} to the batch.")

    print("Batch import completed successfully!")
except Exception as e:
    print(f"Error during batch import: {e}")

finally:
    # Clean up memory
    del model
    import gc

    gc.collect()
    client.close()

Error creating Disease class: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name Disease already exists'}]}.
Error during batch import: '_BatchClientWrapper' object is not callable
