In [None]:
# Mount Google Drive into the Colab VM so the dataset paths under
# /content/drive can be read and written by the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import io
import os

# Locations of the raw inputs on the mounted Google Drive.
# Change PROJECT_DIR if your Drive layout differs; everything else is derived from it.
PROJECT_DIR = "/content/drive/My Drive/ai_medical_assistant"
DATA_DIR = os.path.join(PROJECT_DIR, "data")

all_doc_data_path = os.path.join(DATA_DIR, "all_doctors_data.csv")
disease_symp_path = os.path.join(DATA_DIR, "disease-symp.csv")
medical_chatbot_qa_path = os.path.join(DATA_DIR, "medical_chatbot_qa.csv")
medical_book_path = os.path.join(DATA_DIR, "Medical_book.pdf")

# Destination for the cleaned artefacts. The empty last component keeps the
# trailing "/" that the path has always carried.
output_path = os.path.join(PROJECT_DIR, "cleaned_data", "")
os.makedirs(output_path, exist_ok=True)

In [None]:
# Clean the doctors CSV chunk by chunk and stream the result to disk.
#
# Each cleaned chunk is written to the output file as soon as it is ready, so
# only one chunk is held in memory at a time. Collecting every chunk in a list
# and concatenating them would need as much RAM as loading the whole file.

# Rows per chunk (adjust based on available RAM)
chunk_size = 1000

cleaned_csv_path = os.path.join(output_path, "all_doc_data_cleaned.csv")
chunks_written = 0

for chunk in pd.read_csv(all_doc_data_path, chunksize=chunk_size):
    # Handle missing values: every missing cell becomes the literal "Unknown".
    # NOTE(review): this also applies to numeric columns, which then contain a
    # string sentinel in the output file -- confirm downstream readers expect it.
    chunk = chunk.fillna("Unknown")

    # Standardize column names (lowercase, spaces -> underscores)
    chunk.columns = chunk.columns.str.lower().str.replace(" ", "_")

    # No per-chunk dtype optimisation (object -> category) is done here: CSV
    # does not store dtypes, so the conversion would be lost on save.

    # Potentially standardize state names
    # chunk['state'] = chunk['state'].apply(standardize_state_name)  # example transformation, assuming you have this function

    # The first chunk creates/overwrites the file and writes the header row;
    # later chunks are appended without repeating the header.
    is_first_chunk = chunks_written == 0
    chunk.to_csv(
        cleaned_csv_path,
        mode="w" if is_first_chunk else "a",
        header=is_first_chunk,
        index=False,
    )
    chunks_written += 1

# An input that yields no chunks is an error rather than a silent success.
if chunks_written == 0:
    raise ValueError(f"No rows were read from {all_doc_data_path}")

# Release whatever the reader left behind
import gc
gc.collect()

print("all_doc_data_cleaned.csv process done ")

all_doc_data_cleaned.csv process done 


In [None]:
# Clean the disease/symptom CSV chunk by chunk and stream the result to disk.
#
# Each cleaned chunk is written out immediately, so only one chunk is held in
# memory at a time instead of the whole dataset.

# Rows per chunk (adjust based on available RAM)
chunk_size = 1000

cleaned_csv_path = os.path.join(output_path, "disease_symp_cleaned.csv")
chunks_written = 0

for chunk in pd.read_csv(disease_symp_path, chunksize=chunk_size):
    # Missing cells become the literal "Unknown"; column names are normalised
    # to lowercase with underscores instead of spaces.
    chunk = chunk.fillna("Unknown")
    chunk.columns = chunk.columns.str.lower().str.replace(" ", "_")

    # Lowercase the free-text columns. Because this runs after fillna, the
    # sentinel in these two columns ends up as "unknown".
    for text_col in ("diseases", "descriptions"):
        chunk[text_col] = chunk[text_col].str.lower()

    # The first chunk creates/overwrites the file and writes the header row;
    # later chunks are appended without repeating the header.
    is_first_chunk = chunks_written == 0
    chunk.to_csv(
        cleaned_csv_path,
        mode="w" if is_first_chunk else "a",
        header=is_first_chunk,
        index=False,
    )
    chunks_written += 1

# An input that yields no chunks is an error rather than a silent success.
if chunks_written == 0:
    raise ValueError(f"No rows were read from {disease_symp_path}")

# Release whatever the reader left behind
import gc
gc.collect()

print("disease_symp_cleaned.csv process done ")

disease_symp_cleaned.csv process done 


In [None]:
# Clean the medical chatbot Q&A CSV chunk by chunk and stream the result to disk.
#
# Each cleaned chunk is written out immediately, so only one chunk is held in
# memory at a time instead of the whole dataset.

# Rows per chunk (adjust based on available RAM)
chunk_size = 1000

cleaned_csv_path = os.path.join(output_path, "medical_chatbot_qa_cleaned.csv")
chunks_written = 0

for chunk in pd.read_csv(medical_chatbot_qa_path, chunksize=chunk_size):
    # Missing cells become the literal "Unknown"; column names are normalised
    # to lowercase with underscores instead of spaces.
    chunk = chunk.fillna("Unknown")
    chunk.columns = chunk.columns.str.lower().str.replace(" ", "_")

    # Text cleaning: lowercase, then drop every run of characters that is
    # neither a word character nor whitespace.
    # NOTE(review): this also removes decimal points and hyphens, so a value
    # such as "2.5 mg" becomes "25 mg" -- confirm that is acceptable here.
    for text_col in ("description", "patient", "doctor"):
        chunk[text_col] = (
            chunk[text_col].str.lower().str.replace(r'[^\w\s]+', '', regex=True)
        )

    # The first chunk creates/overwrites the file and writes the header row;
    # later chunks are appended without repeating the header.
    is_first_chunk = chunks_written == 0
    chunk.to_csv(
        cleaned_csv_path,
        mode="w" if is_first_chunk else "a",
        header=is_first_chunk,
        index=False,
    )
    chunks_written += 1

# An input that yields no chunks is an error rather than a silent success.
if chunks_written == 0:
    raise ValueError(f"No rows were read from {medical_chatbot_qa_path}")

# Release whatever the reader left behind
import gc
gc.collect()

print("medical_chatbot_qa_cleaned.csv process done ")

medical_chatbot_qa_cleaned.csv process done 


In [None]:
!pip install PyPDF2 #if you dont have it installed

import PyPDF2

def extract_text_from_pdf(pdf_path):
  """Extract the text of every page of a PDF file.

  Args:
    pdf_path: Path of the PDF file to read.

  Returns:
    The text of all pages concatenated in page order (no separator is
    inserted between pages), or None if the file could not be opened or
    parsed. A page that yields no text contributes an empty string.
  """
  page_texts = []
  try:
    with open(pdf_path, 'rb') as file:
      reader = PyPDF2.PdfReader(file)
      for page in reader.pages:
        # Treat a page without extractable text as empty, so that a None
        # result does not abort the extraction of the whole document.
        page_texts.append(page.extract_text() or "")
  except Exception as e:
    # Best effort: report the problem and let the caller decide what to do.
    print(f"Error extracting text from PDF: {e}")
    return None
  # Join once at the end instead of growing a string page by page.
  return "".join(page_texts)


# Read the book; a None or empty result means nothing usable was extracted.
medical_book_text = extract_text_from_pdf(medical_book_path)

if not medical_book_text:
  print("Could not extract text from Medical_book.pdf")

else:
  # Basic cleaning: lowercase only.
  # More sophisticated cleaning will likely be needed here (e.g. removing
  # headers/footers, splitting into sections). Depending on the size, the text
  # may also need to be chunked and saved to multiple files.
  medical_book_text = medical_book_text.lower()

  cleaned_book_file = os.path.join(output_path, "medical_book_cleaned.txt")
  with open(cleaned_book_file, "w", encoding="utf-8") as out_file:
    out_file.write(medical_book_text)

  # The text can be large; drop it once it has been written out.
  del medical_book_text
  import gc
  gc.collect()

  print("medical_book_cleaned.text process done ")

medical_book_cleaned.text process done 


In [None]:
!pip install weaviate-client



In [None]:
import os
from getpass import getpass

# SECURITY: never store the Weaviate API key in the notebook. Any key that was
# previously committed in this cell must be treated as leaked and rotated in
# the Weaviate Cloud console.
#
# Values already present in the environment are respected; the API key is
# prompted for (without echo) only when it is missing.
os.environ.setdefault('WEAVIATE_URL', 'https://oxgymtteqrxaf03sexsa.c0.us-east1.gcp.weaviate.cloud')
if not os.environ.get('WEAVIATE_API_KEY'):
    os.environ['WEAVIATE_API_KEY'] = getpass('Weaviate API key: ')

In [None]:
import weaviate
# Print the installed client version for reference.
print(f"Weaviate version: {weaviate.__version__}")

Weaviate version: 4.11.1


In [None]:
import os
from weaviate.classes.init import Auth

# Connection settings come from the environment
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]

# Build the API-key credentials once and reuse them for the connection
auth_config = Auth.api_key(WEAVIATE_API_KEY)

# Instantiate the client with the credentials and cluster URL.
# The connection stays open after this cell; call client.close() when finished.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=auth_config,
)
# Test connection
print(client.is_ready())

# Intended schema for the Doctor collection. It is only defined here; nothing
# in this cell sends it to the server.
class_name = "Doctor"
class_properties = [
    {"name": "name", "dataType": ["text"]},
    {"name": "specialization", "dataType": ["text"]},
    {"name": "experience", "dataType": ["int"]},
    {"name": "location", "dataType": ["geoCoordinates"]},
    {"name": "consultation_fee", "dataType": ["number"]}
]

class_obj = {
    "class": class_name,
    "properties": class_properties,
    "vectorizerConfig": {
        "text2vec-transformers": {
            "vectorizeClassName": False,
            "vectorizePropertyName": False
        }
    },
    "description": "Collection to store Doctor information"
}

# Get all collections
col_configs = client.collections.list_all()

# Print all collection names
for name in col_configs:
    print(name)

# Print the Doctor collection configuration, if the collection exists.
# A plain col_configs["Doctor"] lookup would raise KeyError on a cluster where
# the collection has not been created yet.
doctor_config = col_configs.get(class_name)
if doctor_config is None:
    print(f"Collection '{class_name}' does not exist on this cluster.")
else:
    print(doctor_config)
    print(doctor_config.properties)        # property schema
    print(doctor_config.vectorizer_config) # vectorizer configuration
    print(doctor_config.vectorizer)

# # Create the Doctor class in Weaviate
# # NOTE(review): client.schema.create_class looks like the v3 client API; with
# # a v4 client, creation presumably goes through client.collections.create(...)
# # -- confirm against the installed client before enabling this.
# try:
#     client.schema.create_class(class_obj)
#     print("Doctor class created successfully.")
# except Exception as e:
#     print(f"Error creating Doctor class: {e}")




True
Doctor
Disease
_CollectionConfigSimple(name='Doctor', description=None, generative_config=_GenerativeConfig(generative=<GenerativeSearches.COHERE: 'generative-cohere'>, model={}), properties=[_Property(name='properties', description="This property was generated by Weaviate's auto-schema feature on Sun Mar  2 17:36:47 2025", data_type=<DataType.OBJECT_ARRAY: 'object[]'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=[_NestedProperty(data_type=<DataType.TEXT_ARRAY: 'text[]'>, description="This nested property was generated by Weaviate's auto-schema feature on Sun Mar  2 17:36:47 2025", index_filterable=True, index_searchable=True, name='dataType', nested_properties=None, tokenization=<Tokenization.WORD: 'word'>), _NestedProperty(data_type=<DataType.TEXT: 'text'>, description="This nested property was generated by Weaviate's auto-schema feature on Sun Mar  2 17:36:47 2025", index_filterable=True, index_searchable=True, name='name', nested

In [None]:
import os
from weaviate.classes.init import Auth

# Connection settings come from the environment
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]

# Build the API-key credentials once and reuse them for the connection
auth_config = Auth.api_key(WEAVIATE_API_KEY)

# Instantiate the client with the credentials and cluster URL.
# The connection stays open after this cell; call client.close() when finished.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=auth_config,
)
# Test connection
print(client.is_ready())


# Intended schema for the Disease collection. It is only defined here; nothing
# in this cell sends it to the server.
class_name = "Disease"
class_properties = [
    {"name": "name", "dataType": ["text"]},
    {"name": "description", "dataType": ["text"]}
]

class_obj = {
    "class": class_name,
    "properties": class_properties,
    "vectorizerConfig": {
        "text2vec-transformers": {
            "vectorizeClassName": False,
            "vectorizePropertyName": False
        }
    },
    "description": "Collection to store Disease information"
}

# Get all collections
col_configs = client.collections.list_all()

# Print all collection names
for name in col_configs:
    print(name)

# Print information about the Disease collection, if the collection exists.
# A plain col_configs["Disease"] lookup would raise KeyError on a cluster where
# the collection has not been created yet.
disease_config = col_configs.get(class_name)
if disease_config is None:
    print(f"Collection '{class_name}' does not exist on this cluster.")
else:
    print(disease_config)
    print(disease_config.properties)        # property schema
    print(disease_config.vectorizer_config) # vectorizer configuration
    print(disease_config.vectorizer)

# # Create the Disease class in Weaviate (commented out to avoid errors)
# # NOTE(review): client.schema.create_class looks like the v3 client API; with
# # a v4 client, creation presumably goes through client.collections.create(...)
# # -- confirm against the installed client before enabling this.
# try:
#     client.schema.create_class(class_obj)
#     print("Disease class created successfully.")
# except Exception as e:
#     print(f"Error creating Disease class: {e}")


True
Doctor
Disease
_CollectionConfigSimple(name='Disease', description=None, generative_config=_GenerativeConfig(generative=<GenerativeSearches.COHERE: 'generative-cohere'>, model={}), properties=[], references=[], reranker_config=None, vectorizer_config=_VectorizerConfig(vectorizer=<Vectorizers.TEXT2VEC_WEAVIATE: 'text2vec-weaviate'>, model={'baseUrl': 'https://api.embedding.weaviate.io', 'model': 'Snowflake/snowflake-arctic-embed-l-v2.0', 'truncate': 'right'}, vectorize_collection_name=True), vectorizer=<Vectorizers.TEXT2VEC_WEAVIATE: 'text2vec-weaviate'>, vector_config=None)
[]
_VectorizerConfig(vectorizer=<Vectorizers.TEXT2VEC_WEAVIATE: 'text2vec-weaviate'>, model={'baseUrl': 'https://api.embedding.weaviate.io', 'model': 'Snowflake/snowflake-arctic-embed-l-v2.0', 'truncate': 'right'}, vectorize_collection_name=True)
Vectorizers.TEXT2VEC_WEAVIATE
