<a href="https://colab.research.google.com/github/Sug-ar-N-Spice/Dr.Chats/blob/Patricia/Patricia_Dr_chat_pre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install sentence-transformers
# ! pip install sacremoses
# ! pip install transformers
# ! pip install datasets
# ! pip install torch
#!pip install -q gradio
#!pip install lancedb transformers sentence-transformers

In [16]:
import pandas as pd
import pyarrow as pa
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from sentence_transformers import SentenceTransformer
import lancedb
import traceback

In [17]:

##STOP WORDS IN NLP DONT MEAN ANYTHING LIKE WE THEY THEY JUST COMPLETE THE SENTENCE

## THIS IS CLASS THAT Cleans the data
class MEDDataPreprocessor:
    """
    Preprocessor for general medical data.
    This class handles cleaning, normalization, and preparation of text data
    related to medical topics for use in a medical chatbot.
    """

    def __init__(self):
        """Initialize the preprocessor with necessary NLTK downloads."""
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('english'))

        # Minimal list of medical terms to preserve
        self.medical_terms = {
            # Terms conflicting with stopwords
            'a', 'am', 'an', 'as', 'at', 'be', 'by', 'in', 'no', 'on', 'or', 'to', 'up',
            # Critical abbreviations to always preserve
            'ct', 'dr', 'er', 'hiv', 'hr', 'icu', 'iv', 'mr', 'ms'
        }

        self.stop_words = self.stop_words - self.medical_terms

    def clean_text(self, text: str) -> str:



        """
        Clean and normalize the input text.

        Args:
            text (str): Raw input text

        Returns:
            str: Cleaned and normalized text
        """

        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove special characters but keep medical symbols
        text = re.sub(r'[^a-zA-Z0-9\s+\-/%]', '', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        return text

    def remove_stopwords(self, text: str) -> str:
        """
        Remove stopwords from the text, keeping medical specific terms.

        Args:
            text (str): Input text

        Returns:
            str: Text with stopwords removed
        """
        words = word_tokenize(text) ### this is resulting in a list of words was converting sentence / paragraph into a list of words

        filtered_words = [word for word in words if word.lower() not in self.stop_words]
        return ' '.join(filtered_words)

    def preprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocess the entire dataframe.

        Args:
            df (pd.DataFrame): Input dataframe

        Returns:
            pd.DataFrame: Preprocessed dataframe
        """

        processed_df = df.copy()

        # Check if required columns exist
        if 'question' in df.columns and 'context' in df.columns:
            # change columns names depending which csv file you are using cleaning text and removing stopwords
            processed_df['clean_question'] = processed_df['question'].apply(self.clean_text).apply(self.remove_stopwords)
            processed_df['clean_context'] = processed_df['context'].apply(self.clean_text).apply(self.remove_stopwords)
            # Combine
            processed_df['combined_text'] = processed_df['clean_question'] + ' ' + processed_df['clean_context']
        
        else:
            print("Warning: Expected columns 'question' and 'context' not found.")
            print(f"Available columns: {df.columns.tolist()}")
            # Handle missing columns gracefully
            processed_df['combined_text'] = ''

        return processed_df



In [26]:
class MedicalRAG:
    def __init__(self, 
                 embedding_model='all-MiniLM-L6-v2',
                 llm_model="microsoft/BioGPT",
                 db_path="medical_vectors"):
        # Initialize models
        self.embedding_model = SentenceTransformer(embedding_model)
        self.tokenizer = AutoTokenizer.from_pretrained(llm_model)
        self.model = AutoModelForCausalLM.from_pretrained(llm_model)
        
        # Initialize vector database
        self.db = lancedb.connect(db_path)
        self.table = self._create_or_get_table()
        
    def _create_or_get_table(self):
        schema = pa.schema ([
            pa.field("text", pa.string()),
            pa.field("embedding", pa.list_(pa.float32(), 384)),  # 384 is the dimension of MiniLM embeddings
            pa.field("type", pa.string()),
            pa.field("metadata", pa.string())
        ])
        
        if "medical_data" not in self.db.table_names():
            # Create empty DataFrame with correct schema
            empty_data = [{
                "text": [],
                "embedding": [],
                "type": [],
                "metadata": []
            }]
            return self.db.create_table("medical_data",data=empty_data, schema=schema)
        return self.db.open_table("medical_data")
    
    def add_documents(self, df, batch_size=32):
        """Add documents to the vector database"""
        for i in range(0, len(df), batch_size):
            batch = df.iloc[i:i+batch_size]
            
            # Create embeddings
            texts = batch['combined_text'].tolist()
            embeddings = self.embedding_model.encode(texts)
            
            # Create list of dictionaries (one for each document)
            data = [{
                "text": text,
                "embedding": embedding.tolist(),
                "type": "context",
                "metadata": ""
                }
                for text, embedding in zip(texts, embeddings)
            ]
            
            # Add to database
            self.table.add(data)
            print(f"Added batch of {len(texts)} documents to database")
    
    def get_relevant_context(self, query, k=3):
        """Retrieve relevant context using vector similarity"""
        query_embedding = self.embedding_model.encode([query])[0]
        results = self.table.search(query_embedding).limit(k).to_pandas()
        return results['text'].tolist()
    
    def answer_question(self, question):
        """Generate answer using RAG"""
        # Get relevant context
        relevant_contexts = self.get_relevant_context(question)
        combined_context = " ".join(relevant_contexts)
        
        # Create prompt
        prompt = f"""Based on the following medical information, please provide a clear, accurate answer.
        If the information provided doesn't contain enough relevant details, please say so.

        Context: {combined_context}

        Question: {question}

        Answer:"""
        
        # Generate answer
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        outputs = self.model.generate(
            inputs["input_ids"],
            max_length=200,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True
        )
        
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)



In [27]:
def main():
    file_path = 'million_sample.csv'
    
    try:
        # Initialize RAG system
        rag = MedicalRAG()

        # Load and check the data first
        print("Reading CSV file...")
        df = pd.read_csv(file_path)
        print(f"Loaded {len(df)} rows with columns: {df.columns.tolist()}")
        
        # Process in manageable chunks
        chunk_size = 10000
        for i in range(0, len(df), chunk_size):
            chunk = df.iloc[i:i+chunk_size]
            print(f"\nProcessing chunk {i//chunk_size + 1}/{len(df)//chunk_size + 1}")

            # Preprocess chunk
            preprocessor = MEDDataPreprocessor()
            processed_chunk = preprocessor.preprocess_dataframe(chunk)
            
            if processed_chunk['combined_text'].str.strip().str.len().sum() > 0:
                # Add to vector database
                rag.add_documents(processed_chunk)
                print(f"Processed and added {len(processed_chunk)} documents")
            else:
                print("Warning: Chunk produced no valid text after preprocessing")
        
        # Test the system
        print("\nTesting the system...")
        test_questions = [
            "What are the symptoms of asthma?",
            "How is diabetes diagnosed?",
            "What are common treatments for hypertension?"
        ]
        
        for question in test_questions:
            answer = rag.answer_question(question)
            print(f"\nQ: {question}")
            print(f"A: {answer}")
            
    except KeyboardInterrupt:
        print("\nProcessing interrupted by user.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        traceback.print_exc()

In [None]:
# Run the system
if __name__ == "__main__":
    main()

Reading CSV file...
Loaded 1000000 rows with columns: ['question', 'context']

Processing chunk 1/101
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 documents to database
Added batch of 32 

In [21]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
