<a href="https://colab.research.google.com/github/Sug-ar-N-Spice/Dr.Chats/blob/Patricia/Patricia_Dr_chat_pre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ! pip install sacremoses
# ! pip install transformers
# ! pip install datasets
# ! pip install torch
#!pip install -q gradio

In [1]:
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from typing import List, Dict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [2]:

##STOP WORDS IN NLP DONT MEAN ANYTHING LIKE WE THEY THEY JUST COMPLETE THE SENTENCE

## THIS IS CLASS THAT Cleans the data
class MEDDataPreprocessor:
    """
    Preprocessor for general medical data.
    This class handles cleaning, normalization, and preparation of text data
    related to medical topics for use in a medical chatbot.
    """

    def __init__(self):
        """Initialize the preprocessor with necessary NLTK downloads."""
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('english'))

        # Minimal list of medical terms to preserve
        self.medical_terms = {
            # Terms conflicting with stopwords
            'a', 'am', 'an', 'as', 'at', 'be', 'by', 'in', 'no', 'on', 'or', 'to', 'up',
            # Critical abbreviations to always preserve
            'ct', 'dr', 'er', 'hiv', 'hr', 'icu', 'iv', 'mr', 'ms'
        }
        
        self.stop_words = self.stop_words - self.medical_terms

    def clean_text(self, text: str) -> str:
    


        """
        Clean and normalize the input text.

        Args:
            text (str): Raw input text

        Returns:
            str: Cleaned and normalized text
        """

        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove special characters but keep medical symbols
        text = re.sub(r'[^a-zA-Z0-9\s+\-/%]', '', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        return text

    def remove_stopwords(self, text: str) -> str:
        """
        Remove stopwords from the text, keeping medical specific terms.

        Args:
            text (str): Input text

        Returns:
            str: Text with stopwords removed
        """
        words = word_tokenize(text) ### this is resulting in a list of words was converting sentence / paragraph into a list of words

        filtered_words = [word for word in words if word.lower() not in self.stop_words]
        return ' '.join(filtered_words)

    def preprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocess the entire dataframe.

        Args:
            df (pd.DataFrame): Input dataframe

        Returns:
            pd.DataFrame: Preprocessed dataframe
        """
        # change columns names depending which csv file you are using cleaning text and removing stopwords
        df['clean_question'] = df['question'].apply(self.clean_text).apply(self.remove_stopwords)
        df['clean_context'] = df['context'].apply(self.clean_text).apply(self.remove_stopwords)

        # Combine cleaned abstract and results or full_texts depending on which csv you are using
        df['combined_text'] = df['clean_question'] + ' ' + df['clean_context']

        return df

    def prepare_for_model(self, text: str, max_length: int = 512) -> str: ##looks at paragraph, cuts the paragraph if more than 512 This takes the sentence splits to words and has a max length
        """
        Prepare text for model input, truncating if necessary.

        Args:
            text (str): Input text
            max_length (int): Maximum number of words

        Returns:
            str: Prepared text
        """
        words = text.split()
        if len(words) > max_length:
            return ' '.join(words[:max_length])
        return text



In [3]:
def preprocess_dataframe(self, df): ###USUALLY YOU SEE SELF IN A CLASS This allows you to code attributes in a class
    """
    Preprocess the entire dataframe.

    Args:
        df (pd.DataFrame or DatasetDict): Input dataframe with 'question' and 'context' columns

    Returns:
        pd.DataFrame: Preprocessed dataframe
    """

    # Check if df is a DatasetDict object-- When we try to find dataset Dict- that means we havent converted it to pandas and cleaning whole dataset


    if isinstance(df.DatasetDict):
        # Process each split separately and combine into a DataFrame
        all_data = []
        for split in df:
                    # Assuming all splits have the same columns
                    # Process the columns in each split
            df_split = df[split].to_pandas()  # Convert to DataFrame

            df_split['clean_question'] = df_split['question'].apply(self.clean_text).apply(self.remove_stopwords)
            df_split['clean_context'] = df_split['context'].apply(self.clean_text).apply(self.remove_stopwords)
            df_split['combined_text'] = df_split['clean_question'] + ' ' + df_split['clean_context']

            all_data.extend(df_split.to_dict('records')) # Add processed data to the list

        processed_df= pd.DataFrame(all_data) # Create a new DataFrame from the combined data

    else:  # If it's a regular DataFrame, process as before
        processed_df= df.copy()  # Create a copy to avoid modifying the original DataFrame
        processed_df['clean_question'] = processed_df['question'].apply(self.clean_text).apply(self.remove_stopwords)
        processed_df['clean_context'] = processed_df['context'].apply(self.clean_text).apply(self.remove_stopwords)
        processed_df['combined_text'] = processed_df['clean_question'] + ' ' + processed_df['clean_context']

    return processed_df  # Return the processed data

In [4]:

class ConsistentVocabularyContextSelector:
    def __init__(self, chunk_size=1000):
        self.vectorizer = None
        self.tfidf_matrices = []
        self.contexts = []
        self.chunk_size = chunk_size

    def fit(self, contexts):
        # First, fit the vectorizer on all contexts to get a consistent vocabulary
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.vectorizer.fit(contexts)

        # Now, transform chunks using the consistent vocabulary
        for i in range(0, len(contexts), self.chunk_size):
            chunk = contexts[i:i+self.chunk_size]
            self.contexts.extend(chunk)
            tfidf_matrix = self.vectorizer.transform(chunk)
            self.tfidf_matrices.append(tfidf_matrix)

    def get_most_relevant_context(self, question, top_n=1):
        question_vector = self.vectorizer.transform([question])
        all_similarities = []
        for tfidf_matrix in self.tfidf_matrices:
            similarities = cosine_similarity(question_vector, tfidf_matrix)
            all_similarities.extend(similarities[0])
        
        most_similar_idx = np.argsort(all_similarities)[-top_n:][::-1]
        return [self.contexts[i] for i in most_similar_idx]




In [5]:
class DRChatbot:
    def __init__(self, qa_model="microsoft/BioGPT"):
        self.preprocessor = MEDDataPreprocessor()
        self.qa_tokenizer = AutoTokenizer.from_pretrained(qa_model)
        self.qa_model = AutoModelForCausalLM.from_pretrained(qa_model)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.qa_model.to(self.device)
        self.context_selector = ConsistentVocabularyContextSelector()

    def preprocess_data(self, df):
        processed_df = self.preprocessor.preprocess_dataframe(df)
        return processed_df['combined_text'].tolist()

    def fit_context_selector(self, all_contexts):
        self.context_selector.fit(all_contexts)

    def answer_question(self, question):
        relevant_contexts = self.context_selector.get_most_relevant_context(question, top_n=3)
        combined_context = " ".join(relevant_contexts)
        
        prompt = f"Based on the following medical information:\n{combined_context}\n\nQuestion: {question}\nAnswer:"
        inputs = self.qa_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(self.device)
        
        input_length = inputs["input_ids"].shape[1]
        max_new_tokens = min(100, 1024 - input_length)
        
        with torch.no_grad():
            output = self.qa_model.generate(
                inputs["input_ids"],
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                do_sample=True,
                temperature=0.7
            )
        
        return self.qa_tokenizer.decode(output[0], skip_special_tokens=True)


In [6]:
# Usage
def process_large_csv(file_path, chunk_size=1000):
    chatbot = DRChatbot()
    all_contexts = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        contexts = chatbot.preprocess_data(chunk)
        all_contexts.extend(contexts)
    
    chatbot.fit_context_selector(all_contexts)
    return chatbot



In [14]:
# Process the large CSV file
chatbot = process_large_csv('million_sample.csv')


In [15]:
# Example usage
question = "What are the main symptoms of gonorrhea?"
answer = chatbot.answer_question(question)
print(f"Q: {question}\nA: {answer}")

Q: What are the main symptoms of gonorrhea?
A: Based on the following medical information: name gonorrhea gonorrhea becoming untreatable gonorrhea new challenges gonorrhea new challenges current status vaccines gonorrhea vaccines gonorrhea current status future challenges Question: What are the main symptoms of gonorrhea? Answer: As this article will tell, the main symptoms of gonorrhea are acute pelvic pain, dysuria, and bleeding.
