1. Data Cleaning: Remove any NULL rows or columns.
2. Data Preparation: Prepare the training data for the chatbot.
3. BERT Embeddings: Use BERT to generate embeddings for the input text.
4. LSTM Model: Build and train an LSTM model for generating responses.
5. Chatbot Integration: Combine BERT and LSTM for the chatbot functionality.

## Loading the dataset

In [7]:
from datasets import load_dataset
import pandas as pd
import re

# Download the dataset from the Hugging Face Hub and take its training split.
dataset = load_dataset("ruslanmv/ai-medical-chatbot")
train_data = dataset["train"]

# Keep only the "Description" and "Doctor" columns, renamed to a generic
# question/answer pair.
# NOTE: the entire training split is used here; no 1000-row subset is taken.
# Append `.head(1000)` to the DataFrame below if a smaller demo set is wanted.
df = pd.DataFrame(train_data)
df = df[["Description", "Doctor"]].rename(columns={"Description": "question", "Doctor": "answer"})


def _normalize_whitespace(text):
    """Trim *text* and collapse every run of whitespace into a single space."""
    return re.sub(r'\s+', ' ', text.strip())


# Clean the question and answer columns
for column in ("question", "answer"):
    df[column] = df[column].apply(_normalize_whitespace)

In [4]:
# Count the missing (null) values in each column of the cleaned DataFrame.
df.isnull().sum()


question    0
answer      0
dtype: int64

In [5]:
# Count the rows whose answer is identical to its question.
# `.count()` is used rather than `.sum()`: summing string columns would
# concatenate the text of any matching rows instead of counting them.
df.loc[df['question'] == df['answer']].count()

question    0.0
answer      0.0
dtype: float64

## Gradio Interface

In [None]:
import gradio as gr

def chat(message, history=None):
    """Answer *message* by invoking the RAG chain.

    Args:
        message: The user's prompt text.
        history: Unused. Accepted so callers that pass a chat history keep
            working; it defaults to ``None`` because the button click
            handler in this cell supplies only the prompt textbox as input,
            and a required second argument would raise ``TypeError``.

    Returns:
        Whatever ``rag_chain.invoke(message)`` returns.
    """
    return rag_chain.invoke(message)

# Create a Gradio interface
# The Blocks context is bound to `interface`, which is launched further down.
with gr.Blocks() as interface:
  # Display a welcome message (implementation omitted for brevity)
  # First row: one column holding the prompt textbox and the submit button.
  with gr.Row():
    with gr.Column():
      text_prompt = gr.Textbox(label="Input Prompt", placeholder="Example: What are the symptoms of COVID-19?", lines=2)
      generate_button = gr.Button("Ask Me", variant="primary")
  # Second row: the textbox that displays the answer.
  with gr.Row():
    answer_output = gr.Textbox(type="text", label="Answer")
  # On click, `chat` is called with the prompt text as its only input and
  # its return value is written to the answer textbox.
  generate_button.click(chat, inputs=[text_prompt], outputs=answer_output)

# Launch the Gradio interface
# server_name="0.0.0.0" with server_port=7000 sets the address and port to
# serve on; share=True additionally requests a public Gradio share link.
# NOTE(review): confirm that exposing this app publicly is intended.
interface.launch(server_name="0.0.0.0", server_port=7000, share=True)

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
import torch
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import gradio as gr
import re

# Load the dataset
dataset = load_dataset("ruslanmv/ai-medical-chatbot")
train_data = dataset["train"]

# Preprocess the dataset
# The retriever expects its passages in columns named "title" and "text",
# so the two source columns are renamed accordingly before cleaning.
df = pd.DataFrame(train_data)
df = df[["Description", "Doctor"]].rename(columns={"Description": "title", "Doctor": "text"})
df['title'] = df['title'].apply(lambda x: re.sub(r'\s+', ' ', x.strip()))
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x.strip()))

# Convert DataFrame to Hugging Face Dataset
data = Dataset.from_pandas(df)
# Placeholder embeddings.
# NOTE(review): every passage gets the same all-zero vector, so retrieval
# cannot tell passages apart. Replace these with real passage embeddings
# (e.g. from a DPR context encoder) before relying on the answers.
data = data.map(lambda example: {"embeddings": [0.0] * 768})
# A dataset passed as `indexed_dataset` must already carry a FAISS index on
# its "embeddings" column; without it the retriever rejects the dataset.
data.add_faiss_index(column="embeddings")

# Prepare the tokenizer and retriever
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    indexed_dataset=data
)

# Load the model
# The retriever is attached here so that `model.generate` can fetch context
# passages itself; without it, generation has no documents to condition on.
model = RagSequenceForGeneration.from_pretrained(
    "facebook/rag-sequence-nq",
    retriever=retriever
)

# Function for answering questions
def answer_question(question):
    """Generate an answer to *question* with the module-level RAG model.

    The question is tokenized into PyTorch tensors, passed to
    ``model.generate`` with beam search (5 beams, 10 to 50 tokens, one
    returned sequence), and the first generated sequence is decoded with
    special tokens removed.

    Args:
        question: The question text to answer.

    Returns:
        The decoded text of the first generated sequence.
    """
    encoded = tokenizer(question, return_tensors="pt", truncation=True, padding=True)
    generation_options = {
        "num_beams": 5,
        "min_length": 10,
        "max_length": 50,
        "num_return_sequences": 1,
    }
    generated = model.generate(input_ids=encoded["input_ids"], **generation_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Gradio interface
def chatbot_interface(question):
    """Gradio callback: forward *question* to ``answer_question`` and return its result."""
    return answer_question(question)

# Build a simple text-in / text-out UI around `chatbot_interface`.
# NOTE: this rebinds the name `interface`.
interface = gr.Interface(
    fn=chatbot_interface,
    inputs="text",
    outputs="text",
    title="Question Answering Chatbot",
    description="Ask a question and get an answer based on the dataset."
)

# Launch the Gradio interface
# No arguments are passed, so Gradio's default launch settings apply.
interface.launch()
