In [None]:
!pip install -U bitsandbytes

In [None]:
!pip install unsloth peft accelerate transformers

In [None]:
# Print this machine's public IPv4 address.
# NOTE(review): presumably this is the value localtunnel asks for as the
# "tunnel password" when the public URL is first opened — confirm.
!curl ipv4.icanhazip.com

In [None]:
!pip install streamlit
!npm install -g localtunnel

In [13]:
%%writefile app3.py
import os
import warnings
import asyncio

# Process-wide environment flags, applied in one pass before the heavy
# third-party imports that follow:
#   * the two STREAMLIT_* entries ask Streamlit not to run its file watchers,
#   * TOKENIZERS_PARALLELISM turns off tokenizer worker parallelism,
#   * TF_CPP_MIN_LOG_LEVEL=2 lowers native (C++) log verbosity.
os.environ.update(
    {
        "STREAMLIT_WATCHER_TYPE": "none",
        "STREAMLIT_SERVER_ENABLE_STATIC_FILE_WATCHER": "false",
        "TOKENIZERS_PARALLELISM": "false",
        "TF_CPP_MIN_LOG_LEVEL": "2",
    }
)

# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings("ignore")

# Make sure the current thread has an asyncio event loop: if asking for one
# fails with RuntimeError, create a new loop and register it.
try:
    asyncio.get_event_loop()
except RuntimeError:
    asyncio.set_event_loop(asyncio.new_event_loop())

import streamlit as st
import torch
from transformers import BitsAndBytesConfig
from unsloth import FastLanguageModel

# Page-level Streamlit settings: tab title and a centered layout.
st.set_page_config(page_title="🩺 MedQA Chatbot", layout="centered")

# Target device for the model: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

@st.cache_resource
def load_model():
    """Load the 4-bit quantized MedQA model together with its tokenizer.

    The result is wrapped in ``st.cache_resource`` so the load is shared
    rather than repeated on every script run.

    Returns:
        tuple: ``(model, tokenizer)`` as produced by
        ``FastLanguageModel.from_pretrained``, after
        ``FastLanguageModel.for_inference`` has been applied to the model.
    """
    # NF4 4-bit weights with double quantization; compute dtype is float16.
    bnb_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    llm, tok = FastLanguageModel.from_pretrained(
        model_name="Vijayendra/Phi4-MedQA",
        max_seq_length=2048,
        dtype=None,
        quantization_config=bnb_settings,
        device_map=device,
    )
    FastLanguageModel.for_inference(llm)

    # Tokenizer already knows its end-of-sequence token: nothing to patch.
    if tok.eos_token is not None:
        return llm, tok

    # Fallback: register "</s>" as EOS, reuse it for padding, and keep the
    # model config's EOS id in sync with the tokenizer.
    tok.eos_token = "</s>"
    tok.pad_token = tok.eos_token
    tok.eos_token_id = tok.convert_tokens_to_ids("</s>")
    llm.config.eos_token_id = tok.eos_token_id
    return llm, tok

# Module-level model/tokenizer pair used by the chat handler below;
# load_model() is wrapped in st.cache_resource.
model, tokenizer = load_model()

def format_prompt(question):
    """Wrap a user question in the system/user/assistant prompt template.

    Args:
        question: The text to place in the ``<|user|>`` turn.

    Returns:
        str: The full prompt, ending with the ``<|assistant|>`` tag so that
        whatever follows is the model's answer.
    """
    system_lines = (
        "You are a medical expert providing clear, concise answers to medical questions.",
        "Answer the question directly and stay strictly on topic. Keep your answer professional yet understandable.</s>",
    )
    segments = [
        "<|system|>",
        *system_lines,
        "<|user|>",
        f"{question}</s>",
        "<|assistant|>",
    ]
    return "\n".join(segments)

def clean_response(response):
    """Reduce decoded model output to just the answer text.

    Steps, in order:
      1. Keep only what follows the last ``<|assistant|>`` marker, if any.
      2. Delete leftover ``<|system|>``, ``<|user|>`` and ``</s>`` markers.
      3. Drop everything from the first ``Question:`` onwards.
      4. Trim surrounding whitespace.

    Args:
        response (str): Decoded text produced by the model.

    Returns:
        str: The cleaned answer.
    """
    # rpartition leaves the text untouched (in slot 2) when the marker is absent.
    answer = response.rpartition("<|assistant|>")[2]

    for marker in ("<|system|>", "<|user|>", "</s>"):
        answer = answer.replace(marker, "")

    # partition leaves the text untouched (in slot 0) when "Question:" is absent.
    answer = answer.partition("Question:")[0]

    return answer.strip()

# App Title and Intro
st.title("🩺 MedQA Chatbot")
st.markdown("_Ask a medical question and receive a concise, professional answer._")

# Clear chat button: wipe the stored history, then rerun so the page redraws empty.
if st.button("🧹 Clear Chat"):
    st.session_state.messages = []
    st.rerun()

# Initialize chat session (first run of this browser session)
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the stored conversation; Streamlit re-executes the script on every interaction.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

# User input box
user_input = st.chat_input("Enter your medical question:")

# If user submits a message
if user_input:
    # Echo the question and record it in the history
    with st.chat_message("user"):
        st.markdown(user_input)
    st.session_state.messages.append({"role": "user", "content": user_input})

    # Generate response from model
    with st.chat_message("assistant"):
        with st.spinner("💬 Generating answer..."):
            try:
                prompt = format_prompt(user_input)
                inputs = tokenizer([prompt], return_tensors="pt", truncation=True, max_length=2048).to(model.device)
                prompt_len = inputs["input_ids"].shape[-1]

                # Sampling-based decoding. `early_stopping` is intentionally not
                # passed: it only applies to beam search and is unused here.
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.eos_token_id,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    repetition_penalty=1.1,
                    use_cache=True,
                )

                # The returned sequence starts with the prompt tokens. Decode only
                # the newly generated part so the system prompt and the question
                # are never echoed back, even when skip_special_tokens strips the
                # "<|assistant|>" marker that clean_response would split on.
                generated_ids = outputs[0][prompt_len:]
                response = tokenizer.decode(generated_ids, skip_special_tokens=True)
                answer = clean_response(response)

                # Display response and store it in the history
                st.markdown(answer)
                st.caption(f"🔢 Tokens used: {prompt_len}")
                st.session_state.messages.append({"role": "assistant", "content": answer})
            except Exception as e:
                # Surface the failure in the chat instead of crashing the app.
                st.error(f"❗ An error occurred: {str(e)}")

Writing app3.py


In [16]:
# Start the Streamlit server for app3.py in the background (`&`), then expose
# local port 8501 through localtunnel so the app is reachable from outside.
!streamlit run app3.py & npx localtunnel --port 8501

[1G[0K⠙
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.19.2.2:8501[0m
[34m  External URL: [0m[1mhttp://35.197.118.234:8501[0m
[0m
[1G[0K⠴[1G[0Kyour url is: https://yummy-beds-rush.loca.lt
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
2025-04-18 19:28:57.149922: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745004537.176599    2977 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745004537.185549    2977 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to

In [None]:
import shutil
from pathlib import Path

# Empty the Kaggle working directory instead of deleting the directory itself:
# removing the notebook's own working root would leave later relative-path
# operations (and any re-run of the cells above) without a place to write.
working_dir = Path("/kaggle/working")
if working_dir.is_dir():
    for entry in working_dir.iterdir():
        # Real directories are removed recursively; files and symlinks are unlinked.
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()