# MediReportAssist - Google Colab Setup

This notebook sets up and runs the Patient Discharge Instructions Assistant in Google Colab.

## Features
- 📄 Document Upload (PDF, TXT, DOCX)
- 🔍 Natural Language Queries
- 🤖 RAG System with Hugging Face Transformers
- 💬 Query Manager for clarification
- 🌐 Public URL via ngrok
- 🔗 Direct GitHub integration (no manual file uploads needed!)


## Step 1: Install Dependencies


In [None]:
# Install required packages
!pip install -q fastapi uvicorn[standard] python-multipart
!pip install -q chromadb sentence-transformers
!pip install -q pypdf2 python-docx
!pip install -q transformers torch accelerate
!pip install -q pyngrok

print("‚úÖ All dependencies installed!")


## Step 2: Clone from GitHub


In [None]:
# Clone the repository from GitHub
# ⚠️ IMPORTANT: Replace with your actual GitHub repository URL
# Example: "https://github.com/yourusername/MediReportAssist.git"
GITHUB_REPO = "https://github.com/yourusername/MediReportAssist.git"  # ⚠️ UPDATE THIS!

import os
import subprocess

# Name of the checkout directory used throughout this notebook.
REPO_NAME = "MediReportAssist"

# This cell finishes by changing into the checkout, so on a re-run the current
# directory may already BE the checkout. Detect that case instead of looking
# for a nested "MediReportAssist" folder, which would otherwise trigger a
# second clone inside the first one.
current_dir = os.getcwd()
already_inside = (
    os.path.basename(current_dir) == REPO_NAME
    and os.path.isdir(os.path.join(current_dir, ".git"))
)
repo_dir = current_dir if already_inside else os.path.join(current_dir, REPO_NAME)

# Clone the repository
if not os.path.exists(repo_dir):
    print("📥 Cloning repository from GitHub...")
    print(f"   Repository: {GITHUB_REPO}")
    try:
        # An explicit target keeps the folder name stable whatever the URL is.
        subprocess.run(["git", "clone", GITHUB_REPO, repo_dir], check=True)
        print("✅ Repository cloned successfully!")
    except subprocess.CalledProcessError as e:
        print(f"❌ Error cloning repository: {e}")
        print("   Please check:")
        print("   1. The repository URL is correct")
        print("   2. The repository is public (or you have access)")
        print("   3. Git is available in Colab")
        raise
else:
    print("📁 Repository already exists, updating...")
    try:
        # cwd= runs the pull inside the checkout without moving this process,
        # so no failure mode can leave the notebook in the wrong directory.
        subprocess.run(["git", "pull"], cwd=repo_dir, check=True)
        print("✅ Repository updated!")
    except subprocess.CalledProcessError:
        print("⚠️ Could not update repository (this is okay if you made local changes)")

# Change to project directory
if os.path.isdir(repo_dir):
    os.chdir(repo_dir)
else:
    print("⚠️ Warning: MediReportAssist directory not found!")
    print("   Make sure the repository URL is correct and the repository exists.")

print(f"✅ Working directory: {os.getcwd()}")


## Step 3: Verify Files


In [None]:
# Verify that required files exist
import os
from pathlib import Path

# Files that must always be present.
required_files = [
    "query_manager.py",
    "document_processor.py",
    "static/index.html",
]

# Groups in which any ONE file is sufficient. The Colab-specific variant is
# listed first (preferred); the regular version is the fallback.
app_files = ["app_colab.py", "app.py"]
rag_files = ["rag_system_colab.py", "rag_system.py"]

print("🔍 Checking for required files...")
missing_files = []

# Check for the app entry point (either variant satisfies the requirement)
app_found = [f for f in app_files if os.path.exists(f)]
if app_found:
    print(f"✅ Found app: {app_found[0]}")
else:
    print(f"❌ Missing app files: {', '.join(app_files)}")
    missing_files.extend(app_files)

for file in required_files:
    if os.path.exists(file):
        print(f"✅ Found: {file}")
    else:
        print(f"❌ Missing: {file}")
        missing_files.append(file)

# Check for RAG system files (either variant satisfies the requirement)
rag_found = [f for f in rag_files if os.path.exists(f)]
if rag_found:
    print(f"✅ Found RAG system: {rag_found[0]}")
else:
    print(f"❌ Missing RAG system files: {', '.join(rag_files)}")
    missing_files.extend(rag_files)

if missing_files:
    print(f"\n⚠️ Warning: Some files are missing: {', '.join(missing_files)}")
    print("   Make sure your GitHub repository contains all required files.")
else:
    print("\n✅ All required files are present!")


## Step 4: Configure Settings


In [None]:
import os

# Enable GPU (Runtime -> Change runtime type -> GPU)
USE_GPU = True  # Set to False if no GPU available

# Use 8-bit quantization to save memory
USE_8BIT = True

# Choose LLM model
LLM_MODEL = "gpt2"  # Start with small model

# Database path (use local directory - data persists during session).
# The expected Colab checkout location is used when it exists. Otherwise
# (checkout not created yet, or the notebook runs outside Colab) fall back to
# the current directory instead of creating a stray /content/MediReportAssist
# tree that a later clone step would mistake for an existing checkout.
COLAB_PROJECT_DIR = "/content/MediReportAssist"
project_dir = COLAB_PROJECT_DIR if os.path.isdir(COLAB_PROJECT_DIR) else os.getcwd()
DB_PATH = os.path.join(project_dir, "data")
os.makedirs(DB_PATH, exist_ok=True)

# Set environment variables (booleans are exported as "true"/"false")
os.environ["USE_GPU"] = str(USE_GPU).lower()
os.environ["USE_8BIT"] = str(USE_8BIT).lower()
os.environ["LLM_MODEL"] = LLM_MODEL
os.environ["DB_PATH"] = DB_PATH

print("✅ Configuration:")
print(f"   GPU: {USE_GPU}")
print(f"   8-bit: {USE_8BIT}")
print(f"   LLM Model: {LLM_MODEL}")
print(f"   DB Path: {DB_PATH}")
print("\n💡 Note: Data persists during the Colab session.")
print("   For permanent storage, consider using Google Drive or committing to GitHub.")


## Step 5: Start Server with Public URL


In [None]:

from pyngrok import ngrok
import uvicorn
from threading import Thread
import socket
import time
import os

# Optional: Add your ngrok token here (uncomment and add token)
# ngrok.set_auth_token("your-ngrok-token-here")

PORT = 8000            # local port shared by uvicorn and the ngrok tunnel
STARTUP_TIMEOUT = 120  # max seconds to wait for the app to accept connections


def port_is_open(port, host="127.0.0.1"):
    """Return True when something accepts TCP connections on host:port."""
    try:
        with socket.create_connection((host, port), timeout=1):
            return True
    except OSError:
        return False


# Start FastAPI server
def run_server():
    """Serve the selected app with uvicorn; blocks until the server exits."""
    uvicorn.run(f"{app_file}:app", host="0.0.0.0", port=PORT, log_level="info")


# Determine which app file to use
app_file = "app_colab" if os.path.exists("app_colab.py") else "app"
print(f"📝 Using app file: {app_file}.py")

# Start ngrok tunnel
tunnel = ngrok.connect(PORT)
# Extract the public URL string from the tunnel object
public_url = tunnel.public_url
print(f"🌐 Public URL: {public_url}")
print(f"📱 Access your app at: {public_url}/static/index.html")
print(f"📚 API Docs: {public_url}/docs")
print("\n⚠️ IMPORTANT: Make sure to access the app using the ngrok URL above!")
print("   If you see 'Failed to fetch' or 'ERR_CONNECTION_REFUSED' errors:")
print(f"   1. You MUST access via: {public_url}/static/index.html")
print(f"   2. Do NOT use localhost:{PORT} - it won't work from your browser!")
print(f"   3. If you opened localhost, add this to the URL: ?api_url={public_url}")
print("   4. Or use the 'Set API URL' button that appears on the page\n")

if port_is_open(PORT):
    # Typical when this cell is re-run: the earlier server thread still owns
    # the port, and a second uvicorn could only fail with "address in use".
    print(f"⚠️ Port {PORT} is already in use - assuming a server from a previous run is still up.")
    server_ready = True
else:
    server_thread = Thread(target=run_server, daemon=True)
    server_thread.start()

    # Poll until the port accepts connections rather than sleeping a fixed
    # time; stop early if the server thread dies (e.g. the app failed to import).
    server_ready = False
    deadline = time.monotonic() + STARTUP_TIMEOUT
    while time.monotonic() < deadline and server_thread.is_alive():
        if port_is_open(PORT):
            server_ready = True
            break
        time.sleep(0.5)

if server_ready:
    print("✅ Server is running!")
else:
    print("❌ Server did not start - check the log output above for errors.")
    print(f"   (waited up to {STARTUP_TIMEOUT}s for port {PORT})")

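## Step 6: Test the API (Optional)

The code cell for this step is the last cell of the notebook. Run it while the server from Step 5 is still up, i.e. before running the cleanup cell in Step 7.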


## Step 7: Stop Server and Clear Cache

Use this cell to stop the server and clear all cached data when you're done. Note that it also deletes the vector database folders and any uploaded files it finds.


In [None]:
# Stop Server and Clear Cache
import glob
import os
import shutil
import subprocess
from pyngrok import ngrok

print("🛑 Stopping server and clearing cache...\n")

# 1. Stop ngrok tunnels
try:
    ngrok.kill()
    print("✅ Stopped ngrok tunnels")
except Exception as e:
    print(f"⚠️ Could not stop ngrok: {e}")

# 2. Kill any running uvicorn processes
# pkill exits with 0 when at least one process was signalled and 1 when
# nothing matched, so report what actually happened instead of always
# claiming success.
try:
    result = subprocess.run(["pkill", "-f", "uvicorn"],
                            capture_output=True,
                            text=True)
    if result.returncode == 0:
        print("✅ Stopped uvicorn server processes")
    elif result.returncode == 1:
        print("⚠️ No separate uvicorn process found")
        print("   (a server started in a thread of this notebook keeps running until the runtime restarts)")
    else:
        print(f"⚠️ pkill failed (exit code {result.returncode}): {result.stderr.strip()}")
except OSError as e:
    # Raised when the pkill executable itself cannot be launched.
    print(f"⚠️ Could not stop uvicorn: {e}")

# 3. Clear Python cache
print("\n🧹 Clearing Python cache...")
cache_dirs = ["__pycache__", ".pytest_cache", ".mypy_cache"]
for root, dirs, files in os.walk("."):
    for cache_dir in [d for d in dirs if d in cache_dirs]:
        cache_path = os.path.join(root, cache_dir)
        try:
            shutil.rmtree(cache_path)
            print(f"   ✅ Removed: {cache_path}")
        except OSError as e:
            print(f"   ⚠️ Could not remove {cache_path}: {e}")
    # Prune in place so os.walk does not descend into folders just deleted.
    dirs[:] = [d for d in dirs if d not in cache_dirs]

# 4. Clear ChromaDB data (if exists)
print("\n🗄️ Clearing vector database cache...")
chroma_dirs = ["chroma_db", "data/chroma_db", "chroma"]
for chroma_dir in chroma_dirs:
    if os.path.exists(chroma_dir):
        try:
            shutil.rmtree(chroma_dir)
            print(f"   ✅ Removed: {chroma_dir}")
        except OSError as e:
            print(f"   ⚠️ Could not remove {chroma_dir}: {e}")

# 5. Clear uploads directory
print("\n📁 Clearing uploads...")
upload_dirs = ["uploads", "data/uploads"]
for upload_dir in upload_dirs:
    if not os.path.exists(upload_dir):
        continue
    try:
        entries = os.listdir(upload_dir)
    except OSError as e:
        print(f"   ⚠️ Could not clear {upload_dir}: {e}")
        continue
    # Remove files but keep directory (sub-directories are left untouched)
    for filename in entries:
        file_path = os.path.join(upload_dir, filename)
        try:
            if os.path.isfile(file_path):
                os.remove(file_path)
                print(f"   ✅ Removed file: {filename}")
        except OSError as e:
            print(f"   ⚠️ Could not remove {filename}: {e}")

# 6. Clear Python bytecode files
print("\n🐍 Clearing Python bytecode...")
pyc_count = 0
for root, dirs, files in os.walk("."):
    # Skip hidden directories and common exclusions
    dirs[:] = [d for d in dirs if not d.startswith('.') and d not in ['node_modules', 'venv', 'env']]
    for file in files:
        if file.endswith(('.pyc', '.pyo')):
            pyc_path = os.path.join(root, file)
            try:
                os.remove(pyc_path)
                pyc_count += 1
            except OSError as e:
                # Report the failure rather than hiding it; cleanup continues.
                print(f"   ⚠️ Could not remove {pyc_path}: {e}")
if pyc_count > 0:
    print(f"   ✅ Removed {pyc_count} .pyc/.pyo files")

# 7. Clear test files
print("\n🧪 Clearing test files...")
test_files = ["test_discharge.txt", "test_*.txt", "test_*.pdf"]
for pattern in test_files:
    for file_path in glob.glob(pattern):
        try:
            os.remove(file_path)
            print(f"   ✅ Removed: {file_path}")
        except OSError as e:
            print(f"   ⚠️ Could not remove {file_path}: {e}")

print("\n✅ Cleanup complete!")
print("\n💡 To restart the server, run Step 5 again.")


In [None]:
# Test the API endpoints
# Requires `public_url` from the server-start cell, and the server must still
# be running when this cell executes.
import requests
import time

# Wait a bit more to ensure server is ready
time.sleep(2)

# Test 1: Health check
try:
    response = requests.get(f"{public_url}/health", timeout=5)
    # Turn 4xx/5xx answers into an error instead of printing a success banner.
    response.raise_for_status()
    print("✅ Health Check:")
    print(response.json())
except Exception as e:
    print(f"❌ Health check failed: {e}")

# Test 2: Create a test file and upload
try:
    # Create a sample test file
    test_content = """DISCHARGE SUMMARY
    
Patient Name: Test Patient
Date: 2024-01-15

MEDICATIONS:
- Amoxicillin 500mg: Take twice daily (morning and evening) for 7 days
- Ibuprofen 200mg: Take as needed for pain, maximum 3 times per day

DIETARY RESTRICTIONS:
- Avoid spicy foods for 2 weeks
- No alcohol for 1 week
- Drink plenty of water

ACTIVITY RESTRICTIONS:
- No heavy lifting for 4 weeks
- Light walking is encouraged
- Avoid strenuous exercise

FOLLOW-UP:
- Schedule appointment in 2 weeks
- Contact doctor if fever > 101°F or severe pain
"""

    # Save test file. The text contains a non-ASCII degree sign, so the
    # encoding is set explicitly instead of relying on the platform default.
    test_file_path = "test_discharge.txt"
    with open(test_file_path, "w", encoding="utf-8") as f:
        f.write(test_content)

    # Upload test file
    with open(test_file_path, "rb") as f:
        files = {"file": ("test_discharge.txt", f, "text/plain")}
        response = requests.post(f"{public_url}/upload", files=files, timeout=30)

    if response.status_code == 200:
        print("\n✅ Upload Test Successful:")
        print(response.json())
    else:
        print("\n❌ Upload Test Failed:")
        print(f"Status: {response.status_code}")
        print(f"Response: {response.text}")

except Exception as e:
    print(f"\n❌ Upload test failed: {e}")
    print("This is okay - you can test via the web interface instead!")
