<a href="https://colab.research.google.com/github/Vanimator11/Capstone-or-major-project-of-gitam/blob/main/Copy_of_Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!apt-get install -y tesseract-ocr libtesseract-dev poppler-utils
!pip install pytesseract pdf2image requests

import os
import cv2
import pytesseract
import numpy as np
from pdf2image import convert_from_path
from google.colab import files
from PIL import Image
import requests

# Tell pytesseract which Tesseract executable to invoke. /usr/bin/tesseract is
# the path used on Google Colab after the apt-get install step in this cell.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Function to upload files in Colab
def upload_files():
    """Open the Colab upload widget and return the uploaded file names.

    Returns:
        A list containing the keys of the mapping returned by
        ``files.upload()`` (one entry per uploaded PDF or image).
    """
    uploaded_by_name = files.upload()
    return [name for name in uploaded_by_name]

# Convert PDF pages to images
def pdf_to_images(pdf_path):
    """Render every page of a PDF to a PNG file for OCR processing.

    Pages are rasterised at 300 DPI and saved into a freshly created
    temporary directory. Writing ``page_N.png`` into the working directory
    instead could overwrite an unrelated file of the same name (for example
    an uploaded image), which the caller's cleanup would then delete.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        A list of PNG file paths, one per page, in page order. The caller is
        responsible for deleting these files when done.
    """
    import tempfile  # local import so the block does not depend on file-level imports

    output_dir = tempfile.mkdtemp(prefix="pdf_pages_")
    image_paths = []
    pages = convert_from_path(pdf_path, dpi=300)
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(output_dir, f"page_{page_number}.png")
        page.save(image_path, "PNG")
        image_paths.append(image_path)
    return image_paths

# Preprocess image to enhance OCR accuracy
def preprocess_image(image_path):
    """Load an image and binarise it to improve OCR accuracy.

    The image is converted to grayscale, smoothed with a 5x5 Gaussian blur,
    and binarised with Gaussian adaptive thresholding (block size 31,
    constant 2).

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image produced by
        ``cv2.adaptiveThreshold``.

    Raises:
        ValueError: If OpenCV cannot read or decode ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; without
    # this check the failure surfaces later as an opaque error in cvtColor.
    if image is None:
        raise ValueError(f"Could not read image file: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    return cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2
    )

# Extract text from an image using Tesseract OCR
def extract_text(image_path):
    """Return the text Tesseract recognises in an image file.

    The image is first passed through ``preprocess_image`` and the result is
    handed to ``pytesseract.image_to_string``.

    Args:
        image_path: Path of the image file to OCR.

    Returns:
        The raw string returned by Tesseract.
    """
    return pytesseract.image_to_string(preprocess_image(image_path))

# Extract specific sections based on markers
def extract_section(raw_text, start_marker, end_marker=None):
    """Extract one section of text delimited by marker substrings.

    The section starts at the first line containing ``start_marker`` and
    runs up to and including:

    * the first line (at or after the start) containing ``end_marker``,
      when ``end_marker`` is given; or
    * the first blank line after the start, when it is not.

    If no terminator is found the section extends to the end of the text.

    Args:
        raw_text: The full text to search, with lines separated by ``\\n``.
        start_marker: Substring identifying the first line of the section.
        end_marker: Optional substring identifying the last line of the
            section. When falsy, a blank line ends the section instead.

    Returns:
        The section's lines joined with ``\\n`` (terminating line included),
        or an empty string if ``start_marker`` never occurs.
    """
    section_lines = []
    inside_section = False

    for line in raw_text.split('\n'):
        if not inside_section:
            if start_marker not in line:
                # Still before the section: skip the line. A blank line here
                # must NOT end the search, otherwise any section preceded by
                # a blank line is never found.
                continue
            inside_section = True

        section_lines.append(line)

        if end_marker:
            if end_marker in line:
                break
        elif line.strip() == '':
            break

    return "\n".join(section_lines)

# Process uploaded files (PDFs and Images)
def process_files(file_paths):
    """Run OCR over a list of PDFs and images and concatenate the text.

    PDFs are converted to one image per page via ``pdf_to_images``; each
    page's text is preceded by a ``--- Page N ---`` header. Image files are
    OCR'd directly. Any other file type contributes an "unsupported format"
    notice to the output instead of text.

    Args:
        file_paths: Iterable of file path strings. The file type is decided
            by the (case-insensitive) file extension.

    Returns:
        A single string with the extracted text of all files, in input order.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp")
    chunks = []

    for file_path in file_paths:
        lowered = file_path.lower()

        if lowered.endswith(".pdf"):
            print(f"\n📄 Processing PDF: {file_path}\n")
            image_paths = pdf_to_images(file_path)
            try:
                for i, image_path in enumerate(image_paths):
                    chunks.append(f"\n\n--- Page {i+1} ---\n")
                    chunks.append(extract_text(image_path))
            finally:
                # Always remove the temporary page images, even when OCR
                # fails part-way through the document.
                for image_path in image_paths:
                    if os.path.exists(image_path):
                        os.remove(image_path)
        elif lowered.endswith(image_suffixes):
            print(f"\n🖼️ Processing Image: {file_path}\n")
            chunks.append(extract_text(file_path))
        else:
            chunks.append(f"\n❌ Unsupported file format: {file_path}\n")

    return "".join(chunks)

# GPT-based medical report summarization
def summarize_medical_text(text, timeout=60):
    """Ask the RapidAPI-hosted GPT endpoint for a patient-friendly summary.

    Args:
        text: The extracted report text to summarise.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, ``requests`` can block indefinitely on an
            unresponsive server.

    Returns:
        The value of the ``result`` field of the JSON response on success.
        On failure a human-readable string is returned instead of raising:
        ``"Request Error: ..."`` for ``requests`` errors (including
        timeouts and non-2xx statuses), ``"Error: ..."`` for anything else,
        or ``"Error: Unexpected response format"`` when ``result`` is absent.
    """
    url = "https://chatgpt-42.p.rapidapi.com/gpt4"

    # SECURITY: an API key is embedded in this source file. Prefer supplying
    # it through the RAPIDAPI_KEY environment variable; the literal is kept
    # only as a backward-compatible fallback and should be rotated/removed.
    api_key = os.environ.get(
        "RAPIDAPI_KEY",
        "43c7a0636amsh1e9374e1b64116dp11f641jsnc1c66647e6b9",
    )

    headers = {
        "content-type": "application/json",
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "chatgpt-42.p.rapidapi.com"
    }
    payload = {
        "messages": [
            {
                "role": "user",
                "content": f"""You are a medical expert. Please summarize this medical report in 4-5 lines
                focusing on key findings and make sure it is easily understandable by patients in simple words do not include patient information:\n\n{text}"""
            }
        ],
        "web_access": False
    }

    # Deliberately best-effort: every failure is reported as a string so the
    # notebook pipeline keeps running.
    try:
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        response.raise_for_status()
        response_json = response.json()
        return response_json.get('result', "Error: Unexpected response format")
    except requests.exceptions.RequestException as e:
        return f"Request Error: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"

# --- Pipeline entry point: upload -> OCR -> save -> summarise -> print ---
file_paths = upload_files()
extracted_text = process_files(file_paths)

# Persist the raw OCR output next to the notebook
with open('medical_report.txt', 'w', encoding="utf-8") as report_file:
    report_file.write(extracted_text)

# Echo the OCR output so it can be inspected in the cell output
print("\nExtracted Text:\n", extracted_text)

# Request the plain-language summary
summary = summarize_medical_text(extracted_text)

# Put each sentence on its own line (every ". " becomes a newline)
formatted_summary = summary.replace(". ", "\n")

# Show the summary in bold using ANSI escape codes
print(f"\n\033[1m{formatted_summary}\033[0m")

Reading package lists... Done

Reading state information... Done
The following additional packages will be installed:
  libarchive-dev libleptonica-dev tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev poppler-utils tesseract-ocr tesseract-ocr-eng
  tesseract-ocr-osd
0 upgraded, 7 newly installed, 0 to remove and 29 not upgraded.
Need to get 8,746 kB of archives.
After this operation, 32.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.3 [581 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Get:5 http://archive.ubuntu.com/ubuntu j

Saving WhatsApp Image 2025-01-21 at 20.12.17_980495ea.jpg to WhatsApp Image 2025-01-21 at 20.12.17_980495ea.jpg

🖼️ Processing Image: WhatsApp Image 2025-01-21 at 20.12.17_980495ea.jpg


Extracted Text:
  

DRLOGY IMAGING CENTER = & si2ssseres ostzsasers
G& & Ray | CT-SCan | MRI | USG 1) dtogyimaging@drtogy.com
106-108, SAMAR VIRION COMPLEX, HEALTHCARE ROAD, OPPOSITE HEALTHCARE COMPLEX. MUMBAS- 600578

NX! ears artogy comm

 

 

. Registered on:
Yashvi M. Patel Re: PIO 7556 02:31 PM 02 Dec, 2X
Age: 21 Years A Aptlo 22028252 Reported on:
‘Sex: Female Ref.By : Dr. Hiren Shah 04:35 PM 02 Dec, 2X
CT ABDOMEN
Pont
Abdomen : Plots & Contract
‘Tochesque —
GT ecen of ove sbciomen hes been dene wih administration of oral and inreraneue contrast media.
e
Feange / SO™ \
¢ tier shows nommal size, shape end scienuotion. He mess lesion er ceicificetion im te hepatic porenchyens, The porta hepetis io nermel Ma
AR oF C80 atetation, 6 (Sah 1

© (Gall bladder reese normal henan snd walle with normal niz

In [None]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.3.1-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.3.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.3.1


In [None]:
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.units import inch


def save_summary_as_pdf(summary_text, output_filename="medical_summary.pdf"):
    """Write a plain-text summary to a letter-size PDF with a title.

    The document consists of a "Medical Report Summary" title, a 0.2 inch
    spacer, and one body paragraph per non-blank line of ``summary_text``.

    Args:
        summary_text: Plain text to render; split on ``\\n`` into paragraphs.
            Blank lines are skipped and each line is stripped.
        output_filename: Path of the PDF file to create.
    """
    # Local import so the block does not depend on file-level imports.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_filename, pagesize=letter)
    styles = getSampleStyleSheet()

    # Body style: 12pt text on 14pt leading with 12pt of space after each paragraph.
    summary_style = styles['BodyText']
    summary_style.spaceAfter = 12
    summary_style.fontSize = 12
    summary_style.leading = 14

    content = [
        Paragraph("Medical Report Summary", styles['Title']),
        Spacer(1, 0.2 * inch),
    ]

    for raw_line in summary_text.split("\n"):
        line = raw_line.strip()
        if line:
            # Paragraph parses its text as XML-like markup, so characters
            # such as "<" and "&" in the summary must be escaped to be
            # rendered literally instead of breaking the build.
            content.append(Paragraph(escape(line), summary_style))

    doc.build(content)
    print(f"\n📄 Summary saved as PDF: {output_filename}")


# Export the formatted summary to a PDF; any failure is printed rather than
# raised so the notebook cell still completes.
try:
    save_summary_as_pdf(formatted_summary)
except Exception as pdf_error:
    print(f"Error: {pdf_error}")


📄 Summary saved as PDF: medical_summary.pdf
