<a href="https://colab.research.google.com/github/SS-2005/MedPic_Detector/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Complete Medical Image Classification System for Google Colab
!pip install tensorflow opencv-python requests beautifulsoup4 pymupdf Pillow tqdm pandas

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [2]:
import os
import cv2
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from io import BytesIO
import fitz  # PyMuPDF
import re
import time
from tqdm import tqdm
from PIL import Image
import tensorflow as tf
from google.colab import files
import shutil

In [3]:
# ======================
# 1. IMAGE EXTRACTION
# ======================
# Browser-like HTTP request headers shared by every `requests.get` call in the
# extraction helpers below (page fetch, per-image fetch, and remote PDF fetch).
HEADERS = {
    # Desktop Chrome user-agent string.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    # Accepts HTML as well as common image MIME types.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://www.google.com/',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

def extract_images_from_url(url, output_dir="extracted_images"):
    """Download every <img> referenced by an HTML page.

    The page at ``url`` is fetched and parsed; for each ``<img>`` tag the
    ``src`` (or, failing that, ``data-src``) attribute is resolved against
    ``url`` and downloaded into ``output_dir`` as ``url_image_<n>.<ext>``,
    where ``n`` is the 1-based position of the tag in the page.

    Args:
        url: Address of the HTML page to scan.
        output_dir: Directory that receives the files (created if missing).

    Returns:
        list[str]: Paths of the images that were saved successfully. Errors
        are printed rather than raised, so the list may be empty.
    """
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        img_tags = soup.find_all('img')

        for i, img_tag in enumerate(img_tags):
            img_url = img_tag.get('src') or img_tag.get('data-src')
            if not img_url:
                continue

            img_url = urljoin(url, img_url)

            # Inline `data:` URIs (and other non-HTTP schemes) cannot be
            # fetched with requests; skip them instead of logging an error.
            if not img_url.lower().startswith(('http://', 'https://')):
                continue

            img_path = None  # set only once we start writing a file
            try:
                # The context manager releases the streamed connection on
                # every exit path, including the `continue` below.
                with requests.get(img_url, headers=HEADERS, stream=True, timeout=10) as img_response:
                    img_response.raise_for_status()

                    # Header values are case-insensitive; normalise first.
                    content_type = img_response.headers.get('content-type', '').lower()
                    if 'image' not in content_type:
                        continue

                    # Get file extension
                    ext = 'jpg'  # default
                    if 'image/' in content_type:
                        # "image/svg+xml; charset=utf-8" -> "svg"
                        ext = content_type.split('/')[-1].split(';')[0].split('+')[0]
                    else:
                        # Drop the query string before looking for a suffix so
                        # dots inside it ("a.png?v=1.2") are not mistaken for one.
                        url_name = img_url.split('?')[0].split('/')[-1]
                        if '.' in url_name:
                            ext = url_name.rsplit('.', 1)[-1]

                    # Clean extension
                    ext = re.sub(r'[^a-z0-9]', '', ext.lower())[:5]
                    if not ext:
                        ext = 'jpg'

                    filename = f"url_image_{i+1}.{ext}"
                    img_path = os.path.join(output_dir, filename)

                    with open(img_path, 'wb') as f:
                        for chunk in img_response.iter_content(8192):
                            f.write(chunk)

                image_paths.append(img_path)
                print(f"Saved: {img_path}")

            except Exception as e:
                print(f"Error downloading {img_url}: {str(e)}")
                # Do not leave a truncated file behind after a failed download.
                if img_path and os.path.exists(img_path):
                    try:
                        os.remove(img_path)
                    except OSError:
                        pass

    except Exception as e:
        print(f"Error processing URL: {str(e)}")

    return image_paths

def extract_images_from_pdf(pdf_path, output_dir="extracted_images"):
    """Extract every embedded image from a PDF.

    Args:
        pdf_path: Local file path, or an ``http(s)://`` URL of a PDF which is
            downloaded into memory first.
        output_dir: Directory that receives the files (created if missing).
            Files are named ``pdf_page<page>_img<index>.<ext>`` with 1-based
            page and image numbers.

    Returns:
        list[str]: Paths of the images that were written. Errors are printed
        rather than raised, so the list may be partial or empty.
    """
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []
    # Same scheme test as extract_images(), so both agree on what a URL is.
    is_url = re.match(r'https?://', pdf_path, re.I) is not None

    doc = None
    try:
        if is_url:
            # A timeout keeps an unresponsive server from hanging the cell.
            response = requests.get(pdf_path, headers=HEADERS, timeout=30)
            response.raise_for_status()
            pdf_data = BytesIO(response.content)
            doc = fitz.open(stream=pdf_data, filetype="pdf")
        else:
            doc = fitz.open(pdf_path)

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            image_list = page.get_images(full=True)

            for img_index, img in enumerate(image_list, start=1):
                # Handle failures per image so one bad object does not
                # discard the remaining images of the document.
                try:
                    xref = img[0]  # first entry of each get_images() item
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    ext = base_image["ext"]

                    filename = f"pdf_page{page_num+1}_img{img_index}.{ext}"
                    img_path = os.path.join(output_dir, filename)

                    with open(img_path, "wb") as f:
                        f.write(image_bytes)
                except Exception as e:
                    print(f"Error extracting image {img_index} on page {page_num+1}: {str(e)}")
                    continue

                image_paths.append(img_path)
                print(f"Saved: {img_path}")

    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
    finally:
        # Close the document even when extraction failed part-way through.
        if doc is not None:
            doc.close()

    return image_paths

def extract_images(input_source, output_dir="extracted_images"):
    """Dispatch image extraction based on the kind of input.

    * ``http(s)://`` URL whose path ends in ``.pdf`` -> ``extract_images_from_pdf``
    * any other ``http(s)://`` URL                    -> ``extract_images_from_url``
    * local path ending in ``.pdf``                   -> ``extract_images_from_pdf``

    Args:
        input_source: URL or PDF file path; surrounding whitespace is ignored.
        output_dir: Directory passed through to the extractor.

    Returns:
        list[str]: Paths of the extracted images.

    Raises:
        ValueError: If ``input_source`` is neither a URL nor a ``.pdf`` path.
    """
    input_source = input_source.strip()

    if re.match(r'https?://', input_source, re.I):
        # Ignore any query string / fragment when checking for a PDF link;
        # otherwise a remote PDF would be parsed as HTML and yield nothing.
        url_path = input_source.split('#', 1)[0].split('?', 1)[0]
        if url_path.lower().endswith('.pdf'):
            print(f"Processing PDF: {input_source}")
            return extract_images_from_pdf(input_source, output_dir)
        print(f"Processing URL: {input_source}")
        return extract_images_from_url(input_source, output_dir)
    elif input_source.lower().endswith('.pdf'):
        print(f"Processing PDF: {input_source}")
        return extract_images_from_pdf(input_source, output_dir)
    else:
        raise ValueError("Input must be a valid URL or PDF file path")

In [4]:
# ======================
# 2. MEDICAL CLASSIFIER
# ======================
class MedicalImageClassifier:
    """Binary medical / non-medical image classifier around a Keras model.

    Attributes:
        img_size: Size handed to ``cv2.resize`` as ``(width, height)``.
        class_names: Label names indexed by predicted class (0 or 1).
        using_fallback: True when the saved model could not be loaded and the
            MobileNetV2-based fallback (with an untrained output layer) is in
            use.
        model: The loaded (or fallback) Keras model.
    """

    def __init__(self, model_path='best_model.h5', img_size=(224, 224)):
        # img_size must be set before load_model() runs: the fallback
        # architecture is sized from it.
        self.img_size = img_size
        self.class_names = ['Non-Medical', 'Medical']
        self.using_fallback = False
        self.model = self.load_model(model_path)

    def load_model(self, model_path):
        """Load model with multiple fallback strategies.

        Tries, in order: a plain ``load_model``; ``load_model`` with custom
        metric objects; and finally builds a frozen ImageNet MobileNetV2 with
        a fresh sigmoid head.

        Args:
            model_path: Path of the saved Keras model.

        Returns:
            A Keras model producing one sigmoid output per image.
        """
        try:
            # First try loading normally
            return tf.keras.models.load_model(model_path)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit propagate.
            try:
                # Try with custom metrics
                custom_objects = {
                    'precision': tf.keras.metrics.Precision(name='precision'),
                    'recall': tf.keras.metrics.Recall(name='recall'),
                    'auc': tf.keras.metrics.AUC(name='auc')
                }
                return tf.keras.models.load_model(model_path, custom_objects=custom_objects)
            except Exception as e:
                print(f"Error loading model: {e}")
                print("Using MobileNetV2 as fallback model")
                # The Dense head below is freshly initialised and never
                # trained, so its scores do not reflect medical content.
                print("WARNING: fallback model has an untrained output layer; "
                      "its predictions are not meaningful")
                self.using_fallback = True

                # cv2.resize takes (width, height) and returns arrays shaped
                # (height, width, channels); size the network to match.
                width, height = getattr(self, 'img_size', (224, 224))
                base_model = tf.keras.applications.MobileNetV2(
                    input_shape=(height, width, 3),
                    include_top=False,
                    weights='imagenet',
                    pooling='avg'
                )
                base_model.trainable = False
                model = tf.keras.Sequential([
                    base_model,
                    tf.keras.layers.Dropout(0.3),
                    tf.keras.layers.Dense(1, activation='sigmoid')
                ])
                model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
                return model

    def preprocess_image(self, image_path):
        """Read an image and turn it into a single-item model batch.

        The image is read with OpenCV, converted BGR -> RGB, resized to
        ``self.img_size`` and scaled to ``[0, 1]``.

        Args:
            image_path: Path of the image file.

        Returns:
            numpy.ndarray: Batch of shape ``(1, height, width, 3)``. If the
            file cannot be read or processed, the error is printed and an
            all-zero batch of the same shape is returned instead.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                raise ValueError(f"Could not read image: {image_path}")

            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, self.img_size)
            img = img / 255.0
            return np.expand_dims(img, axis=0)
        except Exception as e:
            print(f"Error processing {image_path}: {str(e)}")
            # Return a blank image as fallback, shaped like a real resized
            # image: (height, width) is the reverse of cv2's (width, height).
            width, height = self.img_size
            return np.zeros((1, height, width, 3))

    def predict(self, image_path):
        """Classify an image and return results.

        Args:
            image_path: Path of the image file.

        Returns:
            dict: ``'class'`` is the label name (``'Medical'`` when the model
            output exceeds 0.5), ``'confidence'`` is the probability of the
            chosen label, and ``'medical_probability'`` is the raw model
            output. Both numbers are plain Python floats.
        """
        img = self.preprocess_image(image_path)
        # Convert the NumPy scalar to a built-in float for callers.
        prob = float(self.model.predict(img, verbose=0)[0][0])
        label = 1 if prob > 0.5 else 0

        return {
            'class': self.class_names[label],
            'confidence': prob if label == 1 else 1 - prob,
            'medical_probability': prob
        }


In [5]:
# ======================
# 3. MAIN WORKFLOW
# ======================
def process_url(url, classifier):
    """Extract the images behind ``url`` and classify each one.

    Args:
        url: Source handed to ``extract_images``.
        classifier: Object whose ``predict(path)`` returns a dict with the
            keys ``'class'``, ``'confidence'`` and ``'medical_probability'``.

    Returns:
        list[dict]: One row per extracted image with the keys ``'image'``,
        ``'prediction'``, ``'confidence'`` and ``'medical_probability'``
        (the two scores formatted to four decimals). Images whose
        classification raised get an ``'Error'`` row with zeroed scores.
    """
    print(f"\nExtracting images from URL: {url}")
    extracted_paths = extract_images(url)

    print("\nClassifying images...")
    rows = []
    for path in tqdm(extracted_paths):
        try:
            outcome = classifier.predict(path)
            row = {
                'image': os.path.basename(path),
                'prediction': outcome['class'],
                'confidence': f"{outcome['confidence']:.4f}",
                'medical_probability': f"{outcome['medical_probability']:.4f}",
            }
        except Exception as err:
            print(f"Error processing {path}: {str(err)}")
            row = {
                'image': os.path.basename(path),
                'prediction': 'Error',
                'confidence': '0.0000',
                'medical_probability': '0.0000',
            }
        rows.append(row)

    return rows

def process_pdf(pdf_file, classifier, medical_threshold=0.9, upload_dir='/content'):
    """Process a PDF: extract images and classify them.

    Args:
        pdf_file: Object exposing ``name`` (file name) and ``getbuffer()``
            (the PDF's bytes).
        classifier: Object whose ``predict(path)`` returns a dict with the
            keys ``'class'``, ``'confidence'`` and ``'medical_probability'``.
        medical_threshold: An image is reported as a high-confidence medical
            image when it is classified ``'Medical'`` and its
            ``medical_probability`` is strictly greater than this value.
        upload_dir: Directory in which the uploaded PDF is saved before
            extraction (created if missing).

    Returns:
        tuple[list[dict], list[str]]: The result rows (keys ``'image'``,
        ``'prediction'``, ``'confidence'``, ``'medical_probability'``) and the
        paths of the high-confidence medical images.
    """
    # Save uploaded file. Only the base name is used so a name containing
    # directory components cannot write outside upload_dir.
    os.makedirs(upload_dir, exist_ok=True)
    pdf_path = os.path.join(upload_dir, os.path.basename(pdf_file.name))
    with open(pdf_path, 'wb') as f:
        f.write(pdf_file.getbuffer())

    print(f"\nExtracting images from PDF: {pdf_file.name}")
    images = extract_images(pdf_path)

    print("\nClassifying images...")
    results = []
    medical_images = []

    for img_path in tqdm(images):
        try:
            result = classifier.predict(img_path)
            results.append({
                'image': os.path.basename(img_path),
                'prediction': result['class'],
                'confidence': f"{result['confidence']:.4f}",
                'medical_probability': f"{result['medical_probability']:.4f}"
            })

            # Save medical images with high confidence
            if result['class'] == 'Medical' and result['medical_probability'] > medical_threshold:
                medical_images.append(img_path)
        except Exception as e:
            print(f"Error processing {img_path}: {str(e)}")
            results.append({
                'image': os.path.basename(img_path),
                'prediction': 'Error',
                'confidence': '0.0000',
                'medical_probability': '0.0000'
            })

    print(f"\nFound {len(medical_images)} high-confidence medical images")
    return results, medical_images

In [6]:
# ======================
# 4. USER INTERFACE
# ======================
def main_menu():
    """Show the menu banner and return whatever the user types at the prompt."""
    divider = "=" * 50
    banner_lines = (
        "\n" + divider,
        "MEDICAL IMAGE CLASSIFICATION SYSTEM",
        divider,
        "1. Classify images from a URL",
        "2. Classify images from a PDF file",
        "3. Exit",
    )
    for line in banner_lines:
        print(line)

    # The raw answer is returned unmodified; validation is left to the caller.
    return input("\nEnter your choice (1-3): ")

In [7]:
# ======================
# 5. INITIALIZATION
# ======================
# Build the module-level `classifier` used by the execution loop below.
# MedicalImageClassifier.load_model() already catches model-loading errors and
# falls back to a MobileNetV2-based model, so the "loaded successfully" message
# is also printed when that fallback is in use (see the recorded cell output).
try:
    classifier = MedicalImageClassifier(model_path='best_model.h5')
    print("Classifier loaded successfully")
except Exception as e:
    # Reached only if the constructor itself raises.
    print(f"Error initializing classifier: {str(e)}")
    print("Using fallback classifier")
    # NOTE(review): the default model_path is also 'best_model.h5', so this is
    # the same call as above; it is not guarded and will propagate if it
    # raises again.
    classifier = MedicalImageClassifier()

Error loading model: Only input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: 3.0 (of type <class 'float'>)
Using MobileNetV2 as fallback model
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Classifier loaded successfully


In [8]:
# ======================
# 6. EXECUTION LOOP
# ======================
# Interactive loop: show the menu, run the chosen workflow, and offer the
# results as downloads, until the user picks "3".
while True:
    choice = main_menu().strip()  # tolerate stray whitespace around the digit

    if choice == '1':
        url = input("\nEnter URL to process: ").strip()
        try:
            results = process_url(url, classifier)
        except ValueError as e:
            # extract_images() raises ValueError for input that is neither a
            # URL nor a .pdf path; report it and return to the menu.
            print(f"\n{e}")
            continue

        # Create and display results
        results_df = pd.DataFrame(results)
        print("\nClassification Results:")
        print(results_df)

        # Save results
        csv_path = "url_classification_results.csv"
        results_df.to_csv(csv_path, index=False)
        print(f"\nResults saved to {csv_path}")

        # Download results
        files.download(csv_path)

    elif choice == '2':
        print("\nUpload a PDF file:")
        pdf_files = files.upload()

        if pdf_files:
            # Only the first uploaded file is processed.
            pdf_name = next(iter(pdf_files))
            pdf_bytes = pdf_files[pdf_name]

            # Minimal stand-in exposing the `name` / `getbuffer()` interface
            # that process_pdf() reads.
            uploaded_pdf = type('PDFFile', (object,), {
                'name': pdf_name,
                'getbuffer': staticmethod(lambda: pdf_bytes),
            })

            try:
                results, medical_images = process_pdf(uploaded_pdf, classifier)
            except ValueError as e:
                # Raised by extract_images() when the upload is not a .pdf.
                print(f"\n{e}")
                continue

            # Create and display results
            results_df = pd.DataFrame(results)
            print("\nClassification Results:")
            print(results_df)

            # Save results
            csv_path = "pdf_classification_results.csv"
            results_df.to_csv(csv_path, index=False)
            print(f"\nResults saved to {csv_path}")

            # Download results
            files.download(csv_path)

            # Create zip of medical images
            if medical_images:
                # Start from an empty directory so the archive only contains
                # images from this PDF, not leftovers of earlier runs.
                shutil.rmtree('medical_images', ignore_errors=True)
                os.makedirs('medical_images', exist_ok=True)
                for img_path in medical_images:
                    try:
                        shutil.copy(img_path, 'medical_images')
                    except OSError as e:
                        # Keep going, but say which image is missing from the zip.
                        print(f"Could not copy {img_path}: {e}")

                # Create zip
                shutil.make_archive('medical_images', 'zip', 'medical_images')
                print("\nDownloading medical images...")
                files.download('medical_images.zip')

    elif choice == '3':
        print("\nExiting program...")
        break

    else:
        print("\nInvalid choice. Please try again.")

print("\nProcessing complete!")


MEDICAL IMAGE CLASSIFICATION SYSTEM
1. Classify images from a URL
2. Classify images from a PDF file
3. Exit

Enter your choice (1-3): 1

Enter URL to process: https://ss-2005.github.io/Sahil-Shaikh-Portfolio/

Extracting images from URL: https://ss-2005.github.io/Sahil-Shaikh-Portfolio/
Processing URL: https://ss-2005.github.io/Sahil-Shaikh-Portfolio/
Saved: extracted_images/url_image_1.jpeg
Saved: extracted_images/url_image_2.jpeg
Saved: extracted_images/url_image_3.jpeg
Saved: extracted_images/url_image_4.jpeg
Saved: extracted_images/url_image_5.jpeg
Saved: extracted_images/url_image_6.jpeg
Saved: extracted_images/url_image_7.jpeg
Saved: extracted_images/url_image_8.jpeg
Saved: extracted_images/url_image_9.jpeg
Saved: extracted_images/url_image_10.jpeg
Saved: extracted_images/url_image_11.jpeg
Saved: extracted_images/url_image_12.jpeg
Saved: extracted_images/url_image_13.jpeg
Saved: extracted_images/url_image_14.jpeg
Saved: extracted_images/url_image_15.jpeg
Saved: extracted_images

100%|██████████| 31/31 [00:06<00:00,  4.90it/s]


Classification Results:
                image   prediction confidence medical_probability
0    url_image_1.jpeg      Medical     0.5072              0.5072
1    url_image_2.jpeg  Non-Medical     0.8310              0.1690
2    url_image_3.jpeg  Non-Medical     0.5707              0.4293
3    url_image_4.jpeg      Medical     0.5978              0.5978
4    url_image_5.jpeg  Non-Medical     0.6125              0.3875
5    url_image_6.jpeg  Non-Medical     0.5636              0.4364
6    url_image_7.jpeg  Non-Medical     0.7520              0.2480
7    url_image_8.jpeg  Non-Medical     0.7423              0.2577
8    url_image_9.jpeg  Non-Medical     0.7026              0.2974
9   url_image_10.jpeg  Non-Medical     0.6704              0.3296
10  url_image_11.jpeg      Medical     0.6240              0.6240
11  url_image_12.jpeg  Non-Medical     0.6317              0.3683
12  url_image_13.jpeg  Non-Medical     0.7018              0.2982
13  url_image_14.jpeg  Non-Medical     0.8878      




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


MEDICAL IMAGE CLASSIFICATION SYSTEM
1. Classify images from a URL
2. Classify images from a PDF file
3. Exit

Enter your choice (1-3): 2

Upload a PDF file:


Saving image-based-pdf-sample.pdf to image-based-pdf-sample.pdf

Extracting images from PDF: image-based-pdf-sample.pdf
Processing PDF: /content/image-based-pdf-sample.pdf
Saved: extracted_images/pdf_page1_img1.jpeg

Classifying images...


100%|██████████| 1/1 [00:00<00:00,  3.67it/s]


Found 0 high-confidence medical images

Classification Results:
                 image   prediction confidence medical_probability
0  pdf_page1_img1.jpeg  Non-Medical     0.6511              0.3489

Results saved to pdf_classification_results.csv





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


MEDICAL IMAGE CLASSIFICATION SYSTEM
1. Classify images from a URL
2. Classify images from a PDF file
3. Exit

Enter your choice (1-3): 3

Exiting program...

Processing complete!
