In [None]:
# class ImageClassifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim=64, num_classes=5):
#         super().__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(hidden_dim, num_classes)
#         
#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.relu(x)
#         x = self.fc2(x)
#         return x
# torch.save(image_classifier.state_dict(), "image_classifier.pt")


In [None]:
# class VisionModule:
#     def __init__(self, resnet_model, image_classifier):
#         self.resnet = resnet_model
#         self.image_classifier = image_classifier
# 
#     def load_image(self, path): ...
#     def extract_deep_features(self, q): ...
#     def extract_handcrafted(self, q): ...
#     def build_output_dict(self, ...): ...
# 
#     def run(self, image_path):
#         img = self.load_image(image_path)
#         Q1, Q2, Q3, Q4, OCR_crop = self.crop_quadrants(img)
# 
#         # run all quadrant processing
#         # build Image_Output_Dict
#         return Image_Output_Dict, OCR_crop


In [1]:
# Settings consumed by VisionModule.
configuration = {
    # Size handed to cv2.resize before a crop is fed to the CNN.
    "resize": (224, 224),

    # Region name -> bounding box (x1, y1, x2, y2) in full-image pixels.
    # Regions with apply_circular_mask=True also carry a circle given by
    # `center` (full-image pixel coordinates; VisionModule.crop shifts it
    # into crop-local coordinates) and `radius`.
    "crop_positions": {
        # right column, upper row
        "Q1": dict(
            x1=772, y1=91, x2=1194, y2=451,
            center=(967, 275), radius=143, apply_circular_mask=True,
        ),
        # left column, upper row
        "Q2": dict(
            x1=336, y1=90, x2=759, y2=453,
            center=(592, 275), radius=143, apply_circular_mask=True,
        ),
        # left column, lower row
        "Q3": dict(
            x1=336, y1=467, x2=758, y2=830,
            center=(592, 652), radius=143, apply_circular_mask=True,
        ),
        # right column, lower row
        "Q4": dict(
            x1=771, y1=467, x2=1194, y2=830,
            center=(967, 652), radius=143, apply_circular_mask=True,
        ),
        # Rectangular strip on the left edge; never masked.
        "text_panel": dict(x1=11, y1=203, x2=324, y2=828),
    },
}


In [101]:
import cv2
import os
import numpy as np
import torch

class VisionModule:
    """Crop configured regions from an image, extract features and classify quadrants.

    Parameters
    ----------
    feature_extraction_model : callable
        Called with a float tensor built from one preprocessed crop (with a
        leading batch dimension); must return a torch tensor. Presumably a
        CNN backbone such as ResNet-50 -- confirm against the caller.
    classifier : object
        Must expose ``predict(X)`` accepting a 2-D array holding one row.
    configuration : dict
        Must contain ``"resize"`` (size passed to ``cv2.resize``) and
        ``"crop_positions"`` (region name -> dict with ``x1, y1, x2, y2`` and,
        optionally, ``apply_circular_mask``, ``center`` and ``radius``).
    """

    # ImageNet channel statistics. The mean doubles as the fill colour for
    # masked-out pixels so that they end up close to zero after normalisation
    # in `_preprocess_for_cnn`.
    _IMAGENET_MEAN = (0.485, 0.456, 0.406)
    _IMAGENET_STD = (0.229, 0.224, 0.225)

    def __init__(self, feature_extraction_model, classifier, configuration):
        self.feature_extraction_model = feature_extraction_model
        self.classifier = classifier
        self.configuration = configuration

    def crop(self, img, save_dir=None):
        """Cut every region listed in ``configuration["crop_positions"]`` out of ``img``.

        Parameters
        ----------
        img : np.ndarray
            Source image indexed as ``img[y, x]``. Treated as RGB when saving
            (it is converted RGB -> BGR before ``cv2.imwrite``).
        save_dir : str or None
            When given, each crop is also written to ``<save_dir>/<key>.png``;
            the directory is created if needed.

        Returns
        -------
        dict
            Region name -> cropped (and, where requested, circularly masked)
            image, in configuration order.
        """
        crops = {}
        crop_cfg = self.configuration["crop_positions"]

        # The output directory does not depend on the region: create it once.
        if save_dir is not None:
            os.makedirs(save_dir, exist_ok=True)

        for key, cfg in crop_cfg.items():

            # --- 1. Crop the region (copy so the source image is never mutated) ---
            x1, y1, x2, y2 = cfg["x1"], cfg["y1"], cfg["x2"], cfg["y2"]
            crop_img = img[y1:y2, x1:x2].copy()

            # --- 2. Apply circular mask ONLY if apply_circular_mask is True ---
            if cfg.get("apply_circular_mask", False) is True:
                center = cfg.get("center")
                # The configured centre is in full-image coordinates; shift it
                # into the crop's frame. A missing centre stays None so that
                # `_apply_circular_mask` falls back to the middle of the crop
                # (previously this raised TypeError on `center[0]`).
                if center is not None:
                    center = (center[0] - x1, center[1] - y1)
                crop_img = self._apply_circular_mask(
                    crop_img,
                    cfg.get("radius"),
                    center,
                )

            # --- 3. Optionally save cropped image ---
            if save_dir is not None:
                save_path = os.path.join(save_dir, f"{key}.png")
                cv2.imwrite(save_path, cv2.cvtColor(crop_img, cv2.COLOR_RGB2BGR))

            # --- 4. Add to output dictionary ---
            crops[key] = crop_img

        return crops

    def _apply_circular_mask(self, img, radius, center):
        """Keep the pixels inside a filled circle and paint the rest with the fill colour.

        Parameters
        ----------
        img : np.ndarray
            Image to mask; the fill colour has three components, so a
            3-channel image is expected.
        radius : number or None
            Circle radius in pixels; ``None`` selects 90% of half the shorter side.
        center : (x, y) or None
            Circle centre in ``img`` coordinates; ``None`` selects the image centre.

        Returns
        -------
        np.ndarray
            ``uint8`` image with the same shape as ``img``.
        """
        h, w = img.shape[:2]

        if center is not None:
            center = (int(center[0]), int(center[1]))
        else:
            center = (w // 2, h // 2)

        if radius is not None:
            radius = int(radius)
        else:
            radius = int(min(h, w) * 0.9 / 2)

        # Binary mask: 255 inside the filled circle, 0 elsewhere.
        mask = np.zeros((h, w), dtype=np.uint8)
        cv2.circle(mask, center, radius, 255, -1)

        # Truncating cast (not rounding) -> [123, 116, 103].
        fill_color = (np.array(self._IMAGENET_MEAN) * 255).astype(np.uint8)

        # The mask is strictly 0/255, so a per-pixel selection is sufficient.
        return np.where(mask[..., None] > 0, img, fill_color).astype(np.uint8)

    def process_crops(self, crops):
        """Preprocess every crop for the CNN, passing ``"text_panel"`` through untouched.

        Returns a dict with the same keys as ``crops``.
        """
        processed = {}

        for key, img in crops.items():
            # The text panel is not fed to the CNN, so it is kept as-is.
            if key == "text_panel":
                processed[key] = img
                continue

            processed[key] = self._preprocess_for_cnn(img)

        return processed

    def _preprocess_for_cnn(self, cropped_img):
        """Resize, scale to 0-1, normalise per channel and reorder HWC -> CHW.

        Returns a ``float32`` array with the channel axis first.
        """
        size = self.configuration["resize"]
        img = cv2.resize(cropped_img, size)

        # uint8 0-255 -> float32 0-1
        img = img.astype(np.float32) / 255.0

        # ImageNet norm (change if you use a custom model)
        mean = np.array(self._IMAGENET_MEAN, dtype=np.float32)
        std = np.array(self._IMAGENET_STD, dtype=np.float32)
        img = (img - mean) / std

        # HWC -> CHW
        return np.transpose(img, (2, 0, 1))

    def extract_deep_features(self, tensor):
        """Run the feature extraction model on one preprocessed crop.

        ``tensor`` is a NumPy array as produced by ``_preprocess_for_cnn``; a
        batch dimension is added before the call. The model output is moved to
        the CPU and returned as a flat 1-D NumPy array.
        """
        # Batch of one, e.g. (1, 3, 224, 224) with the default "resize" setting.
        batch = torch.from_numpy(tensor).unsqueeze(0).float()

        # Inference only: no gradients are tracked.
        with torch.no_grad():
            deep_features = self.feature_extraction_model(batch).cpu().numpy().flatten()

        return deep_features

    def handcrafted_features(self, crop_img):
        """Compute per-channel RGB and HSV means of a crop.

        The means run over every pixel of ``crop_img``; for a masked crop this
        includes the filled area outside the circle. Despite their names, the
        ``dom_*`` entries hold channel means.

        Returns a dict with keys ``dom_r, dom_g, dom_b, mean_h, mean_s, mean_v``
        (in that order), all Python floats.
        """
        hsv = cv2.cvtColor(crop_img, cv2.COLOR_RGB2HSV)

        mean_rgb = crop_img.mean(axis=(0, 1))
        mean_hsv = hsv.mean(axis=(0, 1))

        return {
            "dom_r": float(mean_rgb[0]),
            "dom_g": float(mean_rgb[1]),
            "dom_b": float(mean_rgb[2]),
            "mean_h": float(mean_hsv[0]),
            "mean_s": float(mean_hsv[1]),
            "mean_v": float(mean_hsv[2]),
        }

    def run_vision(self, img):
        """Run the full vision pipeline on one image.

        Crops the configured regions, then for each of ``Q1``-``Q4`` extracts
        deep and handcrafted features, concatenates them and asks the
        classifier for a prediction.

        Returns
        -------
        dict
            ``Q1``-``Q4`` map to ``{"deep_features", "handcrafted", "prediction"}``;
            ``"text_panel"`` maps to the raw text-panel crop.
        """
        # Step 1 - Crop
        crops = self.crop(img)

        # Step 2 - Normalize crops for CNN
        processed = self.process_crops(crops)

        vision_output = {}

        # Step 3 - For each quadrant
        for key in ["Q1", "Q2", "Q3", "Q4"]:

            # deep features
            deep_feat = self.extract_deep_features(processed[key])

            # handcrafted features (computed on the un-normalised crop)
            hand_feat = self.handcrafted_features(crops[key])

            # combine features; the handcrafted part follows the dict's insertion order
            full_feat = np.concatenate(
                [deep_feat, np.array(list(hand_feat.values()))]
            )

            # Step 4 - Prediction (single-row batch)
            pred = self.classifier.predict(full_feat.reshape(1, -1))[0]

            # Store results
            vision_output[key] = {
                "deep_features": deep_feat,
                "handcrafted": hand_feat,
                "prediction": pred
            }

        # Step 5 - Add text panel image
        vision_output["text_panel"] = crops["text_panel"]

        return vision_output




In [102]:
# Placeholder strings stand in for the two models: only `crop` is exercised
# here, and it reads nothing but the configuration.
obj = VisionModule(feature_extraction_model="p",classifier="g", configuration=configuration)

image_path = r"dataset/normal/207.jpg"
img = cv2.imread(image_path)
# cv2.imread reports an unreadable/missing file by returning None rather than
# raising, which would otherwise surface as an opaque error inside cvtColor.
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# OpenCV loads images as BGR; the rest of the pipeline works on RGB.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

obj.crop(img=img, save_dir=r"dataset/cropping_sample")

{'Q1': array([[[123, 116, 103],
         [123, 116, 103],
         [123, 116, 103],
         ...,
         [123, 116, 103],
         [123, 116, 103],
         [123, 116, 103]],
 
        [[123, 116, 103],
         [123, 116, 103],
         [123, 116, 103],
         ...,
         [123, 116, 103],
         [123, 116, 103],
         [123, 116, 103]],
 
        [[123, 116, 103],
         [123, 116, 103],
         [123, 116, 103],
         ...,
         [123, 116, 103],
         [123, 116, 103],
         [123, 116, 103]],
 
        ...,
 
        [[123, 116, 103],
         [123, 116, 103],
         [123, 116, 103],
         ...,
         [123, 116, 103],
         [123, 116, 103],
         [123, 116, 103]],
 
        [[123, 116, 103],
         [123, 116, 103],
         [123, 116, 103],
         ...,
         [123, 116, 103],
         [123, 116, 103],
         [123, 116, 103]],
 
        [[123, 116, 103],
         [123, 116, 103],
         [123, 116, 103],
         ...,
         [123, 116, 10

In [None]:
# class TextModule:
#     def __init__(self, ocr_engine, text_classifier, embed_compressor):
#         self.ocr = ocr_engine
#         self.text_classifier = text_classifier
#         self.embed_compressor = embed_compressor
# 
#     def ocr_extract(self, crop): ...
#     def clean_text(self, raw): ...
#     def extract_numeric(self, text): ...
#     def classify_text(self, text): ...
#     def build_output_dict(self, ...): ...
# 
#     def run(self, OCR_crop):
#         raw = self.ocr_extract(OCR_crop)
#         cleaned = self.clean_text(raw)
#         numeric = self.extract_numeric(cleaned)
#         text_pred, text_probs, embed = self.classify_text(cleaned)
# 
#         # build Text_Output_Dict
#         return Text_Output_Dict


In [None]:
# class FusionModule:
#     def __init__(self, xgb_model):
#         self.model = xgb_model
# 
#     def build_fusion_vector(self, image_dict, text_dict): ...
#     def run(self, Image_Output_Dict, Text_Output_Dict):
#         x = self.build_fusion_vector(Image_Output_Dict, Text_Output_Dict)
#         pred = self.model.predict(x)
#         conf = self.model.predict_proba(x)
#         return Fusion_Output_Dict


In [None]:
# class ReportModule:
#     def __init__(self, t5_model, tokenizer):
#         self.model = t5_model
#         self.tokenizer = tokenizer
# 
#     def build_long_text(self, image_dict, text_dict, fusion_dict): ...
#     def summarize(self, text): ...
# 
#     def run(self, Image_Output_Dict, Text_Output_Dict, Fusion_Output_Dict):
#         full = self.build_long_text(...)
#         summary = self.summarize(full)
#         return summary


In [None]:
# def run_pipeline(image_path):
#     # 1 Vision
#     Image_Output_Dict, OCR_crop = VisionModule.run(image_path)
# 
#     # 2 Text
#     Text_Output_Dict = TextModule.run(OCR_crop)
# 
#     # 3 Fusion
#     Fusion_Output_Dict = FusionModule.run(Image_Output_Dict, Text_Output_Dict)
# 
#     # 4 Report
#     final_report = ReportModule.run(Image_Output_Dict, Text_Output_Dict, Fusion_Output_Dict)
# 
#     return {
#         "vision": Image_Output_Dict,
#         "text": Text_Output_Dict,
#         "fusion": Fusion_Output_Dict,
#         "report": final_report
#     }
