In [None]:
# class ImageClassifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim=64, num_classes=5):
#         super().__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(hidden_dim, num_classes)
#         
#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.relu(x)
#         x = self.fc2(x)
#         return x
# torch.save(image_classifier.state_dict(), "image_classifier.pt")


In [None]:
# class VisionModule:
#     def __init__(self, resnet_model, image_classifier):
#         self.resnet = resnet_model
#         self.image_classifier = image_classifier
# 
#     def load_image(self, path): ...
#     def extract_deep_features(self, q): ...
#     def extract_handcrafted(self, q): ...
#     def build_output_dict(self, ...): ...
# 
#     def run(self, image_path):
#         img = self.load_image(image_path)
#         Q1, Q2, Q3, Q4, OCR_crop = self.crop_quadrants(img)
# 
#         # run all quadrant processing
#         # build Image_Output_Dict
#         return Image_Output_Dict, OCR_crop


In [3]:
configuration = {

    "resize": (224, 224),

    "crop_positions": {
        
        "Q1": {"x1": 772, "y1": 91, "x2": (772+360), "y2": (91+360), "center": (967,275), "radius":143, "apply_circular_mask":True},
        
        "Q2": {"x1": 410, "y1": 90, "x2": (410+360), "y2": (90+360), "center": (592, 275), "radius":143, "apply_circular_mask":True},
        
        "Q3": {"x1": 410, "y1": 467, "x2": (410+360), "y2": (467+360), "center": (592,652), "radius":143, "apply_circular_mask":True},
        
        "Q4": {"x1": 771, "y1": 467, "x2": (771+360), "y2": (467+360), "center": (967,652), "radius":143, "apply_circular_mask":True},
        
        "text_panel": {"x1": 10, "y1": 200, "x2": (10+315), "y2": (200+625)}
    }
}

In [13]:
import colorsys
import cv2
import os
import numpy as np
import torch
from colorthief import ColorThief

class VisionModule:
    
    def __init__(self, feature_extraction_model, classifier, configuration):
        self.feature_extraction_model = feature_extraction_model
        self.classifier = classifier
        self.configuration = configuration
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def crop(self, img, save_dir=None):

        crops = {}
        crop_cfg = self.configuration["crop_positions"]

        for key, cfg in crop_cfg.items():
            
            # --- 1. Crop the region ---
            x1, y1, x2, y2 = cfg["x1"], cfg["y1"], cfg["x2"], cfg["y2"]
            crop_img = img[y1:y2, x1:x2].copy()
            
            # --- 2. Apply circular mask ONLY if mask_radius_factor exists ---
            if cfg.get("apply_circular_mask", False) is True:
                radius = cfg.get("radius", None)
                center = cfg.get("center", None)
                center = (center[0] - x1, center[1] - y1)
                crop_img = self._apply_circular_mask(
                    crop_img,
                    radius,
                    center
                )

            # --- 3. Optionally save cropped image ---
            if save_dir is not None:
                os.makedirs(save_dir, exist_ok=True)
                save_path = os.path.join(save_dir, f"{key}.png")
                cv2.imwrite(save_path, cv2.cvtColor(crop_img, cv2.COLOR_RGB2BGR))

            # --- 4. Add to output dictionary ---
            crops[key] = crop_img

        return crops


    def _apply_circular_mask(self, img, radius, center):
        h, w = img.shape[:2]
        
        if center is not None:
            center = (int(center[0]), int(center[1]))
        else:
            center = (int(w // 2), int(h // 2))
            
        if radius is not None:
            radius = int(radius)
        else:
            radius = int(min(h, w) * 0.9 / 2)
    
        mask = np.zeros((h, w), dtype=np.uint8)
    
        # make white circle on a center
        cv2.circle(mask, center, radius, (255, 255, 255), -1)
    
        mask = mask.astype(float) / 255.0
        
        fill_color = [0.485*255, 0.456*255, 0.406*255]
        fill_color = np.array(fill_color, dtype=np.float32)
        
        fill_img = np.ones_like(img, dtype=np.float32) * fill_color
    
        result = img.astype(float) * mask[..., None] + fill_img * (1 - mask[..., None])
    
        return result.astype(np.uint8)
    
    
    def process_crops(self, crops):
        processed = {}
        
        for key, img in crops.items():
            # Skip text panel for CNN
            if key == "text_panel":
                processed[key] = img
                continue
            
            processed[key] = self._preprocess_for_cnn(img)
        
        return processed
    
    
    def _preprocess_for_cnn(self, cropped_img):
        # resize
        size = self.configuration["resize"]
        img = cv2.resize(cropped_img, size)
    
        # convert to float32 0–1
        img = img.astype(np.float32) / 255.0
    
        # ImageNet norm (change if you use custom model)
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std  = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    
        img = (img - mean) / std
    
        # HWC → CHW
        img = np.transpose(img, (2, 0, 1))
    
        return img
    
    
    def extract_deep_features(self, tensor):

        t = torch.from_numpy(tensor).unsqueeze(0).float()  
        # shape (1,3,224,224) for resnet 50
    
        with torch.no_grad():
            deep_features = self.feature_extraction_model(t).cpu().numpy().flatten()
        torch.save()
        return deep_features
    

    def handcrafted_features(self, cropped_img_pth):
            
        colors_platte = ColorThief(cropped_img_pth)
        dominant_colors = colors_platte.get_palette(color_count=2)
        print(dominant_colors)
        
        ignored_colors = [
            (0, 0, 0),
            (255, 255, 255),
            (int(0.485*255), int(0.456*255), int(0.406*255))  # fill mask base color
        ]
        tol = 15
        cleaned_colors = []
        
        for R, G, B in dominant_colors:
            bad = False
            for ir, ig, ib in ignored_colors:
                if (abs(R - ir) <= tol and
                    abs(G - ig) <= tol and
                    abs(B - ib) <= tol):
                    bad = True
                    break
        
            if not bad:
                cleaned_colors.append((R, G, B))
        
        if len(cleaned_colors) == 0:
            cleaned_colors = [(0,0,0)]
        
        R, G, B = cleaned_colors[0]
        r, g, b = R/255.0, G/255.0, B/255.0
        
        h, s, v = colorsys.rgb_to_hsv(r, g, b)
        
        dom_h = h
        dom_s = s
        dom_v = v
        
        # ---- FIXED BGR → RGB for cv2 ----
        cropped_img = cv2.imread(cropped_img_pth)
        cropped_img = cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB)
    
        img = cropped_img.astype(np.float32) / 255.0
        hsv = cv2.cvtColor(cropped_img, cv2.COLOR_RGB2HSV).astype(np.float32)
    
        # Create mask of "valid" pixels matrix/grid
        valid_mask = np.ones((img.shape[0], img.shape[1]), dtype=bool)
    
        for ir, ig, ib in ignored_colors:
            if (ir, ig, ib) == ignored_colors[2]:
                # This is the FILL MASK → use tolerance
                invalid = (
                    (np.abs(cropped_img[:,:,0] - ir) < tol) &
                    (np.abs(cropped_img[:,:,1] - ig) < tol) &
                    (np.abs(cropped_img[:,:,2] - ib) < tol)
                )
            else:
                # Black & white → exact match
                invalid = (
                    (cropped_img[:,:,0] == ir) &
                    (cropped_img[:,:,1] == ig) &
                    (cropped_img[:,:,2] == ib)
                )
        
            valid_mask[invalid] = False
    
        # Valid pixels only
        valid_hsv = hsv[valid_mask]
    
        # Intensity Features (based on V channel)
        V = valid_hsv[:,2] / 255.0
    
        avg_intensity = float(V.mean())
        std_intensity = float(V.std())
    
        # Center vs Periphery intensities, the center sharpness
        h_im, w_im = cropped_img.shape[:2]
        cy, cx = h_im//2, w_im//2
        R = min(cx, cy)
    
        r_center = int(R * 0.3)
        r_mid = int(R * 0.60)
    
        Y, X = np.ogrid[:h_im, :w_im]
        dist = np.sqrt((X - cx)**2 + (Y - cy)**2)
    
        center_mask = dist < r_center
        periphery_mask = (dist > r_mid) & (dist < R)
    
        center_vals = img[center_mask][:, :].mean() if center_mask.any() else 0.0
        periphery_vals = img[periphery_mask][:, :].mean() if periphery_mask.any() else 1e-6
        
        center_intensity = float(center_vals)
        periphery_intensity = float(periphery_vals)
        center_periphery_ratio = float(center_intensity / periphery_intensity)
    
        # Quadrant splits and asymmetry ratios
        top = img[:h_im//2,:,:].mean()
        bottom = img[h_im//2:,:,:].mean()
        left = img[:, :w_im//2, :].mean()
        right = img[:, w_im//2:, :].mean()
    
        inferior_superior_ratio = float(bottom / (top + 1e-6))
        left_right_ratio = float(left / (right + 1e-6))
    
        diag1 = img[:h_im//2, :w_im//2, :].mean() - img[h_im//2:, w_im//2:, :].mean()
        diag2 = img[:h_im//2, w_im//2:, :].mean() - img[h_im//2:, :w_im//2, :].mean()
    
        diag1_difference = float(diag1)
        diag2_difference = float(diag2)
    
        radial_symmetry = float(
            abs(top - bottom) +
            abs(left - right) +
            abs(diag1) +
            abs(diag2)
        )
    
        return {
            "dom_h": dom_h,
            "dom_s": dom_s,
            "dom_v": dom_v,
    
            "avg_intensity": avg_intensity,
            "std_intensity": std_intensity,
    
            "center_intensity": center_intensity,
            "periphery_intensity": periphery_intensity,
            "center_periphery_ratio": center_periphery_ratio,
    
            "inferior_superior_ratio": inferior_superior_ratio,
            "left_right_ratio": left_right_ratio,
    
            "diag1_difference": diag1_difference,
            "diag2_difference": diag2_difference,
    
            "radial_symmetry": radial_symmetry
        }
    
        
    
    def run_vision_preprocessing(self, img_pth):
       
        img = cv2.imread(img_pth)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        # STEP 1 — Crop
        crops = self.crop(img)
    
        # STEP 2 — Preprocess for CNN
        processed = self.process_crops(crops)
        
        # start thread here for extract_deep_features and handcrafted_features
        
        vision_output = {}
    
        # STEP 3 — Quadrants
        for key in ["Q1", "Q2", "Q3", "Q4"]:
            pass
         
        vision_output["text_panel"] = crops["text_panel"]
    
        return vision_output
    
    def run_vision_training(self):
        pass




In [14]:
obj = VisionModule(feature_extraction_model="p",classifier="g", configuration=configuration)

img = cv2.imread(r"dataset/cropping_sample/Q2.png")
# img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

h = obj.handcrafted_features(cropped_img_pth=r"dataset/cropping_sample/Q2.png")

print(h)


[(119, 112, 95), (118, 239, 16), (233, 246, 51)]
{'dom_h': 0.2571001494768311, 'dom_s': 0.9330543933054394, 'dom_v': 0.9372549019607843, 'avg_intensity': 0.9078391790390015, 'std_intensity': 0.17134372889995575, 'center_intensity': 0.569532573223114, 'periphery_intensity': 0.467157781124115, 'center_periphery_ratio': 1.2191439300286426, 'inferior_superior_ratio': 0.9308066368103027, 'left_right_ratio': 1.0133957862854004, 'diag1_difference': 0.04128372669219971, 'diag2_difference': 0.02835288643836975, 'radial_symmetry': 0.11092042922973633}


In [34]:
# class TextModule:
#     def __init__(self, ocr_engine, text_classifier, embed_compressor):
#         self.ocr = ocr_engine
#         self.text_classifier = text_classifier
#         self.embed_compressor = embed_compressor
# 
#     def ocr_extract(self, crop): ...
#     def clean_text(self, raw): ...
#     def extract_numeric(self, text): ...
#     def classify_text(self, text): ...
#     def build_output_dict(self, ...): ...
# 
#     def run(self, OCR_crop):
#         raw = self.ocr_extract(OCR_crop)
#         cleaned = self.clean_text(raw)
#         numeric = self.extract_numeric(cleaned)
#         text_pred, text_probs, embed = self.classify_text(cleaned)
# 
#         # build Text_Output_Dict
#         return Text_Output_Dict


In [None]:
# class FusionModule:
#     def __init__(self, xgb_model):
#         self.model = xgb_model
# 
#     def build_fusion_vector(self, image_dict, text_dict): ...
#     def run(self, Image_Output_Dict, Text_Output_Dict):
#         x = self.build_fusion_vector(Image_Output_Dict, Text_Output_Dict)
#         pred = self.model.predict(x)
#         conf = self.model.predict_proba(x)
#         return Fusion_Output_Dict


In [None]:
# class ReportModule:
#     def __init__(self, t5_model, tokenizer):
#         self.model = t5_model
#         self.tokenizer = tokenizer
# 
#     def build_long_text(self, image_dict, text_dict, fusion_dict): ...
#     def summarize(self, text): ...
# 
#     def run(self, Image_Output_Dict, Text_Output_Dict, Fusion_Output_Dict):
#         full = self.build_long_text(...)
#         summary = self.summarize(full)
#         return summary


In [None]:
# def run_pipeline(image_path):
#     # 1 Vision
#     Image_Output_Dict, OCR_crop = VisionModule.run(image_path)
# 
#     # 2 Text
#     Text_Output_Dict = TextModule.run(OCR_crop)
# 
#     # 3 Fusion
#     Fusion_Output_Dict = FusionModule.run(Image_Output_Dict, Text_Output_Dict)
# 
#     # 4 Report
#     final_report = ReportModule.run(Image_Output_Dict, Text_Output_Dict, Fusion_Output_Dict)
# 
#     return {
#         "vision": Image_Output_Dict,
#         "text": Text_Output_Dict,
#         "fusion": Fusion_Output_Dict,
#         "report": final_report
#     }
