# YOLOv8 Image Detection Pipeline

This notebook runs object detection on Telegram images using YOLOv8 and saves results to PostgreSQL.

## Setup
First, let's import the necessary modules and set up the environment.

In [1]:
import os
import sys
import logging
from datetime import datetime
from typing import List, Dict, Any
from glob import glob

# Add src to path for imports
sys.path.append(os.path.abspath('..'))

# Import our custom modules
from src.enrich.yolo_enricher import YOLOEnricher
from src.loader.postgres_loader import PostgresLoader

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Suppress YOLO verbose output
import warnings
warnings.filterwarnings('ignore')

# Disable ultralytics verbose output
import os
os.environ['ULTRALYTICS_VERBOSE'] = 'False'

print("✅ Imports and setup complete")

✅ Imports and setup complete


## Create Silent YOLO Enricher
Define a YOLO enricher class that suppresses all verbose output.

In [2]:
class SilentYOLOEnricher:
    def __init__(self):
        self.model_path = os.getenv('YOLO_MODEL_PATH', 'yolov8n.pt')
        self.confidence_threshold = float(os.getenv('CONFIDENCE_THRESHOLD', 0.5))
        self.model = None
        
    def load_model(self):
        """Load YOLO model silently"""
        try:
            from ultralytics import YOLO
            self.model = YOLO(self.model_path, verbose=False)
            print(f"✅ YOLO model loaded successfully")
        except Exception as e:
            print(f"❌ Error loading YOLO model: {e}")
            raise
            
    def detect_objects(self, image_path: str) -> Dict[str, Any]:
        """Detect objects in an image using YOLO silently"""
        if not self.model:
            self.load_model()
            
        try:
            # Run detection with verbose=False
            results = self.model(image_path, conf=self.confidence_threshold, verbose=False)
            
            detections = []
            confidence_scores = {}
            
            for result in results:
                boxes = result.boxes
                if boxes is not None:
                    for box in boxes:
                        # Get box coordinates
                        x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                        
                        # Get confidence and class
                        confidence = float(box.conf[0].cpu().numpy())
                        class_id = int(box.cls[0].cpu().numpy())
                        class_name = self.model.names[class_id]
                        
                        detection = {
                            'bbox': [float(x1), float(y1), float(x2), float(y2)],
                            'confidence': confidence,
                            'class_id': class_id,
                            'class_name': class_name
                        }
                        detections.append(detection)
                        
                        # Track confidence scores by class
                        if class_name not in confidence_scores:
                            confidence_scores[class_name] = []
                        confidence_scores[class_name].append(confidence)
            
            # Calculate average confidence for each class
            avg_confidence = {}
            for class_name, scores in confidence_scores.items():
                avg_confidence[class_name] = sum(scores) / len(scores)
            
            return {
                'detections': detections,
                'confidence_scores': avg_confidence,
                'total_detections': len(detections),
                'image_path': image_path
            }
            
        except Exception as e:
            print(f"❌ Error detecting objects in {image_path}: {e}")
            return {
                'detections': [],
                'confidence_scores': {},
                'total_detections': 0,
                'image_path': image_path,
                'error': str(e)
            }

print("✅ Silent YOLO Enricher class defined")

✅ Silent YOLO Enricher class defined


## Configuration
Set the date and optionally a specific channel to process.

In [3]:
# Configuration
DATE = "2025-07-14"  # Change this to your target date
CHANNEL = None  # Set to specific channel name (e.g., "tikvahpharma") or None for all channels

# Base directory for images
BASE_DIR = os.path.join('data', 'raw', 'telegram_images', DATE)

print(f"�� Processing date: {DATE}")
print(f"📁 Base directory: {BASE_DIR}")
if CHANNEL:
    print(f"📺 Channel: {CHANNEL}")
else:
    print(f"📺 Channels: All channels in {DATE}")

�� Processing date: 2025-07-14
📁 Base directory: data\raw\telegram_images\2025-07-14
📺 Channels: All channels in 2025-07-14


## Helper Functions
Define functions to scan images and extract message IDs.

In [4]:
def get_image_files(base_dir: str, channel: str = None) -> List[str]:
    """Get all jpg image files in the directory."""
    if channel:
        # Specific channel
        channel_dir = os.path.join(base_dir, channel)
        if not os.path.exists(channel_dir):
            logger.error(f"Channel directory not found: {channel_dir}")
            return []
        return glob(os.path.join(channel_dir, '*.jpg'))
    else:
        # All channels
        image_files = []
        if os.path.exists(base_dir):
            for channel_dir in os.listdir(base_dir):
                channel_path = os.path.join(base_dir, channel_dir)
                if os.path.isdir(channel_path):
                    image_files.extend(glob(os.path.join(channel_path, '*.jpg')))
        return image_files

def extract_message_id(image_path: str) -> int:
    """Extract message_id from image filename (e.g., 123456.jpg)."""
    filename = os.path.basename(image_path)
    try:
        return int(os.path.splitext(filename)[0])
    except Exception:
        logger.error(f"Could not extract message_id from {filename}")
        return None

print("✅ Helper functions defined")

✅ Helper functions defined


## Create Database Tables
Create the necessary tables if they don't exist.

In [5]:
# Create database tables
loader = PostgresLoader()
loader.connect()

try:
    cursor = loader.conn.cursor()
    
    # Drop existing table if it exists (to fix constraints)
    cursor.execute("DROP TABLE IF EXISTS processed_images CASCADE;")
    loader.conn.commit()
    print("🗑️ Dropped existing table")
    
    # Create processed_images table with proper constraints
    create_table_sql = """
    CREATE TABLE processed_images (
        id SERIAL PRIMARY KEY,
        message_id BIGINT UNIQUE NOT NULL,
        image_path TEXT,
        detection_results JSONB,
        confidence_scores JSONB,
        processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """
    
    cursor.execute(create_table_sql)
    loader.conn.commit()
    print("✅ processed_images table created successfully with UNIQUE constraint")
    
    # Create index for better performance
    cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_processed_images_message_id 
        ON processed_images(message_id);
    """)
    loader.conn.commit()
    print("✅ Index created successfully")
    
except Exception as e:
    print(f"❌ Error creating tables: {e}")
    loader.conn.rollback()
finally:
    cursor.close()
    loader.disconnect()
    print("🔌 Database connection closed")

Connected to PostgreSQL successfully
🗑️ Dropped existing table
✅ processed_images table created successfully with UNIQUE constraint
✅ Index created successfully
🔌 Database connection closed


## Check Available Images
Let's see what images are available for processing.

In [6]:
# Get image files
image_files = get_image_files(BASE_DIR, CHANNEL)

print(f"📊 Found {len(image_files)} images to process")

if len(image_files) > 0:
    print("\n📋 Sample images:")
    for i, img_path in enumerate(image_files[:5]):
        message_id = extract_message_id(img_path)
        print(f"  {i+1}. {os.path.basename(img_path)} (message_id: {message_id})")
    
    if len(image_files) > 5:
        print(f"  ... and {len(image_files) - 5} more images")
else:
    print("❌ No images found. Check the date and channel configuration.")

📊 Found 1272 images to process

📋 Sample images:
  1. 10.jpg (message_id: 10)
  2. 11.jpg (message_id: 11)
  3. 13.jpg (message_id: 13)
  4. 14.jpg (message_id: 14)
  5. 15.jpg (message_id: 15)
  ... and 1267 more images


## Initialize YOLO Model and Database Connection
Load the YOLOv8 model and connect to PostgreSQL.

In [7]:
# Initialize silent YOLO enricher
print("🤖 Loading YOLOv8 model silently...")
yolo = SilentYOLOEnricher()
yolo.load_model()

# Initialize database loader
print("🗄️ Connecting to PostgreSQL...")
loader = PostgresLoader()
loader.connect()
print("✅ Database connection established")

🤖 Loading YOLOv8 model silently...
✅ YOLO model loaded successfully
🗄️ Connecting to PostgreSQL...
Connected to PostgreSQL successfully
✅ Database connection established


## Process Images with YOLO Detection
Run object detection on each image and collect results.

In [8]:
# Process images with silent YOLO
processed_data = []
errors = []

print(f"🚀 Starting silent object detection on {len(image_files)} images...")

for i, image_path in enumerate(image_files):
    message_id = extract_message_id(image_path)
    if message_id is None:
        errors.append(f"Could not extract message_id from {image_path}")
        continue
    
    try:
        # Run silent YOLO detection
        detection_result = yolo.detect_objects(image_path)
        detections = detection_result.get('detections', [])
        
        # Log progress (only this will show)
        print(f"📸 [{i+1}/{len(image_files)}] {os.path.basename(image_path)}: {len(detections)} objects detected")
        
        # Store results
        processed_data.append({
            'message_id': message_id,
            'image_path': image_path,
            'detection_results': detection_result.get('detections', []),
            'confidence_scores': detection_result.get('confidence_scores', {})
        })
        
    except Exception as e:
        error_msg = f"Error processing {image_path}: {e}"
        print(f"❌ {error_msg}")
        errors.append(error_msg)

print(f"\n✅ Processing complete!")
print(f"📊 Successfully processed: {len(processed_data)} images")
print(f"❌ Errors: {len(errors)} images")

🚀 Starting silent object detection on 1272 images...
📸 [1/1272] 10.jpg: 0 objects detected
📸 [2/1272] 11.jpg: 0 objects detected
📸 [3/1272] 13.jpg: 1 objects detected
📸 [4/1272] 14.jpg: 0 objects detected
📸 [5/1272] 15.jpg: 0 objects detected
📸 [6/1272] 17.jpg: 1 objects detected
📸 [7/1272] 18.jpg: 0 objects detected
📸 [8/1272] 19.jpg: 2 objects detected
📸 [9/1272] 2.jpg: 0 objects detected
📸 [10/1272] 20.jpg: 2 objects detected
📸 [11/1272] 21.jpg: 0 objects detected
📸 [12/1272] 22.jpg: 0 objects detected
📸 [13/1272] 23.jpg: 1 objects detected
📸 [14/1272] 25.jpg: 2 objects detected
📸 [15/1272] 26.jpg: 1 objects detected
📸 [16/1272] 27.jpg: 0 objects detected
📸 [17/1272] 30.jpg: 0 objects detected
📸 [18/1272] 31.jpg: 1 objects detected
📸 [19/1272] 33.jpg: 2 objects detected
📸 [20/1272] 34.jpg: 0 objects detected
📸 [21/1272] 38.jpg: 0 objects detected
📸 [22/1272] 39.jpg: 7 objects detected
📸 [23/1272] 40.jpg: 6 objects detected
📸 [24/1272] 41.jpg: 0 objects detected
📸 [25/1272] 43.jpg: 1

## Save Detection Results to PostgreSQL
Store the detection results in the database.

In [9]:
if processed_data:
    try:
        print("💾 Saving results to PostgreSQL...")
        loader.load_processed_images(processed_data)
        print(f"✅ Successfully saved {len(processed_data)} detection results to database")
    except Exception as e:
        print(f"❌ Error saving to database: {e}")
else:
    print("⚠️ No data to save - no images were processed successfully")

# Close database connection
loader.disconnect()
print("🔌 Database connection closed")

💾 Saving results to PostgreSQL...
Inserted/updated 1272 processed images
✅ Successfully saved 1272 detection results to database
🔌 Database connection closed


## Summary and Analysis
Let's analyze the detection results.

In [10]:
if processed_data:
    # Analyze results
    total_detections = sum(len(item['detection_results']) for item in processed_data)
    avg_detections = total_detections / len(processed_data) if processed_data else 0
    
    # Count object classes
    class_counts = {}
    for item in processed_data:
        for detection in item['detection_results']:
            class_name = detection.get('class_name', 'unknown')
            class_counts[class_name] = class_counts.get(class_name, 0) + 1
    
    print("📈 Detection Summary:")
    print(f"   • Images processed: {len(processed_data)}")
    print(f"   • Total objects detected: {total_detections}")
    print(f"   • Average objects per image: {avg_detections:.2f}")
    
    if class_counts:
        print(f"\n🏷️ Top detected objects:")
        sorted_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)
        for class_name, count in sorted_classes[:10]:
            print(f"   • {class_name}: {count}")

if errors:
    print(f"\n❌ Errors encountered ({len(errors)}):")
    for error in errors[:5]:  # Show first 5 errors
        print(f"   • {error}")
    if len(errors) > 5:
        print(f"   ... and {len(errors) - 5} more errors")

print("\n🎉 Processing pipeline complete!")

📈 Detection Summary:
   • Images processed: 1272
   • Total objects detected: 730
   • Average objects per image: 0.57

🏷️ Top detected objects:
   • person: 281
   • bottle: 280
   • cup: 29
   • tv: 19
   • refrigerator: 16
   • cell phone: 15
   • scissors: 11
   • chair: 11
   • vase: 11
   • book: 8

🎉 Processing pipeline complete!


## Verify Database Results
Let's check that the data was saved correctly in the database.

In [11]:
# Reconnect to database for verification
loader = PostgresLoader()
loader.connect()

try:
    # Query recent processed images
    cursor = loader.conn.cursor()
    cursor.execute("""
        SELECT message_id, image_path, 
               jsonb_array_length(detection_results) as detection_count,
               processed_at
        FROM processed_images 
        WHERE processed_at >= CURRENT_DATE
        ORDER BY processed_at DESC 
        LIMIT 10
    """)
    
    results = cursor.fetchall()
    
    if results:
        print("📊 Recent database entries:")
        for row in results:
            print(f"   • Message {row[0]}: {row[2]} detections ({row[1]})")
    else:
        print("📊 No recent entries found in database")
        
except Exception as e:
    print(f"❌ Error querying database: {e}")
finally:
    cursor.close()
    loader.disconnect()

Connected to PostgreSQL successfully
📊 Recent database entries:
   • Message 11: 0 detections (data\raw\telegram_images\2025-07-14\CheMed123\11.jpg)
   • Message 13: 1 detections (data\raw\telegram_images\2025-07-14\CheMed123\13.jpg)
   • Message 14: 0 detections (data\raw\telegram_images\2025-07-14\CheMed123\14.jpg)
   • Message 15: 0 detections (data\raw\telegram_images\2025-07-14\CheMed123\15.jpg)
   • Message 17: 1 detections (data\raw\telegram_images\2025-07-14\CheMed123\17.jpg)
   • Message 18: 0 detections (data\raw\telegram_images\2025-07-14\CheMed123\18.jpg)
   • Message 19: 2 detections (data\raw\telegram_images\2025-07-14\CheMed123\19.jpg)
   • Message 2: 0 detections (data\raw\telegram_images\2025-07-14\CheMed123\2.jpg)
   • Message 20: 2 detections (data\raw\telegram_images\2025-07-14\CheMed123\20.jpg)
   • Message 10: 0 detections (data\raw\telegram_images\2025-07-14\CheMed123\10.jpg)
