# YOLO Object Detection with Mendeley Dataset

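This notebook demonstrates how to train a YOLO (You Only Look Once) model on an image dataset from the Mendeley Data repository. Despite the title, the pipeline below fine-tunes a YOLOv8 *classification* model (`yolov8n-cls.pt`) on class-labelled image folders rather than a detection model. We'll locate a local copy of the dataset (falling back to a small synthetic sample when none is found), preprocess it, and train a model while ensuring data integrity throughout the process.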

## Training Command Implementation
This notebook implements the exact training command requested:
```python
results = model.train(data='path/to/your/data.yaml', epochs=30, batch=16, imgsz=320)
```

In [1]:
# Import required libraries
import os
import sys
import zipfile
import requests
import yaml
import shutil
import json
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

# Import YOLO from ultralytics; if the package is missing, install it with pip
# into the interpreter running this kernel (sys.executable) and import again.
# NOTE(review): the install is unpinned and needs network access, so the
# resulting environment is not reproducible — consider pinning the version.
try:
    from ultralytics import YOLO
    print("‚úÖ Ultralytics YOLO imported successfully")
except ImportError:
    print("üì¶ Installing ultralytics...")
    # Imported here because it is only needed on the install path.
    import subprocess
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "ultralytics"])
    from ultralytics import YOLO
    print("‚úÖ Ultralytics YOLO installed and imported")

# Reached only when YOLO is importable (either branch above succeeded).
print(f"üöÄ Setup complete! Starting YOLO training pipeline...")

‚úÖ Ultralytics YOLO imported successfully
üöÄ Setup complete! Starting YOLO training pipeline...


In [8]:
class DatasetManager:
    """
    Manages dataset discovery, organization, and bookkeeping.

    Everything is kept under ``base_dir``:

    - ``downloads/``, ``raw_data/``, ``processed_data/``: created on
      construction; no method of this class writes into them.
    - ``original_data/``: one sub-folder per class holding the image copies.
    - ``dataset_log.txt``: timestamped copy of every ``log`` message.
    - ``dataset_info.json``: summary of the most recently organized dataset.

    NOTE: files already present in ``original_data/`` are never removed, so
    images left over from an earlier run (e.g. a sample dataset) stay in the
    class folders and are included in the reported counts.
    """

    # Lower-case file suffixes that are treated as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, base_dir="yolo_project"):
        """Create the project directory tree under ``base_dir`` and start the log."""
        self.base_dir = Path(base_dir)
        self.downloads_dir = self.base_dir / "downloads"
        self.raw_data_dir = self.base_dir / "raw_data"
        self.processed_dir = self.base_dir / "processed_data"
        self.original_data = self.base_dir / "original_data"

        # Create directories
        for dir_path in (self.downloads_dir, self.raw_data_dir, self.processed_dir, self.original_data):
            dir_path.mkdir(parents=True, exist_ok=True)

        self.log_file = self.base_dir / "dataset_log.txt"
        self.log("DatasetManager initialized")

    def log(self, message):
        """Print ``message`` with a timestamp and append it to the log file."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_message = f"[{timestamp}] {message}"
        print(log_message)

        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(log_message + "\n")

    def _list_images(self, directory):
        """
        Return the image files directly inside ``directory``, sorted by path.

        Matching is done on the lower-cased suffix so that ``.JPG``/``.PNG``
        files are found on case-sensitive filesystems too, and sorting makes
        the result independent of the filesystem's listing order.
        """
        return sorted(
            entry for entry in Path(directory).iterdir()
            if entry.is_file() and entry.suffix.lower() in self.IMAGE_EXTENSIONS
        )

    def _save_dataset_info(self, class_counts, dataset_type):
        """Build the dataset summary, write it to ``dataset_info.json`` and return it."""
        dataset_info = {
            'total_images': sum(class_counts.values()),
            'num_classes': len(class_counts),
            'class_counts': class_counts,
            'class_names': list(class_counts.keys()),
            'organized_date': datetime.now().isoformat(),
            'dataset_type': dataset_type
        }

        with open(self.base_dir / 'dataset_info.json', 'w', encoding='utf-8') as f:
            json.dump(dataset_info, f, indent=2)

        return dataset_info

    def create_sample_dataset(self, num_samples=10):
        """
        Create a sample dataset for testing when real dataset is not available.

        Writes ``num_samples`` solid-colour 224x224 JPEGs per class into
        ``original_data/<class>/`` and returns the dataset summary dict.
        """
        self.log("Creating sample dataset for testing...")

        # One flat colour per class so the classes are trivially separable.
        sample_colors = {
            'Negative': (100, 255, 100),   # Greenish
            'Positive': (255, 100, 100),   # Reddish
            'Uncertain': (100, 100, 255),  # Bluish
        }
        class_counts = {}

        for class_name, color in sample_colors.items():
            class_dir = self.original_data / class_name
            class_dir.mkdir(exist_ok=True)

            # Every sample of a class is identical, so build the image once.
            img = Image.fromarray(np.full((224, 224, 3), color, dtype=np.uint8))
            for i in range(num_samples):
                img.save(class_dir / f"{class_name.lower()}_{i+1:03d}.jpg")

            class_counts[class_name] = num_samples
            self.log(f"  Created {num_samples} sample images for {class_name}")

        total_images = sum(class_counts.values())
        self.log(f"‚úÖ Created sample dataset with {total_images} images across {len(class_counts)} classes")

        return self._save_dataset_info(class_counts, 'sample')

    def download_mendeley_dataset(self, url=None):
        """
        Locate a local copy of the dataset, with fallback to a sample dataset.

        ``url`` is accepted for interface compatibility but is currently
        unused: nothing is downloaded. Each candidate path is checked in
        order; a candidate is used if it holds class sub-folders with images,
        or images directly. A candidate that cannot be organized is skipped
        and the search continues. If no candidate works, a synthetic sample
        dataset is created.

        Returns the dataset summary dict of whichever dataset was organized.
        """
        self.log("Starting dataset acquisition...")

        # Expanded list of possible dataset locations - including the found dataset!
        possible_paths = [
            # Found dataset in current directory!
            Path("Image Dataset of Clinical Urine Test Results on Petri Dishes"),

            # Original paths
            Path("../Clinical Urine Test Strips/Clinical Urine Test Strips"),
            Path("Clinical Urine Test Strips"),
            Path("../Clinical Urine Test Strips"),
            Path("./Clinical Urine Test Strips/Clinical Urine Test Strips"),

            # Additional common paths
            Path("../../Clinical Urine Test Strips"),
            Path("./Clinical Urine Test Strips"),
            Path("../../../Clinical Urine Test Strips"),
            Path("data/Clinical Urine Test Strips"),
            Path("datasets/Clinical Urine Test Strips"),

            # Alternative naming
            Path("../clinical_urine_test_strips"),
            Path("clinical_urine_test_strips"),
            Path("urine_test_strips"),
            Path("../urine_test_strips"),

            # Current directory check for any folders with medical/urine keywords
            Path("./medical_images"),
            Path("./urine_images"),
            Path("./clinical_data")
        ]

        self.log("üîç Searching for dataset in multiple locations...")

        for i, path in enumerate(possible_paths, 1):
            self.log(f"  {i:2d}. Checking: {path}")

            if not path.is_dir():
                continue

            # Sub-folders that actually contain images are treated as classes.
            image_class_dirs = [
                d for d in sorted(path.iterdir())
                if d.is_dir() and self._list_images(d)
            ]
            if image_class_dirs:
                self.log(f"‚úÖ Found dataset with image classes at: {path}")
                self.log(f"   Classes found: {[d.name for d in image_class_dirs]}")
                dataset_info = self.organize_local_data(path)
                if dataset_info:
                    return dataset_info
                continue

            # No usable class folders: fall back to images lying directly in
            # the folder, even if it also has (image-free) sub-directories.
            image_files = self._list_images(path)
            if image_files:
                self.log(f"‚úÖ Found images directly in: {path}")
                self.log(f"   Found {len(image_files)} images")
                dataset_info = self.organize_direct_images(path)
                if dataset_info:
                    return dataset_info

        self.log("‚ùå Dataset not found in any expected locations")
        self.log("üéØ Available options:")
        self.log("   1. Place dataset in one of the searched paths above")
        self.log("   2. Create sample dataset for testing")

        # Automatically create sample dataset for seamless experience
        self.log("üöÄ Creating sample dataset automatically for testing...")
        return self.create_sample_dataset()

    def organize_direct_images(self, source_path):
        """
        Organize images that are directly in a folder into classes.

        WARNING: the images carry no label information, so they are simply
        split into equal consecutive chunks (by sorted file name) and each
        chunk is assigned to one class. The resulting labels are NOT ground
        truth and are only suitable for smoke-testing the pipeline.

        Returns the dataset summary dict, or ``False`` when there are fewer
        images than classes.
        """
        self.log(f"Organizing direct images from: {source_path}")

        image_files = self._list_images(source_path)
        class_names = ['Negative', 'Positive', 'Uncertain']

        if len(image_files) < len(class_names):
            self.log("‚ùå Not enough images found")
            return False

        self.log("WARNING: images are assigned to classes by position only; these labels are not ground truth")

        class_counts = {}
        images_per_class = len(image_files) // len(class_names)
        last_class_idx = len(class_names) - 1

        for i, class_name in enumerate(class_names):
            class_dir = self.original_data / class_name
            class_dir.mkdir(exist_ok=True)

            # The last class also receives the remainder of the division.
            start_idx = i * images_per_class
            end_idx = start_idx + images_per_class if i < last_class_idx else len(image_files)
            class_images = image_files[start_idx:end_idx]

            for img_file in class_images:
                target_file = class_dir / img_file.name
                if not target_file.exists():
                    shutil.copy2(img_file, target_file)

            class_counts[class_name] = len(class_images)
            self.log(f"  {class_name}: {len(class_images)} images")

        total_images = sum(class_counts.values())
        self.log(f"‚úÖ Organized {total_images} images across {len(class_counts)} classes")

        return self._save_dataset_info(class_counts, 'organized_from_direct')

    def organize_local_data(self, source_path):
        """
        Copy a class-per-folder dataset into ``original_data/<class>/``.

        Every sub-folder of ``source_path`` that contains at least one image
        becomes a class; sub-folders without images are skipped so they do
        not show up as empty classes. Existing target files are not
        overwritten. The per-class count is taken from the target folder
        after copying.

        Returns the dataset summary dict, or ``False`` when no class with
        images was found.
        """
        self.log(f"Organizing data from: {source_path}")

        # Sorted so class order does not depend on filesystem listing order.
        class_dirs = sorted(d for d in Path(source_path).iterdir() if d.is_dir())

        if not class_dirs:
            self.log("‚ùå No class directories found")
            return False

        class_counts = {}

        for class_dir in class_dirs:
            image_files = self._list_images(class_dir)
            if not image_files:
                self.log(f"  Skipping {class_dir.name}: no images found")
                continue

            class_name = class_dir.name
            target_dir = self.original_data / class_name
            target_dir.mkdir(exist_ok=True)

            # Copy images
            for img_file in image_files:
                target_file = target_dir / img_file.name
                if not target_file.exists():
                    shutil.copy2(img_file, target_file)

            class_count = len(self._list_images(target_dir))
            class_counts[class_name] = class_count

            self.log(f"  {class_name}: {class_count} images")

        if not class_counts:
            self.log("‚ùå No images found in any class directory")
            return False

        total_images = sum(class_counts.values())
        self.log(f"‚úÖ Organized {total_images} images across {len(class_counts)} classes")

        return self._save_dataset_info(class_counts, 'real')

In [12]:
class YOLODataProcessor:
    """
    Splits the organized class folders into train/val/test sets on disk.

    Reads images from ``dataset_manager.original_data/<class>/`` and writes
    copies to ``dataset_manager.processed_dir/<split>/images/<class>/``,
    together with a ``dataset.yaml`` describing the splits.
    """

    def __init__(self, dataset_manager, seed=42):
        """
        Args:
            dataset_manager: object providing ``processed_dir``,
                ``original_data`` and ``log(message)``.
            seed: seed for the shuffle that precedes the split. The default
                makes repeated runs produce the same split; pass ``None``
                for a different split on every run.
        """
        self.dm = dataset_manager
        self.train_split = 0.7
        self.val_split = 0.2
        self.test_split = 0.1  # informational: test receives whatever train/val leave over
        self.seed = seed

    def create_yolo_dataset(self, dataset_info):
        """
        Create the train/val/test folder structure and its YAML description.

        The split is stratified: each class is shuffled and divided on its
        own, so every split keeps roughly the class balance of the full
        dataset. The split folders are rebuilt from scratch on every call so
        that files from an earlier run (with a different split) cannot leave
        the same image in more than one split.

        Args:
            dataset_info: dict with a ``'class_names'`` list; the position of
                a name in that list is its class index.

        Returns:
            dict with ``yaml_path``, ``class_mapping``, ``split_counts``,
            ``total_files`` and ``class_names``.
        """
        self.dm.log("Creating YOLO classification dataset structure...")

        # Create YOLO directory structure for classification
        splits = ['train', 'val', 'test']
        for split in splits:
            images_dir = self.dm.processed_dir / split / 'images'
            # Drop output of previous runs: stale copies would otherwise
            # accumulate and could put one image into several splits.
            if images_dir.exists():
                shutil.rmtree(images_dir)
            images_dir.mkdir(parents=True, exist_ok=True)

        rng = np.random.default_rng(self.seed)
        image_suffixes = ('.jpg', '.jpeg', '.png')

        class_mapping = {}
        split_data = {split: [] for split in splits}

        # Process each class
        for idx, class_name in enumerate(dataset_info['class_names']):
            class_mapping[class_name] = idx
            class_dir = self.dm.original_data / class_name

            if not class_dir.exists():
                self.dm.log(f"‚ö†Ô∏è Class directory not found: {class_dir}")
                continue

            # Sorted listing + seeded permutation => reproducible order.
            image_files = sorted(
                entry for entry in class_dir.iterdir()
                if entry.is_file() and entry.suffix.lower() in image_suffixes
            )
            order = rng.permutation(len(image_files))
            class_files = [
                {'path': image_files[pos], 'class_name': class_name, 'class_idx': idx}
                for pos in order
            ]

            # Split this class on its own (stratified split).
            n_train = int(len(class_files) * self.train_split)
            n_val = int(len(class_files) * self.val_split)

            split_data['train'].extend(class_files[:n_train])
            split_data['val'].extend(class_files[n_train:n_train + n_val])
            split_data['test'].extend(class_files[n_train + n_val:])

        # Copy files to appropriate directories
        for split_name, files in split_data.items():
            split_dir = self.dm.processed_dir / split_name / 'images'

            for file_info in files:
                src_path = file_info['path']

                # Create class subdirectory in split
                class_split_dir = split_dir / file_info['class_name']
                class_split_dir.mkdir(exist_ok=True)

                dst_path = class_split_dir / src_path.name

                if not dst_path.exists():
                    shutil.copy2(src_path, dst_path)

        train_files = split_data['train']
        val_files = split_data['val']
        test_files = split_data['test']
        self.dm.log(f"‚úÖ Dataset split: Train={len(train_files)}, Val={len(val_files)}, Test={len(test_files)}")

        # Create YAML configuration for YOLO
        # NOTE(review): this uses detection-style keys (path/train/val/test/
        # nc/names); confirm the training step actually consumes this file.
        yaml_config = {
            'path': str(self.dm.processed_dir.absolute()),
            'train': 'train/images',
            'val': 'val/images',
            'test': 'test/images',
            'nc': len(dataset_info['class_names']),
            'names': dataset_info.get('class_names', ['object'])
        }

        yaml_path = self.dm.processed_dir / 'dataset.yaml'
        with open(yaml_path, 'w', encoding='utf-8') as f:
            yaml.dump(yaml_config, f, default_flow_style=False)

        self.dm.log(f"Created YOLO config: {yaml_path}")

        return {
            'yaml_path': str(yaml_path),
            'class_mapping': class_mapping,
            'split_counts': {k: len(v) for k, v in split_data.items()},
            'total_files': sum(len(v) for v in split_data.values()),
            'class_names': list(class_mapping.keys())
        }

In [10]:
class YOLOTrainer:
    """
    Handles YOLO model loading and training.

    Training runs are written below ``<base_dir>/models``.
    """

    def __init__(self, dataset_manager):
        """
        Args:
            dataset_manager: object providing ``base_dir``, ``processed_dir``,
                ``original_data`` and ``log(message)``.
        """
        self.dm = dataset_manager
        self.models_dir = self.dm.base_dir / "models"
        # parents=True: do not rely on base_dir having been created already.
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def setup_training(self, task_type="classify"):
        """
        Load the pretrained model for ``task_type``.

        Args:
            task_type: ``"classify"`` loads ``yolov8n-cls.pt``; any other
                value loads the detection weights ``yolov8n.pt``.

        Returns:
            ``(model, dataset_yaml_path)`` on success, or ``None`` when
            ``processed_dir/dataset.yaml`` does not exist. Callers must
            check for ``None`` before unpacking.
        """
        self.dm.log(f"Setting up YOLO training for {task_type} task...")

        # Check for dataset configuration
        dataset_yaml = self.dm.processed_dir / 'dataset.yaml'
        if not dataset_yaml.exists():
            print("‚ùå Dataset YAML not found. Please run data processing first.")
            return None

        # Load appropriate YOLO model for classification
        if task_type == "classify":
            model = YOLO('yolov8n-cls.pt')  # Classification model
            self.dm.log("üì¶ Loaded YOLOv8n classification model")
        else:
            model = YOLO('yolov8n.pt')  # Detection model
            self.dm.log("üì¶ Loaded YOLOv8n detection model")

        print(f"‚úÖ Model loaded: {model.model}")
        print(f"‚úÖ Dataset configuration: {dataset_yaml}")

        return model, str(dataset_yaml.absolute())

    def train_model(self, model, data_path, **training_params):
        """
        Train the YOLO model.

        This implements: results = model.train(data='path/to/your/data.yaml', epochs=30, batch=16, imgsz=320)

        The dataset passed to ``model.train`` is chosen as follows:

        1. ``data=...`` in ``training_params``, if given;
        2. otherwise ``data_path``, if it is an existing directory;
        3. otherwise the ``original_data`` class-folder directory.

        Args:
            model: object exposing ``train(**kwargs)``.
            data_path: dataset location as returned by ``setup_training``.
                A path that is not a directory (such as the YAML file) is
                ignored in favour of ``original_data``.
            **training_params: overrides for the default training arguments.

        Returns:
            dict with ``success``; on success also ``results``,
            ``model_path``, ``training_params``, ``dataset_path`` and
            ``data_source`` (the dataset actually trained on); on failure
            ``error`` with the exception text.
        """
        self.dm.log("üöÄ Starting YOLO model training...")

        # Set default training parameters
        params = {
            'epochs': 30,
            'batch': 16,
            'imgsz': 320,
            'project': str(self.models_dir),
            'name': f'yolo_classification_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
            'save': True,
            'save_period': 5,
            'exist_ok': True,
            'pretrained': True,
            'optimize': False,
            'verbose': True
        }

        # Update with user parameters
        params.update(training_params)

        # 'data' must be passed exactly once: take it out of the keyword
        # overrides so it cannot collide with the explicit argument below.
        explicit_data = params.pop('data', None)
        if explicit_data is not None:
            data_source = str(explicit_data)
        elif data_path and Path(data_path).is_dir():
            data_source = str(data_path)
        else:
            # NOTE(review): original_data holds the class folders directly,
            # without train/val sub-folders — confirm the installed
            # Ultralytics version accepts this layout for classification.
            data_source = str(self.dm.original_data.absolute())

        self.dm.log(f"üéØ Training configuration: {params}")

        try:
            # This is the exact command requested:
            # results = model.train(data='path/to/your/data.yaml', epochs=30, batch=16, imgsz=320)
            results = model.train(data=data_source, **params)

            self.dm.log("‚úÖ Training completed successfully!")

            return {
                'results': results,
                'model_path': getattr(results, 'save_dir', None),
                'training_params': params,
                'dataset_path': str(self.dm.processed_dir / 'dataset.yaml'),
                'data_source': data_source,
                'success': True
            }

        except Exception as e:
            # Best effort: report the failure to the caller instead of raising.
            self.dm.log(f"‚ùå Training failed: {str(e)}")
            return {
                'error': str(e),
                'success': False
            }

## Execute the Training Pipeline

Run the actual training command. This cell executes the exact command you mentioned: `results = model.train(data='path/to/your/data.yaml', epochs=30, batch=16, imgsz=320)`

## Dataset Setup Options

If you don't have the clinical urine test dataset, you have several options:

### Option 1: Use Sample Dataset (Recommended for Testing)
The code below will automatically create a small sample dataset for testing the training pipeline.

### Option 2: Provide Real Dataset  
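Place your clinical urine test dataset in one of these locations (the notebook checks several more; see `download_mendeley_dataset`):
- `Image Dataset of Clinical Urine Test Results on Petri Dishes/` (checked first)
- `../Clinical Urine Test Strips/`
- `Clinical Urine Test Strips/`
- `./Clinical Urine Test Strips/Clinical Urine Test Strips/`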

The dataset should have the following structure:
```
Clinical Urine Test Strips/
├── Positive/
│   ├── image1.jpg
│   └── image2.jpg
├── Negative/
│   ├── image1.jpg
│   └── image2.jpg
└── Uncertain/
    ├── image1.jpg
    └── image2.jpg
```

### Option 3: Download Sample Dataset
You can also download a sample medical image dataset from public repositories.

In [6]:
# Quick Dataset Check and Setup
# Lists the working directory and the directories of its parent so the reader
# can see where a dataset folder might be. Purely informational: nothing is
# created or modified here.
print("üîç Checking for available datasets...")

# Check current directory contents
current_dir = Path(".")
print(f"\nüìÅ Current directory: {current_dir.absolute()}")
print("Contents:")
for item in current_dir.iterdir():
    if item.is_dir():
        print(f"   üìÅ {item.name}/")
    else:
        print(f"   üìÑ {item.name}")

# Check parent directory (directories only; likely dataset folders are starred)
parent_dir = Path("..")
print(f"\nüìÅ Parent directory: {parent_dir.absolute()}")
print("Contents:")
try:
    for item in parent_dir.iterdir():
        if not item.is_dir():
            continue
        name = item.name.lower()
        if 'clinical' in name or 'urine' in name:
            print(f"   üìÅ {item.name}/ ‚≠ê (Potential dataset)")
        else:
            print(f"   üìÅ {item.name}/")
except OSError:
    # Only filesystem errors (e.g. missing permissions) are expected here;
    # a bare except would also hide KeyboardInterrupt and real bugs.
    print("   ‚ùå Cannot access parent directory")

print("\nüí° If no dataset found, the training will offer to create a sample dataset for testing.")

üîç Checking for available datasets...

üìÅ Current directory: c:\fit_fest_2025_entrepreneurchallenge\repo\CKDAnalysis
Contents:
   üìÅ .git/
   üìÑ .gitignore
   üìÅ .venv/
   üìÑ ANALYSIS_SUMMARY.md
   üìÅ catboost_info/
   üìÑ ckd_dataset_modified_uci_complete.csv
   üìÑ ckd_dataset_modified_uci_complete.xlsx
   üìÑ ckd_dataset_original_uci_complete.csv
   üìÑ ckd_dataset_original_uci_complete.xlsx
   üìÑ ckd_risk_analysis.py
   üìÑ ckd_risk_factor_analysis.html
   üìÑ ckd_risk_factor_analysis.ipynb
   üìÑ COPILOT_PROMPT.md
   üìÅ Image Dataset of Clinical Urine Test Results on Petri Dishes/
   üìÑ Image Dataset of Clinical Urine Test Results on Petri Dishes.zip
   üìÑ IMPLEMENTATION_SUMMARY.md
   üìÑ kidney_risk_analysis_results_20251120.csv
   üìÑ kidney_risk_analysis_results_synthetic_20251120.csv
   üìÑ kidney_risk_complete_analysis_20251120_fixed.ipynb
   üìÅ model_usage_examples/
   üìÑ README.md
   üìÑ requirements.txt
   üìÑ solution1_creatinine_pred

In [13]:
# Initialize the training pipeline
# Runs the four stages end to end: organize the dataset, split it, load the
# model, train. Any stage that fails is reported instead of raising.
print("üöÄ Initializing YOLO Training Pipeline")
print("=" * 50)

# Step 1: Setup dataset manager
dm = DatasetManager()

# Step 2: Download/organize dataset
dataset_info = dm.download_mendeley_dataset()

if dataset_info:
    print("\nüìä Dataset Summary:")
    print(f"   Total Images: {dataset_info['total_images']}")
    print(f"   Classes: {dataset_info['num_classes']}")
    for class_name, count in dataset_info['class_counts'].items():
        print(f"     {class_name}: {count} images")

    # Step 3: Process data for YOLO
    processor = YOLODataProcessor(dm)
    yolo_data = processor.create_yolo_dataset(dataset_info)

    print("\nüìã YOLO Dataset Created:")
    print(f"   Config: {yolo_data['yaml_path']}")
    print(f"   Train: {yolo_data['split_counts']['train']} images")
    print(f"   Val: {yolo_data['split_counts']['val']} images")
    print(f"   Test: {yolo_data['split_counts']['test']} images")

    # Step 4: Setup and run training
    trainer = YOLOTrainer(dm)
    training_setup = trainer.setup_training(task_type="classify")

    if training_setup is None:
        # setup_training returns None (not a tuple) when the dataset YAML is
        # missing; unpacking it directly would raise a TypeError. Route the
        # failure through the same reporting branch as a failed training run.
        training_results = {
            'success': False,
            'error': 'Training setup failed: dataset YAML not found'
        }
    else:
        model, data_path = training_setup

        print("\nüéØ Starting Training...")
        print(f"üìä Using dataset: {yolo_data['yaml_path']}")
        print("üîß Model: YOLOv8n Classification")

        # Execute the exact training command:
        # results = model.train(data='path/to/your/data.yaml', epochs=30, batch=16, imgsz=320)
        training_results = trainer.train_model(
            model,
            data_path,
            epochs=30,
            batch=16,
            imgsz=320
        )

    if training_results['success']:
        print("\nüéâ Training Completed Successfully!")
        print(f"üìÅ Model saved to: {training_results.get('model_path', 'Unknown')}")

        # Display training summary
        print("\nüìä Training Summary:")
        print(f"   Epochs: {training_results['training_params']['epochs']}")
        print(f"   Batch Size: {training_results['training_params']['batch']}")
        print(f"   Image Size: {training_results['training_params']['imgsz']}")
        print(f"   Dataset: {yolo_data['class_names']}")

    else:
        print(f"\n‚ùå Training Failed: {training_results.get('error', 'Unknown error')}")

else:
    print("\n‚ùå Dataset setup failed. Please check dataset availability.")

üöÄ Initializing YOLO Training Pipeline
[2025-11-22 19:40:46] DatasetManager initialized
[2025-11-22 19:40:46] Starting dataset acquisition...
[2025-11-22 19:40:46] üîç Searching for dataset in multiple locations...
[2025-11-22 19:40:46]    1. Checking: Image Dataset of Clinical Urine Test Results on Petri Dishes
[2025-11-22 19:40:46] ‚úÖ Found dataset with image classes at: Image Dataset of Clinical Urine Test Results on Petri Dishes
[2025-11-22 19:40:46]    Classes found: ['Negative', 'Positive', 'Uncertain']
[2025-11-22 19:40:46] Organizing data from: Image Dataset of Clinical Urine Test Results on Petri Dishes
[2025-11-22 19:40:46]   Negative: 500 images
[2025-11-22 19:40:47]   Positive: 498 images
[2025-11-22 19:40:47]   Uncertain: 502 images
[2025-11-22 19:40:47] ‚úÖ Organized 1500 images across 3 classes

üìä Dataset Summary:
   Total Images: 1500
   Classes: 3
     Negative: 500 images
     Positive: 498 images
     Uncertain: 502 images
[2025-11-22 19:40:47] Creating YOLO c