In [4]:
import os
import random
import shutil
from pathlib import Path

# Set random seed for reproducibility
random.seed(42)

# Define source and destination directories
SOURCE_DIR = 'crop pictures/data'
DEST_DIR = 'crop pictures'

# Define split ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Ensure total = 1.0
assert abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-6, "Ratios must sum to 1.0"

# Get class names (folder names)
classes = [d for d in os.listdir(SOURCE_DIR) if os.path.isdir(os.path.join(SOURCE_DIR, d))]

# Create output directory structure
for split in ['train', 'val', 'test']:
    for class_name in classes:
        Path(os.path.join(DEST_DIR, split, class_name)).mkdir(parents=True, exist_ok=True)

# Process each class
for class_name in classes:
    class_path = os.path.join(SOURCE_DIR, class_name)
    images = os.listdir(class_path)
    images = [img for img in images if img.lower().endswith(('.jpg', '.jpeg', '.png'))]
    random.shuffle(images)

    total = len(images)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    train_imgs = images[:train_end]
    val_imgs = images[train_end:val_end]
    test_imgs = images[val_end:]

    # Copy files
    for img_name in train_imgs:
        shutil.copy(os.path.join(class_path, img_name), os.path.join(DEST_DIR, 'train', class_name, img_name))
    for img_name in val_imgs:
        shutil.copy(os.path.join(class_path, img_name), os.path.join(DEST_DIR, 'val', class_name, img_name))
    for img_name in test_imgs:
        shutil.copy(os.path.join(class_path, img_name), os.path.join(DEST_DIR, 'test', class_name, img_name))

print("✅ Dataset split into train, val, and test sets.")


✅ Dataset split into train, val, and test sets.


In [4]:
# Check for duplicate images across splits
import os
import random
import shutil
from pathlib import Path

# Set random seed for reproducibility
random.seed(42)

# Define source and destination directories
SOURCE_DIR = 'crop pictures/data'
DEST_DIR = 'crop pictures'

# Define split ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Ensure total = 1.0
assert abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-6, "Ratios must sum to 1.0"

# Get class names (folder names)
classes = [d for d in os.listdir(SOURCE_DIR) if os.path.isdir(os.path.join(SOURCE_DIR, d))]


def check_duplicates_across_splits():
    splits = ['train', 'val_pre_updated', 'test']
    image_sets = {split: set() for split in splits}
    
    for split in splits:
        for class_name in classes:
            split_path = os.path.join(DEST_DIR, split, class_name)
            if os.path.exists(split_path):
                images = os.listdir(split_path)
                image_sets[split].update(images)
    
    # Check for duplicates between train and val/test
    train_val_duplicates = image_sets['train'] & image_sets['val_pre_updated']
    train_test_duplicates = image_sets['train'] & image_sets['test']
    
    if train_val_duplicates:
        print(f"⚠️ Found {len(train_val_duplicates)} duplicate images between train and val sets")
    else:
        print("✅ No duplicate images between train and val sets")
    
    if train_test_duplicates:
        print(f"⚠️ Found {len(train_test_duplicates)} duplicate images between train and test sets")
    else:
        print("✅ No duplicate images between train and test sets")

check_duplicates_across_splits()


✅ No duplicate images between train and val sets
✅ No duplicate images between train and test sets


In [3]:
import pandas as pd

# Create a sample submission DataFrame with example data
sample_submission = pd.DataFrame({
    'ID': ['image_000001.jpg', 'image_000002.JPG', 'image_000003.JPG', 
           'image_000004.JPG', 'image_000005.JPG'],
    'Blight': [0.73140, 0.00108, 0.00000, 0.26471, 0.00045],
    'Common_Rust': [0.01391, 0.00085, 1.00000, 0.03529, 0.00035],
    'Gray_Leaf_Spot': [0.23257, 0.00012, 0.00000, 0.06836, 0.00012],
    'Healthy': [0.02212, 0.99795, 0.00000, 0.63164, 0.99908]
})

# Save the sample submission file
sample_submission.to_csv('sample_submission.csv', index=False)
print("Sample submission file 'sample_submission.csv' created.")


Sample submission file 'sample_submission.csv' created.


In [None]:
# 1. Load ground truth and predictions
ground_truth = pd.read_csv('ground_truth.csv')  # Columns: ID, True_Label
    # Your confidence scores output

# 2. Merge them on filename
combined = pd.merge(confidence_df, ground_truth, on='ID')

# 3. Calculate accuracy metrics
from sklearn.metrics import classification_report, accuracy_score

# Get predicted labels (class with highest confidence)
combined['Predicted'] = combined[['Blight','Common_Rust','Gray_Leaf_Spot','Healthy']].idxmax(axis=1)

# Calculate overall accuracy
accuracy = accuracy_score(combined['True_Label'], combined['Predicted'])
print(f"Overall Accuracy: {accuracy:.2%}")

# Detailed class-wise metrics
print("\nClassification Report:")
print(classification_report(
    combined['True_Label'], 
    combined['Predicted'],
    target_names=['Blight','Common_Rust','Gray_Leaf_Spot','Healthy']
))

In [2]:
import os
import csv

# Create train.csv
train_data = []
train_dir = "crop pictures/train"
for label in ["Blight", "Common_Rust", "Gray_Leaf_Spot", "Healthy"]:
    label_dir = os.path.join(train_dir, label)
    for filename in os.listdir(label_dir):
        if filename.endswith(('.jpg', '.JPG')):
            train_data.append([filename, label])

with open('Train.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['image_id', 'label'])  # header
    writer.writerows(train_data)

# Create val.csv
val_data = []
val_dir = "crop pictures/val"
for label in ["Blight", "Common_Rust", "Gray_Leaf_Spot", "Healthy"]:
    label_dir = os.path.join(val_dir, label)
    for filename in os.listdir(label_dir):
        if filename.endswith(('.jpg', '.JPG')):
            val_data.append([filename, label])

with open('Val.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['image_id', 'label'])  # header
    writer.writerows(val_data)

# Create test.csv
test_dir = "crop pictures/test"
test_data = []
for filename in os.listdir(test_dir):
    if filename.endswith(('.jpg', '.JPG')):
        test_data.append([filename])

with open('Test.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['image_id'])  # header
    writer.writerows(test_data)