Custom VLM dataset for Crop Disease dataset named PlantVillage(plant disease classification)

In [1]:
import os
import random
import shutil

# Path to the source dataset directory
source_train_dir = 'PlantVillage-Dataset/train/'

# Create directories for custom dataset
custom_images_dir = 'custom_vlm_dataset/images/'
custom_labels_dir = 'custom_vlm_dataset/labels/'
os.makedirs(custom_images_dir, exist_ok=True)
os.makedirs(custom_labels_dir, exist_ok=True)

# Number of samples to use for the custom VLM dataset
num_samples = 50

# Function to create custom dataset
def create_custom_vlm_dataset(source_train_dir, custom_images_dir, custom_labels_dir, num_samples):
    # Go through each plant disease folder in the source dataset
    for disease_class in os.listdir(source_train_dir):
        disease_class_path = os.path.join(source_train_dir, disease_class)

        if os.path.isdir(disease_class_path):
            # Get a list of image files for the current disease class
            image_files = os.listdir(disease_class_path)

            # Randomly select a subset of images for the current disease
            selected_files = random.sample(image_files, num_samples)

            for file in selected_files:
                # Define paths for copying images and creating text files
                img_src_path = os.path.join(disease_class_path, file)
                img_dst_path = os.path.join(custom_images_dir, file)

                # Copy the image to the custom images directory
                shutil.copy(img_src_path, img_dst_path)

                # Create a text file with the disease class name as the label
                label_txt_path = os.path.join(custom_labels_dir, file.split('.')[0] + '.txt')
                with open(label_txt_path, 'w') as label_file:
                    label_file.write(disease_class.replace('_', ' '))  # Write disease name as the label

# Create the custom VLM dataset
create_custom_vlm_dataset(source_train_dir, custom_images_dir, custom_labels_dir, num_samples)

print(f"Custom VLM dataset created with {num_samples} images for each class.")


Custom VLM dataset created with 50 images for each class.


In [11]:
from transformers import pipeline

# Load pre-trained model for text generation
generator = pipeline("text-generation", model="gpt2")

def generate_description(label):
    prompt = f"Provide a detailed description of the plant disease '{label}', including its symptoms, causes, and recommended control measures or treatments."
    try:
        response = generator(prompt, max_length=200)
        return response[0]['generated_text'].strip()
    except Exception as e:
        print(f"Error generating description for {label}: {e}")
        return "Description not available"


In [12]:
from PIL import Image
import os

# Directory paths
image_dir = 'custom_vlm_dataset/images/'
label_dir = 'custom_vlm_dataset/labels/'

# Get all image files
image_files = os.listdir(image_dir)

# Load image-text pairs
image_text_pairs = []
for img_file in image_files:
    # Load image
    img_path = os.path.join(image_dir, img_file)
    img = Image.open(img_path)

    # Load corresponding label
    label_path = os.path.join(label_dir, img_file.split('.')[0] + '.txt')
    with open(label_path, 'r') as label_file:
        label = label_file.read().strip()

    # Generate description for the label using LLM
    description = generate_description(label)
    
    # Append image, label, and description to the data list
    image_text_pairs.append({"image_name": img_file, "label": label, "description": description})

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for 

KeyboardInterrupt: 

In [13]:
len(image_text_pairs)

693

In [1]:
import pandas as pd

# Create a DataFrame
df = pd.DataFrame(image_text_pairs)

# Save as CSV
#df.to_csv('custom_dataset/labels.csv', index=False)
df[0]

NameError: name 'image_text_pairs' is not defined