In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def rename_files(folder):
    """Replace spaces with underscores in the name of every file under ``folder``.

    The tree is walked recursively. Only file names are changed; directory
    names are left alone. If a file with the underscored name already exists
    in the same directory, it is overwritten by the renamed file.

    Args:
        folder: Path of the directory tree to process.

    Raises:
        OSError: If a rename fails (for example on a read-only file system).
    """
    for root, _, files in os.walk(folder):
        for filename in files:
            if " " not in filename:
                continue
            old_path = os.path.join(root, filename)
            new_path = os.path.join(root, filename.replace(" ", "_"))
            # os.replace overwrites an existing destination in a single step.
            # Deleting the destination first and renaming afterwards would lose
            # the destination for good if the rename itself then failed.
            os.replace(old_path, new_path)

# Replace spaces in file names throughout the input tree before the images are loaded.
# NOTE(review): the boilerplate comments in the first cell describe the input
# directory as read-only; if that holds, the first file name containing a space
# will make this call raise OSError — confirm this step works on Kaggle.
rename_files("/kaggle/input")

In [None]:
import h5py
from PIL import Image, ImageOps, ImageEnhance

def adjust_contrast(img, contrast_factor):
    """Return ``img`` blended with a contrast-enhanced copy of itself.

    Args:
        img: PIL image to process.
        contrast_factor: Passed to ``ImageEnhance.Contrast.enhance`` and also
            used as the alpha of the final ``Image.blend``.

    Returns:
        The image produced by ``Image.blend``.
    """
    boosted = ImageEnhance.Contrast(img).enhance(contrast_factor)
    # NOTE(review): the same factor doubles as the blend alpha; with a value
    # above 1 the blend extrapolates beyond the enhanced image rather than
    # mixing the two — confirm this is intended.
    return Image.blend(img, boosted, contrast_factor)

# Function to read an image file and convert it to a NumPy array
def convert_jpg_to_array(image_path, contrast_factor=1.5):
    """Load an image file, adjust its contrast, and return it as a NumPy array.

    Args:
        image_path: Path of the image file to open.
        contrast_factor: Factor forwarded to ``adjust_contrast``. Defaults to
            1.5, the value that was previously hard-coded.

    Returns:
        The contrast-adjusted image as produced by ``np.array``.
    """
    # The context manager closes the underlying file even when the contrast
    # step raises, so a failing image cannot leak an open file handle.
    with Image.open(image_path) as image:
        adjusted = adjust_contrast(image, contrast_factor)
        return np.array(adjusted)
    
def convert_images_to_dataset(dataset_dir="/kaggle/input/rice-leaf-images/rice_images"):
    """Load every labelled image below ``dataset_dir`` together with its class id.

    Only files that sit directly inside one of the four class folders are
    loaded: ``_Healthy`` -> 0, ``_BrownSpot`` -> 1, ``_Hispa`` -> 2,
    ``_LeafBlast`` -> 3. Files in any other folder are skipped without being
    opened, so the two returned lists always have the same length and entry
    ``i`` of one belongs to entry ``i`` of the other.

    Args:
        dataset_dir: Directory that contains the class folders.

    Returns:
        A tuple ``(image_data, ls)``: the list of arrays returned by
        ``convert_jpg_to_array`` and the list of matching integer class ids.
    """
    label_ids = {"_Healthy": 0, "_BrownSpot": 1, "_Hispa": 2, "_LeafBlast": 3}
    image_data = []
    ls = []
    for root, _, files in os.walk(dataset_dir):
        # The folder name relative to the dataset directory is the class name.
        label_id = label_ids.get(os.path.relpath(root, dataset_dir))
        if label_id is None:
            # Not a class folder: loading its files would add images that
            # have no label and leave the two lists misaligned.
            continue
        for filename in files:
            image_data.append(convert_jpg_to_array(os.path.join(root, filename)))
            ls.append(label_id)
    return image_data, ls

# Load the dataset into memory: `image_data` is the list of per-image arrays
# and `ls` the list of integer class ids collected while walking the folders.
image_data, ls = convert_images_to_dataset()

In [None]:
# Pair each image with its class id, then shuffle the rows once so that a
# positional cut gives a random train/test partition.
if len(image_data) != len(ls):
    raise ValueError(
        "image/label count mismatch: {} images vs {} labels".format(
            len(image_data), len(ls)
        )
    )
df = pd.DataFrame()
df["image_data"] = image_data
df["label"] = ls
# Set a random_state for reproducibility
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
image_data = df["image_data"].to_list()
label = df["label"].to_list()

# Only the first 80% of the shuffled rows go into the training file. The rows
# from `cutoff` onward are the ones written to test_rice_data.h5, so writing
# every row here would leak the test samples into the training set.
cutoff = int(len(df) * 0.8)
h5_file_path = 'train_rice_data.h5'
with h5py.File(h5_file_path, 'w') as hf:
    hf.create_dataset('image_data', data=np.array(image_data[:cutoff]))
    hf.create_dataset('label', data=np.array(label[:cutoff]))
    

In [None]:
# Hold out the last 20% of the shuffled rows and store them as the test set.
cutoff = int(len(df) * 0.8)
held_out = df.iloc[cutoff:]
image_data = held_out["image_data"].to_list()
label = held_out["label"].to_list()
h5_file_path = 'test_rice_data.h5'
with h5py.File(h5_file_path, 'w') as hf:
    # One HDF5 dataset per column, written in the same order as before.
    for dataset_name, values in (('image_data', image_data), ('label', label)):
        hf.create_dataset(dataset_name, data=np.array(values))