In [1]:
import os
import pandas as pd
from PIL import Image
import numpy as np

# Paths to dataset folders
base_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat-Dataset\images'
folders = ['train', 'test', 'val']  # The three splits

# Initialize an empty list to hold data
data = []

# Process each folder
for split in folders:
    folder_path = os.path.join(base_path, split)
    for file_name in os.listdir(folder_path):
        if file_name.endswith(('jpg', 'png', 'jpeg')):  # Ensure it's an image file
            # Extract label from the file name
            label = file_name.split('-')[0]  # Assuming label is before the first underscore
            
            # Load the image and convert to pixel values
            image_path = os.path.join(folder_path, file_name)
            image = Image.open(image_path).convert('RGB')  # Ensure RGB format
            pixel_values = np.array(image).flatten()  # Flatten pixel values into a 1D array
            
            # Append data to the list
            data.append({
                'split': split,
                'label': label,
                **{f'pixel_{i}': val for i, val in enumerate(pixel_values)}
            })

# Convert the list of data to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
output_path = 'HydroFloat_dataset.csv'
df.to_csv(output_path, index=False)
print(f"CSV file created successfully at: {output_path}")


MemoryError: 

In [2]:
#Memory error solved

import os
import csv
from PIL import Image
import numpy as np

# Paths to dataset folders
base_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat-Dataset\images'
folders = ['train', 'test', 'val']  # The three splits

# Output CSV file
output_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset.csv'

# Initialize the CSV file and write headers
with open(output_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['split', 'label'] + [f'pixel_{i}' for i in range(3 * 224 * 224)])  # Adjust resolution if necessary

# Process each folder
    for split in folders:
        folder_path = os.path.join(base_path, split)
        for file_name in os.listdir(folder_path):
            if file_name.endswith(('jpg', 'png', 'jpeg')):  # Ensure it's an image file
                # Extract label from the file name
                label = file_name.rsplit('-', 1)[0]  # Get the part before the last hyphen

                # Load the image, resize, and flatten pixel values
                image_path = os.path.join(folder_path, file_name)
                image = Image.open(image_path).convert('RGB')  # Ensure RGB format
                image = image.resize((224, 224))  # Resize to a smaller resolution
                pixel_values = np.array(image).flatten()  # Flatten pixel values into a 1D array

                # Write a row to the CSV file
                writer.writerow([split, label] + pixel_values.tolist())

print(f"CSV file created successfully at: {output_path}")


CSV file created successfully at: E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset.csv


In [1]:
import os
import csv
from PIL import Image
import numpy as np

# Paths to dataset folders
base_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat-Dataset\images'
folders = ['train', 'test', 'val']  # The three splits

# Output CSV file
output_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_optimized.csv'

# Image resolution for resizing
resize_resolution = (64, 64)  # Reduce resolution to 64x64

# Initialize the CSV file and write headers
with open(output_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['split', 'label'] + [f'pixel_{i}' for i in range(3 * resize_resolution[0] * resize_resolution[1])])  # Adjust resolution

# Process each folder
    for split in folders:
        folder_path = os.path.join(base_path, split)
        for file_name in os.listdir(folder_path):
            if file_name.endswith(('jpg', 'png', 'jpeg')):  # Ensure it's an image file
                # Extract label from the file name
                label = file_name.rsplit('-', 1)[0]  # Get the part before the last hyphen

                # Load the image, resize, normalize, and flatten pixel values
                image_path = os.path.join(folder_path, file_name)
                image = Image.open(image_path).convert('RGB')  # Ensure RGB format
                image = image.resize(resize_resolution)  # Resize to smaller resolution
                pixel_values = (np.array(image) / 255.0).flatten().astype(np.float32)  # Normalize to 0-1

                # Write a row to the CSV file
                writer.writerow([split, label] + pixel_values.tolist())

print(f"Optimized CSV file created successfully at: {output_path}")


Optimized CSV file created successfully at: E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_optimized.csv


In [2]:
import pandas as pd

# Input and output file paths
input_csv_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_optimized.csv'
output_csv_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_updated.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(input_csv_path)

# Function to clean and update labels
def clean_label(label):
    # Remove unwanted characters or numbers
    label = ''.join(filter(str.isalpha, label))  # Keep only alphabetic characters
    
    # Replace specific patterns
    label = label.replace('v', 'garbage patch')  # Replace 'v' with 'garbage patch'
    label = label.replace('bb', 'plastic bottles')  # Replace 'bb' with 'plastic bottles'
    label = label.replace('ylg', 'container')  # Replace 'ylg' with 'container'
    label = label.replace('.jpg', '')  # Remove '.jpg' from the label
    
    return label

# Apply the cleaning function to the label column
df['label'] = df['label'].apply(clean_label)

# Save the updated DataFrame to a new CSV file
df.to_csv(output_csv_path, index=False)

print(f"Updated CSV file saved at: {output_csv_path}")


Updated CSV file saved at: E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_updated.csv


In [4]:
#Final label update

import pandas as pd

# Input and output file paths
input_csv_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_updated.csv'
output_csv_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_final.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(input_csv_path)

# Function to clean the labels further
def refine_label(label):
    # Remove 'jpg' at the end, if present
    if label.endswith('jpg'):
        label = label[:-3]
    
    # Remove 'a' at the beginning, if present
    if label.startswith('A'):
        label = label[1:]
    
    return label.strip()  # Remove any extra spaces

# Apply the refinement function to the label column
df['label'] = df['label'].apply(refine_label)

# Save the updated DataFrame to a new CSV file
df.to_csv(output_csv_path, index=False)

print(f"Final updated CSV file saved at: {output_csv_path}")


Final updated CSV file saved at: E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_final.csv


Pre-Processing the dataset

In [6]:
#Pre-processed dataset

import pandas as pd

# Load the dataset
file_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_final.csv'  # Update with the path to your dataset
df = pd.read_csv(file_path)

# 1. Check for null values
print("Checking for null values...")
null_counts = df.isnull().sum()
print(null_counts)

# Handle null values
if null_counts.any():
    print("Handling null values...")
    # Option 1: Drop rows with null values
    df = df.dropna()

    # Option 2: Fill null values (uncomment if needed)
    # df['split'] = df['split'].fillna('unknown')  # Replace with default value for 'split'
    # df['label'] = df['label'].fillna('unknown')  # Replace with default value for 'label'
    # df.iloc[:, 2:] = df.iloc[:, 2:].fillna(0)  # Fill missing pixel values with 0

# 2. Check for duplicate rows
print("Checking for duplicate rows...")
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Remove duplicate rows if any
if duplicate_count > 0:
    print("Removing duplicate rows...")
    df = df.drop_duplicates()

# 3. Check for inconsistent or unexpected values in 'split' and 'label'
print("Checking for unexpected values in 'split' and 'label' columns...")
print("Unique values in 'split':", df['split'].unique())
print("Unique values in 'label':", df['label'].unique())

# 4. Normalize 'split' column (Optional: standardize split names)
valid_splits = ['train', 'test', 'val']
df = df[df['split'].isin(valid_splits)]  # Remove rows with invalid 'split' values

# 5. Pixel value checks (ensure all are numeric)
print("Checking pixel values...")
pixel_columns = df.columns[2:]  # All pixel columns
non_numeric_pixels = df[pixel_columns].applymap(lambda x: not isinstance(x, (int, float))).sum().sum()
print(f"Non-numeric pixel values: {non_numeric_pixels}")

# Convert all pixel values to numeric, coercing errors to NaN
df[pixel_columns] = df[pixel_columns].apply(pd.to_numeric, errors='coerce')

# Handle any resulting NaN pixel values (if coercion introduced NaN values)
df[pixel_columns] = df[pixel_columns].fillna(0)

# 6. Save the cleaned and preprocessed dataset
output_file_path = r'E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_preprocessed.csv'
df.to_csv(output_file_path, index=False)
print(f"Preprocessed dataset saved at: {output_file_path}")


Checking for null values...
split          0
label          0
pixel_0        0
pixel_1        0
pixel_2        0
              ..
pixel_12283    0
pixel_12284    0
pixel_12285    0
pixel_12286    0
pixel_12287    0
Length: 12290, dtype: int64
Checking for duplicate rows...
Number of duplicate rows: 3
Removing duplicate rows...
Checking for unexpected values in 'split' and 'label' columns...
Unique values in 'split': ['train' 'test' 'val']
Unique values in 'label': ['bag' 'deadfish' 'xdeadfish' 'xinbag' 'plastic bottles' 'fish'
 'garbage patch' 'waterhyacinth' 'container']
Checking pixel values...


  non_numeric_pixels = df[pixel_columns].applymap(lambda x: not isinstance(x, (int, float))).sum().sum()


Non-numeric pixel values: 0
Preprocessed dataset saved at: E:\3rd Year NOTES\A.I. & M.L\HYDRO_FLOAT\HydroFloat_dataset_preprocessed.csv
