In [20]:
import h5py
import numpy as np

def _report_numeric_checks(key, data_sample):
    """Print NaN/infinity counts, the value range and key-specific checks for a numeric sample."""
    # Check for NaN values
    nan_count = np.isnan(data_sample).sum()
    if nan_count > 0:
        print(f"WARNING: {nan_count} NaN values found in the sample!")
    else:
        print("No NaN values found in the sample.")

    # Check for infinity values
    inf_count = np.isinf(data_sample).sum()
    if inf_count > 0:
        print(f"WARNING: {inf_count} infinity values found in the sample!")
    else:
        print("No infinity values found in the sample.")

    # Check data range (np.min / np.max propagate NaN, so a NaN sample yields NaN bounds)
    data_min = np.min(data_sample)
    data_max = np.max(data_sample)
    print(f"Data range: [{data_min}, {data_max}]")

    if key in ('inputs', 'evals'):
        # Phrased as "not (inside the range)" so that NaN bounds are reported as a
        # violation; `nan < 0 or nan > 1` would be False and silently pass.
        if not (data_min >= 0 and data_max <= 1):
            print(f"WARNING: '{key}' contains values outside the expected [0, 1] range!")
        else:
            print(f"'{key}' values are within the expected [0, 1] range.")

    elif key == 'moves':
        # One-hot rows need at least 2 dimensions, contain only 0/1 entries and sum to 1.
        # The 0/1 test rejects rows such as [0.5, 0.5] that merely sum to 1.
        is_one_hot = (
            data_sample.ndim >= 2
            and np.isin(data_sample, (0, 1)).all()
            and np.allclose(np.sum(data_sample, axis=1), 1)
        )
        if not is_one_hot:
            print("WARNING: 'moves' may not be properly one-hot encoded!")
        else:
            print("'moves' appears to be properly one-hot encoded.")

    # Check for constant values: every sampled entry equals the first one.
    # A single-entry sample is trivially "constant", so it is not reported.
    if len(data_sample) > 1 and np.all(data_sample == data_sample[0]):
        print(f"WARNING: All values in '{key}' are constant!")


def inspect_hdf5_file(file_path, sample_size=1000):
    """
    Print a sanity report for every top-level entry of an HDF5 file.

    For each array dataset the report covers shape, dtype and, on the first
    ``sample_size`` entries along axis 0: NaN count, infinity count, value range,
    a [0, 1] range check for the keys 'inputs' and 'evals', a one-hot check for
    the key 'moves', a constant-value check, and a short data preview.

    Entries that expose no shape (groups, empty dataspaces) are skipped, empty
    datasets skip the value checks, and non-numeric dtypes only get the preview.

    :param file_path: Path to the HDF5 file (opened read-only)
    :param sample_size: Maximum number of leading entries to load per dataset
    :raises ValueError: If ``sample_size`` is smaller than 1
    :return: None; the report is written to stdout
    """
    if sample_size < 1:
        raise ValueError("sample_size must be a positive integer")

    with h5py.File(file_path, 'r') as h5file:
        print("Keys in the file:", list(h5file.keys()))

        for key in h5file.keys():
            dataset = h5file[key]
            print(f"\nInspecting dataset: {key}")

            # Groups have no `shape`; datasets with an empty dataspace report None.
            shape = getattr(dataset, "shape", None)
            if shape is None:
                print("Skipping: not an array dataset (group or empty dataspace).")
                continue

            print(f"Shape: {shape}")
            print(f"Data type: {dataset.dtype}")

            # Load a sample of the data (first `sample_size` items or all if less)
            if len(shape) == 0:
                # Scalar dataset: there is no axis 0 to slice along
                data_sample = np.atleast_1d(dataset[()])
            else:
                data_sample = np.asarray(dataset[:min(sample_size, shape[0])])

            if data_sample.size == 0:
                print("WARNING: dataset is empty; skipping value checks.")
                continue

            # bool / signed int / unsigned int / float support isnan, min and max
            if data_sample.dtype.kind in "biuf":
                _report_numeric_checks(key, data_sample)
            else:
                print(f"Skipping numeric checks for non-numeric dtype {data_sample.dtype}.")

            # Print a small sample of the data
            print(f"Sample data from '{key}':")
            if data_sample.ndim == 1:
                print(data_sample[:5])
            else:
                print(data_sample[:2])

# Example usage: run the inspector on one HDF5 file.
# The path is relative, so it is resolved against the kernel's current working directory.
# NOTE(review): this is the same file name that the merge_h5_files example in this
# notebook writes as its output — presumably that cell has to be run first; confirm.
h5_file_path = r"Chess_dataset_final10_eval.h5"
inspect_hdf5_file(h5_file_path)

Keys in the file: ['evals', 'inputs']

Inspecting dataset: evals
Shape: (99086,)
Data type: float64
No NaN values found in the sample.
No infinity values found in the sample.
Data range: [0.0005527786369235996, 0.9994472213630764]
'evals' values are within the expected [0, 1] range.
Sample data from 'evals':
[0.52061331 0.52560259 0.47065875 0.4725277  0.47626785]

Inspecting dataset: inputs
Shape: (99086, 8, 8, 19)
Data type: float32
No NaN values found in the sample.
No infinity values found in the sample.
Data range: [0.0, 1.0]
'inputs' values are within the expected [0, 1] range.
Sample data from 'inputs':
[[[[0. 0. 0. ... 1. 0. 0.]
   [0. 1. 0. ... 1. 0. 0.]
   [0. 0. 1. ... 1. 0. 0.]
   ...
   [0. 0. 1. ... 1. 0. 0.]
   [0. 1. 0. ... 1. 0. 0.]
   [0. 0. 0. ... 1. 0. 0.]]

  [[1. 0. 0. ... 1. 0. 0.]
   [1. 0. 0. ... 1. 0. 0.]
   [1. 0. 0. ... 1. 0. 0.]
   ...
   [1. 0. 0. ... 1. 0. 0.]
   [1. 0. 0. ... 1. 0. 0.]
   [1. 0. 0. ... 1. 0. 0.]]

  [[0. 0. 0. ... 1. 0. 0.]
   [0. 0. 0. 

In [3]:
import h5py
import numpy as np

def augment_chess_dataset(input_file, output_file):
    """
    Augment chess dataset by flipping and rotating the input data.
    The original and augmented data are saved into a new HDF5 file.

    Every top-level dataset of the input file is copied unchanged. In addition,
    'augmented_inputs' and 'augmented_moves' are written (gzip-compressed). For
    each sample they hold six consecutive entries, in this order: original,
    horizontal flip, vertical flip, rotation by 90, 180 and 270 degrees. Each
    entry is paired with that sample's unmodified 'moves' row.

    The input is validated before the output file is created, so a bad input
    does not leave a truncated output file behind.

    :param input_file: Path to the input HDF5 file (must contain 'inputs' and 'moves')
    :param output_file: Path to the output HDF5 file (overwritten if it exists)
    :raises KeyError: If 'inputs' or 'moves' is missing from the input file
    :raises ValueError: If 'inputs' is not a stack of square boards, or if
        'inputs' and 'moves' have a different number of samples
    """
    with h5py.File(input_file, 'r') as h5_in:
        missing_keys = [key for key in ('inputs', 'moves') if key not in h5_in]
        if missing_keys:
            raise KeyError(f"Key(s) {missing_keys} not found in the input HDF5 file!")

        inputs = h5_in['inputs'][:]
        moves = h5_in['moves'][:]  # Preserve moves (labels)

        # Axis 0 is the sample axis; axes 1 and 2 are the board axes that get
        # flipped/rotated. Rotations by 90/270 degrees only keep the shape for
        # square boards, which is required to stack all variants together.
        if inputs.ndim < 3 or inputs.shape[1] != inputs.shape[2]:
            raise ValueError(
                f"'inputs' must have shape (N, S, S, ...) with square boards, got {inputs.shape}"
            )
        if len(moves) != len(inputs):
            raise ValueError(
                f"'inputs' has {len(inputs)} samples but 'moves' has {len(moves)}"
            )

        print(f"Original inputs shape: {inputs.shape}")

        # NOTE(review): the labels are reused unchanged for the transformed boards.
        # If 'moves' encodes board coordinates, the flipped/rotated samples are
        # presumably paired with labels that no longer match — confirm this is intended.
        variants = (
            inputs,                               # Original
            np.flip(inputs, axis=2),              # Horizontal flip
            np.flip(inputs, axis=1),              # Vertical flip
            np.rot90(inputs, k=1, axes=(1, 2)),   # Rotate 90 degrees
            np.rot90(inputs, k=2, axes=(1, 2)),   # Rotate 180 degrees
            np.rot90(inputs, k=3, axes=(1, 2)),   # Rotate 270 degrees
        )

        # Stacking on a new axis 1 and merging it into axis 0 keeps the variants
        # of each sample adjacent: s0-orig, s0-hflip, ..., s0-rot270, s1-orig, ...
        augmented_shape = (inputs.shape[0] * len(variants),) + inputs.shape[1:]
        augmented_inputs = np.stack(variants, axis=1).reshape(augmented_shape)
        augmented_moves = np.repeat(moves, len(variants), axis=0)

        with h5py.File(output_file, 'w') as h5_out:
            # Copy original datasets to the output file
            for key in h5_in.keys():
                h5_out.create_dataset(key, data=h5_in[key])

            # Save augmented datasets
            h5_out.create_dataset('augmented_inputs', data=augmented_inputs, compression="gzip")
            h5_out.create_dataset('augmented_moves', data=augmented_moves, compression="gzip")

        print(f"Augmented inputs shape: {augmented_inputs.shape}")
        print("Augmented dataset saved successfully.")

# Example usage: augment one dataset file and write the result next to it.
# NOTE(review): both paths are absolute Windows paths on drive D:, so this cell only
# runs on a machine with that directory layout — consider a configurable data directory.
# The input file must contain the 'inputs' and 'moves' datasets read by augment_chess_dataset.
input_file_path = r"D:\Aarti\Dataset\Chess_dataset4.h5"
output_file_path = r"D:\Aarti\Dataset\Chess_data_augmented3.h5"

augment_chess_dataset(input_file_path, output_file_path)


Original inputs shape: (6804, 8, 8, 19)


KeyboardInterrupt: 

In [14]:
import h5py
import numpy as np

def process_and_save_hdf5(input_file_path, output_file_path):
    """
    Copy the 'inputs', 'moves' and 'evals' datasets of one HDF5 file into a new file.

    The three datasets are read fully into memory and written to
    ``output_file_path`` (overwritten if present). 'inputs' is stored under the
    name 'augmented_inputs' without modification; 'moves' and 'evals' keep
    their names.

    Raises KeyError naming the first of the three keys that is missing from the
    input file; in that case the output file is not opened.
    """
    with h5py.File(input_file_path, 'r') as source:
        # Find the first required key that the input does not provide
        first_missing = next(
            (name for name in ('inputs', 'moves', 'evals') if name not in source),
            None,
        )
        if first_missing is not None:
            raise KeyError(f"Key '{first_missing}' not found in the input HDF5 file!")

        # Output dataset name -> data, in write order. 'inputs' is passed through
        # as-is; this is the place to plug in a real transformation if one is needed.
        payload = {
            'augmented_inputs': source['inputs'][:],
            'moves': source['moves'][:],
            'evals': source['evals'][:],
        }

        # Write every entry of the payload into a fresh HDF5 file
        with h5py.File(output_file_path, 'w') as sink:
            for dataset_name, values in payload.items():
                sink.create_dataset(dataset_name, data=values)

    print(f"Processed data saved to {output_file_path}")

# Example usage: extract the required datasets from one file into a new file.
# The input path is the same file that the augment_chess_dataset example in this notebook
# writes; process_and_save_hdf5 requires it to contain 'inputs', 'moves' and 'evals'.
# NOTE(review): both paths are absolute Windows paths on drive D:, so this cell only
# runs on a machine with that directory layout.
input_h5_file_path = r"D:\Aarti\Dataset\Chess_data_augmented3.h5"
output_h5_file_path = r"D:\Aarti\Dataset\Processed_Chess_Data3.h5"

process_and_save_hdf5(input_h5_file_path, output_h5_file_path)


Processed data saved to D:\Aarti\Dataset\Processed_Chess_Data3.h5


In [19]:
import h5py
import numpy as np

def merge_h5_files(file1, file2, output_file, keys=('inputs', 'evals')):
    """
    Concatenate same-named datasets of two HDF5 files into a new HDF5 file.

    For every name in ``keys`` the dataset from ``file1`` is followed by the
    dataset from ``file2`` along axis 0, and the result is written under the
    same name to ``output_file`` (overwritten if present). Both inputs are
    validated before the output file is created.

    :param file1: Path to the first input HDF5 file
    :param file2: Path to the second input HDF5 file
    :param output_file: Path of the merged HDF5 file to write
    :param keys: Names of the datasets to merge; defaults to 'inputs' and 'evals'
    :raises KeyError: If one of ``keys`` is missing from either input file
    :raises ValueError: If a dataset's shape beyond axis 0 differs between the files
    """
    with h5py.File(file1, 'r') as hf1, h5py.File(file2, 'r') as hf2:
        # Fail early, naming the offending file, before anything is read or written
        for path, handle in ((file1, hf1), (file2, hf2)):
            missing_keys = [key for key in keys if key not in handle]
            if missing_keys:
                raise KeyError(f"Key(s) {missing_keys} not found in '{path}'")

        merged = {}
        for key in keys:
            # Read datasets from both files
            first = hf1[key][:]
            second = hf2[key][:]
            if first.shape[1:] != second.shape[1:]:
                raise ValueError(
                    f"Cannot merge '{key}': shapes {first.shape} and {second.shape} "
                    "differ beyond the first axis"
                )
            # Concatenate datasets
            merged[key] = np.concatenate((first, second), axis=0)

        # The merged datasets are normally parallel arrays (one row per sample), so
        # differing row counts usually mean the inputs were inconsistent.
        row_counts = {key: len(values) for key, values in merged.items()}
        if len(set(row_counts.values())) > 1:
            print(f"WARNING: merged datasets have different lengths: {row_counts}")

        # Write merged data to new HDF5 file
        with h5py.File(output_file, 'w') as hf_out:
            for key, values in merged.items():
                hf_out.create_dataset(key, data=values)

    print(f"Merged data saved to {output_file}")

# Example usage
# The raw-string prefix already keeps backslashes literal, so they must not be doubled
# (doubling them puts two backslashes between every path component).
file1 = r'D:\Aarti\Dataset\Chess_dataset18_eval.h5'  # Replace with your first HDF5 file
file2 = r'Chess_dataset_final9_eval.h5'  # Replace with your second HDF5 file
output_file = 'Chess_dataset_final10_eval.h5'  # Output file for merged data

merge_h5_files(file1, file2, output_file)


Merged data saved to Chess_dataset_final10_eval.h5
