## Creating Custom Dataset

Source : https://academictorrents.com/details/4b9b7e449aa732842aea1a7d4e6413f4507aea99

Customizing the online dataset to create a custom dataset for the project.
- Including the 'sex' column as the ground-truth label for Gender Classification.

- Including the 'weight' and 'height' columns for BMI Classification.

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [49]:
# Load the person table of the dataset; the file is semicolon-delimited.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (the backslash form only resolves on Windows).
df = pd.read_csv('illinois_doc_dataset/illinois_doc_dataset/csv/person.csv', sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61110 entries, 0 to 61109
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              61110 non-null  object 
 1   name                            61110 non-null  object 
 2   date_of_birth                   61096 non-null  object 
 3   weight                          60716 non-null  float64
 4   hair                            61109 non-null  object 
 5   sex                             61109 non-null  object 
 6   height                          60728 non-null  float64
 7   race                            61106 non-null  object 
 8   eyes                            61109 non-null  object 
 9   admission_date                  61109 non-null  object 
 10  projected_parole_date           33932 non-null  object 
 11  last_paroled_date               8474 non-null   object 
 12  projected_discharge_date        

In [50]:
# Keep only the identifier, the gender label and the two body measurements,
# converting them to metric units. .assign() returns a new frame, so nothing is
# written in place into a column selection of `df`.
custom_df = df[['id', 'sex', 'weight', 'height']].assign(
    weight=lambda frame: frame['weight'] * 0.453592,  # pounds -> kilograms
    height=lambda frame: frame['height'] * 0.0254,    # inches -> metres
)
custom_df.head()

Unnamed: 0,id,sex,weight,height
0,A00147,Male,83.91452,1.7018
1,A00220,Male,70.30676,1.8542
2,A00360,Male,75.749864,1.7526
3,A00367,Male,111.13004,1.8288
4,A01054,Male,75.296272,1.7018


In [51]:
# BMI = weight / height^2, computed from the unrounded metric values and only
# then rounded to two decimals together with the other numeric columns.
# NOTE: `custom_df` is rebound to the rounded frame, so re-running this cell
# recomputes BMI from rounded inputs; re-run the previous cell first to
# reproduce the exact figures.
custom_df = custom_df.assign(
    bmi=lambda frame: frame['weight'] / frame['height'] ** 2
).round(2)
custom_df.head()

Unnamed: 0,id,sex,weight,height,bmi
0,A00147,Male,83.91,1.7,28.97
1,A00220,Male,70.31,1.85,20.45
2,A00360,Male,75.75,1.75,24.66
3,A00367,Male,111.13,1.83,33.23
4,A01054,Male,75.3,1.7,26.0


In [52]:
# Persist the custom dataset as a comma-separated file (the default delimiter),
# leaving the row index out of the output.
custom_df.to_csv('custom_dataset.csv', index=False)

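# Run only this

The cell below repeats the steps above (load, unit conversion, BMI, CSV export) in a single self-contained cell.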

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Self-contained rebuild of the custom dataset: load -> convert units -> BMI -> export.
# Forward slashes keep the relative path valid on every operating system.
df = pd.read_csv('illinois_doc_dataset/illinois_doc_dataset/csv/person.csv', sep=';')

# One non-mutating chain instead of in-place eval() calls on a column selection.
custom_df = (
    df[['id', 'sex', 'weight', 'height']]
    .assign(
        weight=lambda frame: frame['weight'] * 0.453592,  # pounds -> kilograms
        height=lambda frame: frame['height'] * 0.0254,    # inches -> metres
    )
    # BMI uses the unrounded metric values; rounding happens once, at the end.
    .assign(bmi=lambda frame: frame['weight'] / frame['height'] ** 2)
    .round(2)
)

custom_df.to_csv('custom_dataset.csv', sep=',', index=False)

In [54]:
import pandas as pd
import numpy as np
import os
import cv2

# Build a small sample dataset (at most `max_images` records) that pairs each
# person's sex / weight / height / BMI with a flattened grayscale front image,
# and store it in an HDF5 file.

# Load the existing CSV DataFrame (semicolon-delimited; forward slashes keep the
# relative path portable across operating systems)
df = pd.read_csv('illinois_doc_dataset/illinois_doc_dataset/csv/person.csv', sep=';')

# Select the needed columns, clean the ids, convert units and compute BMI in one
# non-mutating chain. The ids are stripped *before* they are matched against the
# image filenames so that stray whitespace cannot hide a match.
custom_df = (
    df[['id', 'sex', 'weight', 'height']]
    .assign(
        id=lambda frame: frame['id'].str.strip(),
        weight=lambda frame: frame['weight'] * 0.453592,  # pounds -> kilograms
        height=lambda frame: frame['height'] * 0.0254,    # inches -> metres
    )
    # BMI uses the unrounded metric values; rounding happens once, at the end.
    .assign(bmi=lambda frame: frame['weight'] / frame['height'] ** 2)
    .round(2)
)

# Define the folder containing images
image_folder = "illinois_doc_dataset/illinois_doc_dataset/front/front"
image_size = (128, 128)  # Resize all images to 128x128 for consistency
max_images = 10          # Upper bound on the number of images stored

# Set lookup instead of scanning the whole id column once per file
known_ids = set(custom_df['id'])

# Parallel lists: image_ids[i] is the id of the pixels in image_data[i]
image_ids = []
image_data = []

# os.listdir() returns entries in arbitrary order; sorting makes the selected
# sample reproducible from run to run.
for filename in sorted(os.listdir(image_folder)):
    if len(image_ids) >= max_images:
        break

    if not filename.endswith((".jpg", ".png")):  # Adjust extensions as needed
        continue

    # The image ID is the filename without its extension
    image_id = os.path.splitext(filename)[0].strip()

    # Only process images that belong to a person in custom_df
    if image_id not in known_ids:
        continue

    file_path = os.path.join(image_folder, filename)
    image = cv2.imread(file_path)

    # cv2.imread signals an unreadable file by returning None
    if image is None:
        print(f"Warning: Unable to read image file {file_path}")
        continue

    # Preprocess the image; a failure skips this file but keeps the run going
    try:
        image = cv2.resize(image, image_size)  # Resize
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        continue

    # Store the image ID and the flattened (1-D) pixel data
    image_ids.append(image_id)
    image_data.append(image.flatten())

# Create a DataFrame from the image IDs and data
image_df = pd.DataFrame({
    "image_id": image_ids,
    "image_data": image_data
})

# Attach the pixels to the person rows, drop the helper key column and keep only
# the rows that actually received an image.
merged_df = (
    pd.merge(custom_df, image_df, how='left', left_on='id', right_on='image_id')
    .drop(columns=['image_id'])
    .dropna(subset=['image_data'])
)

# Save the DataFrame to an HDF5 file and report the file that was really written
output_path = 'custom_dataset_with_images_limited_10.h5'
merged_df.to_hdf(output_path, key='df', mode='w')

print(f"Final DataFrame saved to '{output_path}' with {len(merged_df)} records.")


Final DataFrame saved to 'custom_dataset_with_images_limited_10.csv' with 10 records.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['id', 'sex', 'image_data'], dtype='object')]

  merged_df.to_hdf('custom_dataset_with_images_limited_10.h5', key='df', mode='w')


In [55]:
# Reload the frame stored under key 'df' in the HDF5 file
loaded_df = pd.read_hdf('custom_dataset_with_images_limited_10.h5', key='df')

# Number of stored pixel values per record (one entry per row)
loaded_df['image_data'].map(len)

0     16384
2     16384
3     16384
4     16384
5     16384
6     16384
7     16384
8     16384
9     16384
10    16384
Name: image_data, dtype: int64

In [72]:
import pandas as pd
import numpy as np
import os
import cv2
import pickle

# Build the full dataset: person attributes plus flattened grayscale front and
# side images, saved as a pickled DataFrame.


def _load_flat_gray_images(image_folder, image_size, valid_ids, data_column):
    """Read the matching images of one folder as flattened grayscale arrays.

    Parameters
    ----------
    image_folder : str
        Directory whose entries are scanned for ``.jpg`` / ``.png`` files.
    image_size : tuple of int
        Target size handed to ``cv2.resize``.
    valid_ids : set of str
        Only files whose name without extension is in this set are loaded.
    data_column : str
        Name of the column that receives the flattened pixel arrays.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_id`` and ``data_column``; one row per image that was
        read and preprocessed successfully. Unreadable or failing files are
        reported with ``print`` and skipped.
    """
    image_ids = []
    image_rows = []

    for filename in os.listdir(image_folder):
        if not filename.endswith((".jpg", ".png")):  # Adjust extensions as needed
            continue

        # The image ID is the filename without its extension
        image_id = os.path.splitext(filename)[0].strip()
        if image_id not in valid_ids:
            continue

        file_path = os.path.join(image_folder, filename)
        image = cv2.imread(file_path)

        # cv2.imread signals an unreadable file by returning None
        if image is None:
            print(f"Warning: Unable to read image file {file_path}")
            continue

        # A preprocessing failure skips this file but keeps the run going
        try:
            image = cv2.resize(image, image_size)  # Resize
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

        image_ids.append(image_id)
        image_rows.append(image.flatten())  # Flatten the image data for storage

    return pd.DataFrame({"image_id": image_ids, data_column: image_rows})


# Load the existing CSV DataFrame (semicolon-delimited; forward slashes keep the
# relative path portable across operating systems)
df = pd.read_csv('illinois_doc_dataset/illinois_doc_dataset/csv/person.csv', sep=';')

# Select the needed columns, clean the ids, convert units and compute BMI in one
# non-mutating chain. The ids are stripped *before* they are matched against the
# image filenames so that stray whitespace cannot hide a match.
custom_df = (
    df[['id', 'sex', 'weight', 'height']]
    .assign(
        id=lambda frame: frame['id'].str.strip(),
        weight=lambda frame: frame['weight'] * 0.453592,  # pounds -> kilograms
        height=lambda frame: frame['height'] * 0.0254,    # inches -> metres
    )
    # BMI uses the unrounded metric values; rounding happens once, at the end.
    .assign(bmi=lambda frame: frame['weight'] / frame['height'] ** 2)
    .round(2)
)

image_size = (128, 128)  # Resize all images to 128x128 for consistency

# Set lookup instead of scanning the whole id column once per file
known_ids = set(custom_df['id'])

# (folder, target column) for each camera view; both views go through the same
# load -> merge -> drop pipeline.
image_sources = (
    ("illinois_doc_dataset/illinois_doc_dataset/front/front", "image_front_data"),
    ("illinois_doc_dataset/illinois_doc_dataset/side/side", "image_side_data"),
)

merged_df = custom_df
for image_folder, data_column in image_sources:
    image_df = _load_flat_gray_images(image_folder, image_size, known_ids, data_column)

    # Attach this view's pixels, drop the helper key column and keep only the
    # rows that actually received an image for this view.
    merged_df = (
        pd.merge(merged_df, image_df, how='left', left_on='id', right_on='image_id')
        .drop(columns=['image_id'])
        .dropna(subset=[data_column])
    )

# Save the DataFrame to a pickle file and report the file that was really written
output_path = 'custom_dataset_with_images.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(merged_df, f)

print(f"Final DataFrame saved to '{output_path}' with {len(merged_df)} records.")

Final DataFrame saved to 'custom_dataset_with_images_limited_10.pkl' with 60141 records.


In [None]:
# Read the DataFrame from the pickle file.
# NOTE: pickle.load can execute arbitrary code - only open files you created yourself.
with open('custom_dataset_with_images.pkl', 'rb') as f:
    loaded_df = pickle.load(f)

# Restore each flattened pixel vector to a 2D matrix of shape (128, 128)
for column in ('image_front_data', 'image_side_data'):
    loaded_df[column] = loaded_df[column].apply(
        lambda pixels: np.asarray(pixels).reshape(128, 128)
    )

# Check the shape of the first image of each view to verify
print(loaded_df['image_front_data'].iloc[0].shape)
print(loaded_df['image_side_data'].iloc[0].shape)
loaded_df.head()

Unnamed: 0,id,sex,weight,height,bmi,image_front_data,image_side_data
0,A00147,Male,83.91,1.7,28.97,"[181, 180, 180, 179, 181, 177, 180, 180, 178, ...","[154, 155, 152, 150, 153, 154, 156, 155, 154, ..."
1,A00360,Male,75.75,1.75,24.66,"[183, 181, 182, 181, 182, 180, 182, 180, 178, ...","[184, 187, 183, 183, 183, 183, 183, 183, 184, ..."
2,A00367,Male,111.13,1.83,33.23,"[190, 189, 194, 205, 199, 206, 195, 182, 206, ...","[192, 190, 202, 205, 202, 202, 202, 202, 211, ..."
3,A01054,Male,75.3,1.7,26.0,"[209, 211, 206, 211, 210, 208, 206, 211, 208, ...","[208, 211, 208, 210, 210, 209, 205, 211, 208, ..."
4,A01072,Male,88.45,1.75,28.8,"[151, 155, 152, 156, 156, 158, 156, 156, 158, ...","[140, 147, 146, 148, 145, 148, 149, 149, 154, ..."
