In [16]:
import cv2
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Directory holding the image files named in the CSV's 'Image Index' column
# (relative to the notebook's working directory).
folder_path = "images-224/images-224"

In [3]:
# Load the label table; it provides the 'Image Index' and 'Cardiomegaly' columns.
df = pd.read_csv(filepath_or_buffer="cardiomegaly.csv")

In [4]:
# Peek at the first five label rows.
df.head(n=5)

Unnamed: 0,Image Index,Cardiomegaly
0,00004445_001.png,1
1,00013593_002.png,0
2,00001301_042.png,1
3,00016433_000.png,0
4,00007718_008.png,0


In [5]:
# Size check of the label table: (number of rows, number of columns).
df.shape

(5544, 2)

In [None]:
# Build df3: one row per image that could be read, holding its 224x224
# grayscale pixels scaled to [0, 1] (columns "<row>x<col>", 1-based) followed
# by a trailing 'filename' column.
image_names = df['Image Index'].tolist()

# Row-major column names; this is the same order ndarray.ravel() yields below.
pixel_cols = [f"{row+1}x{col+1}" for row in range(224) for col in range(224)]

batch_size = 256   # images per intermediate DataFrame; adjust depending on RAM
batches = []       # intermediate DataFrames, concatenated once at the end
data = []          # flattened pixel vectors of the batch currently being filled
batch_names = []   # filenames aligned one-to-one with `data`


def _batch_to_frame(rows, names):
    """Return a DataFrame with one row per image: pixel columns, then 'filename'.

    rows  -- list of 1-D float arrays, each of length len(pixel_cols)
    names -- list of filenames, same length and order as `rows`
    """
    # Stacking into one 2-D array avoids creating a 50,176-key dict of Python
    # floats per image, which is what made the dict-based version slow and
    # memory-hungry.
    frame = pd.DataFrame(np.vstack(rows), columns=pixel_cols)
    frame['filename'] = names
    return frame


for i, filename in enumerate(image_names, start=1):
    img_path = os.path.join(folder_path, filename)
    if not os.path.exists(img_path):
        print(f"Not found: {filename}")
    else:
        img = cv2.imread(img_path, 0)  # flag 0 = load as grayscale
        if img is None:
            # The file exists but OpenCV could not decode it.
            print(f"Failed to load: {filename}")
        else:
            img = cv2.resize(img, (224, 224))
            # Normalize 8-bit intensities to [0, 1] (float64) and flatten.
            data.append((img / 255.0).ravel())
            batch_names.append(filename)

    # When the batch is full, convert it to a DataFrame and start a new one so
    # the raw arrays can be garbage-collected.
    if len(data) >= batch_size:
        batches.append(_batch_to_frame(data, batch_names))
        data = []
        batch_names = []
        # One progress line per batch instead of one line per image.
        print(f"Processed {i}/{len(image_names)} files")

# Flush the final, partially filled batch.
if len(data) > 0:
    batches.append(_batch_to_frame(data, batch_names))
    data = []
    batch_names = []

# Concatenate all batches into the final df3. pd.concat raises ValueError on an
# empty list, so fall back to an empty frame with the expected columns when no
# image could be loaded at all.
if batches:
    df3 = pd.concat(batches, ignore_index=True)
else:
    df3 = pd.DataFrame(columns=pixel_cols + ['filename'])
print(f"\nTotal images processed: {len(df3)}")

In [7]:
# Every df3 column other than the 'filename' join key is a pixel feature.
pixel_columns = df3.columns[df3.columns != 'filename'].tolist()

# Left join on the image name: every label row is kept, and rows whose image
# has no entry in df3 end up with NaN in all pixel columns.
df = df.merge(
    df3.loc[:, ['filename', *pixel_columns]],
    how='left',
    left_on='Image Index',
    right_on='filename',
)


In [8]:
# 'filename' duplicates 'Image Index' after the merge, so remove it.
df = df.drop('filename', axis='columns')

In [9]:
# Display df in its current state (pandas truncates the rendering of large frames).
df

Unnamed: 0,Image Index,Cardiomegaly,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,...,224x215,224x216,224x217,224x218,224x219,224x220,224x221,224x222,224x223,224x224
0,00004445_001.png,1,0.537255,0.541176,0.552941,0.568627,0.611765,0.674510,0.729412,0.749020,...,0.772549,0.780392,0.796078,0.803922,0.807843,0.784314,0.823529,0.811765,0.780392,0.784314
1,00013593_002.png,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,00001301_042.png,1,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,...,0.517647,0.521569,0.529412,0.529412,0.529412,0.525490,0.513725,0.501961,0.490196,0.447059
3,00016433_000.png,0,0.870588,0.698039,0.560784,0.470588,0.400000,0.337255,0.325490,0.329412,...,0.070588,0.066667,0.066667,0.066667,0.066667,0.066667,0.070588,0.078431,0.133333,0.231373
4,00007718_008.png,0,0.047059,0.047059,0.047059,0.047059,0.047059,0.050980,0.047059,0.047059,...,0.592157,0.611765,0.631373,0.639216,0.650980,0.666667,0.670588,0.682353,0.690196,0.639216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5539,00020434_006.png,1,0.247059,0.160784,0.101961,0.050980,0.011765,0.000000,0.000000,0.000000,...,0.207843,0.180392,0.152941,0.137255,0.094118,0.011765,0.003922,0.039216,0.101961,0.196078
5540,00000116_013.png,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.156863,0.196078,0.047059,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5541,00030437_001.png,1,,,,,,,,,...,,,,,,,,,,
5542,00021790_000.png,1,0.737255,0.717647,0.694118,0.670588,0.650980,0.631373,0.619608,0.611765,...,0.023529,0.023529,0.023529,0.023529,0.023529,0.023529,0.027451,0.027451,0.027451,0.043137


In [10]:
# Count missing values per column.
df.isnull().sum(axis=0)

Image Index      0
Cardiomegaly     0
1x1             97
1x2             97
1x3             97
                ..
224x220         97
224x221         97
224x222         97
224x223         97
224x224         97
Length: 50178, dtype: int64

In [11]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [12]:
# Re-count missing values per column to check the current state of df.
df.isnull().sum(axis=0)

Image Index     0
Cardiomegaly    0
1x1             0
1x2             0
1x3             0
               ..
224x220         0
224x221         0
224x222         0
224x223         0
224x224         0
Length: 50178, dtype: int64

In [13]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated(keep='first').sum()

np.int64(0)

In [14]:
# Remove the image-name column from df.
df = df.drop(columns=['Image Index'])

In [15]:
# Display df in its current state (pandas truncates the rendering of large frames).
df

Unnamed: 0,Cardiomegaly,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,224x215,224x216,224x217,224x218,224x219,224x220,224x221,224x222,224x223,224x224
0,1,0.537255,0.541176,0.552941,0.568627,0.611765,0.674510,0.729412,0.749020,0.752941,...,0.772549,0.780392,0.796078,0.803922,0.807843,0.784314,0.823529,0.811765,0.780392,0.784314
1,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,1,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,...,0.517647,0.521569,0.529412,0.529412,0.529412,0.525490,0.513725,0.501961,0.490196,0.447059
3,0,0.870588,0.698039,0.560784,0.470588,0.400000,0.337255,0.325490,0.329412,0.337255,...,0.070588,0.066667,0.066667,0.066667,0.066667,0.066667,0.070588,0.078431,0.133333,0.231373
4,0,0.047059,0.047059,0.047059,0.047059,0.047059,0.050980,0.047059,0.047059,0.047059,...,0.592157,0.611765,0.631373,0.639216,0.650980,0.666667,0.670588,0.682353,0.690196,0.639216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5538,1,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,...,0.039216,0.007843,0.007843,0.007843,0.007843,0.007843,0.007843,0.007843,0.007843,0.007843
5539,1,0.247059,0.160784,0.101961,0.050980,0.011765,0.000000,0.000000,0.000000,0.000000,...,0.207843,0.180392,0.152941,0.137255,0.094118,0.011765,0.003922,0.039216,0.101961,0.196078
5540,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.156863,0.196078,0.047059,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5542,1,0.737255,0.717647,0.694118,0.670588,0.650980,0.631373,0.619608,0.611765,0.600000,...,0.023529,0.023529,0.023529,0.023529,0.023529,0.023529,0.027451,0.027451,0.027451,0.043137


In [18]:
# Write df to disk as CSV; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="cardiomegaly-part2.csv", index=False)