In [None]:
import pandas as pd
import os

In [None]:
# Load the aligned "unhealthy" CSV and preview its first five rows.
unhealthy_csv_path = 'data/unhealthy_data_aligned.csv'
df = pd.read_csv(unhealthy_csv_path)
df.head()
# df.info()
# object_columns = df.select_dtypes(include=['object']).columns.tolist()
# print("Object columns:", object_columns)

# numeric_cols = df.select_dtypes(include=['int64']).columns
# print("Min value:", df[numeric_cols].min().min())
# print("Max value:", df[numeric_cols].max().max())

# df[numeric_cols] = df[numeric_cols].astype('uint32')
    
# df.info()

## Handle null values

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove, in place, every row that contains at least one missing value,
# then display the resulting frame.
df.dropna(axis=0, how='any', inplace=True)
df

## Removing duplicate rows

In [None]:
# Keep the first occurrence of each fully identical row; `df` itself is left unchanged.
is_repeated_row = df.duplicated(keep='first')
dropped = df[~is_repeated_row]
dropped

# Alternate code for initial preprocessing

This uses a memory-efficient, chunked approach to preprocess the healthy dataset.

It also checks for duplicate columns.


In [None]:
input_csv_path = 'data/healthy_data.csv'
output_csv_path = 'data/healthy_data_processed.csv'
chunk_size = 8000  # number of rows held in memory per chunk


def _unique_column_positions(csv_path):
    """Find which header positions hold the first occurrence of each column name.

    The header row is read with ``header=None`` so pandas returns the names
    exactly as written in the file. A normal ``pd.read_csv`` call renames
    repeated names to ``X``, ``X.1``, ... which makes duplicates impossible to
    detect from ``DataFrame.columns``.

    Returns:
        tuple[list[int], list[str]]: zero-based positions to keep, and the
        names of the columns that repeat an earlier name.
    """
    raw_names = pd.read_csv(
        csv_path, header=None, nrows=1, dtype=str, keep_default_na=False
    ).iloc[0].tolist()

    keep_positions = []
    duplicate_names = []
    seen_names = set()
    for position, name in enumerate(raw_names):
        # Blank header cells are given distinct "Unnamed: N" names by pandas,
        # so they are never treated as duplicates of each other.
        if name != "" and name in seen_names:
            duplicate_names.append(name)
            continue
        seen_names.add(name)
        keep_positions.append(position)
    return keep_positions, duplicate_names


def clean_csv_in_chunks(input_csv_path, output_csv_path, chunk_size=8000):
    """Stream a CSV through null/duplicate cleaning without loading it whole.

    For every chunk of ``chunk_size`` rows this drops columns whose header name
    repeats an earlier one, drops rows containing any null value, drops
    duplicate rows, and appends the result to ``output_csv_path``. An existing
    output file is removed first.

    Limitation: duplicate rows are only removed within a chunk; identical rows
    that fall into different chunks are both kept.

    Args:
        input_csv_path: path of the CSV to read.
        output_csv_path: path of the cleaned CSV to (re)create.
        chunk_size: number of rows to read per chunk.

    Returns:
        list | None: the column names written to the output, or ``None`` when
        the input file does not exist or yielded no chunks.
    """
    if not os.path.exists(input_csv_path):
        print(f"Error: Input file not found at {input_csv_path}")
        return None

    print(f"Starting memory-efficient processing for {input_csv_path}")

    # Remove output file if it exists to start fresh
    if os.path.exists(output_csv_path):
        os.remove(output_csv_path)

    keep_positions, duplicate_names = _unique_column_positions(input_csv_path)
    for col_name in duplicate_names:
        print(f"  Duplicate column found and will be removed: {col_name}")
    # Only restrict the columns when there is something to remove, so a file
    # without duplicates is parsed exactly as a plain read_csv would parse it.
    usecols = keep_positions if duplicate_names else None

    kept_columns = None
    is_first_chunk = True
    for chunk_df in pd.read_csv(input_csv_path, chunksize=chunk_size, usecols=usecols):
        print(f"Processing chunk with original shape: {chunk_df.shape}")
        if kept_columns is None:
            kept_columns = chunk_df.columns.tolist()

        # Non-mutating calls: each step yields a new frame, so row counts can
        # be compared between stages.
        without_nulls = chunk_df.dropna()
        cleaned = without_nulls.drop_duplicates()
        null_rows_dropped = len(chunk_df) - len(without_nulls)
        duplicate_rows_dropped = len(without_nulls) - len(cleaned)
        print(
            f"  Dropped {null_rows_dropped} rows with nulls and "
            f"{duplicate_rows_dropped} duplicate rows"
        )

        # The first chunk creates the file and writes the header; later chunks append.
        if is_first_chunk:
            cleaned.to_csv(output_csv_path, index=False, mode='w', header=True)
            is_first_chunk = False
            print(f"  Written first processed chunk to {output_csv_path}")
        else:
            cleaned.to_csv(output_csv_path, index=False, mode='a', header=False)

        print(f"  Finished processing chunk. Current shape: {cleaned.shape}")

    print(f"\nFinished processing. Processed data saved to {output_csv_path}")
    return kept_columns


processed_columns = clean_csv_in_chunks(input_csv_path, output_csv_path, chunk_size)