In [6]:
# Cell 0 — Environment Setup

import os
from pathlib import Path

import numpy as np
import pandas as pd

# Set file paths
# Both paths hang off a single homework directory. It defaults to the original
# author's location; set the HOMEWORK6_DIR environment variable to run the
# notebook from another machine or checkout without editing this cell.
HOMEWORK_DIR = Path(
    os.environ.get("HOMEWORK6_DIR")
    or "/Users/seanice/Desktop/Bootcamp/bootcamp_Linxi_Pan/homework/homework6"
)

# Kept as plain strings so the later cells can use them exactly as before.
raw_path = str(HOMEWORK_DIR / "raw" / "util_test.csv")
processed_path = str(HOMEWORK_DIR / "processed" / "cleaned.csv")


In [8]:
# Cell 1 — Load Raw Data

# Read the raw CSV into a DataFrame
df_raw = pd.read_csv(raw_path)

# Sanity checks: dimensions, then the missing-value count of each column
print("Shape:", df_raw.shape)
print("\nMissing values per column:\n", df_raw.isnull().sum())

# Preview the first five rows (the default for head)
df_raw.head(5)



Shape: (10, 3)

Missing values per column:
 category    0
value       0
date        0
dtype: int64


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [9]:
# Cell 2 — Define simple cleaning functions
import pandas as pd
import numpy as np

def fill_missing_median(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace missing values in every numeric column with that column's median.

    Columns that are not numeric are passed through untouched. The input
    frame is copied first, so the caller's DataFrame is never modified.
    """
    filled = df.copy()
    for name in filled.select_dtypes(include=[np.number]).columns:
        series = filled[name]
        # median() ignores NaN, so it reflects only the observed values
        filled[name] = series.fillna(series.median())
    return filled

def drop_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove every row that has at least one missing value.

    This is the strictest policy: a single NaN in any column is enough
    for the whole row to be discarded. The input frame is not modified.
    """
    complete_rows = df.dropna(how="any", axis="index")
    return complete_rows

def normalize_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Min-Max normalize numeric columns to [0, 1].

    - Non-numeric columns are kept as-is.
    - NaNs are ignored when computing min/max and remain NaN in the output.
    - A column with no valid values (all-NaN or empty) is left untouched.
    - If a column is constant (max == min), its valid values become 0.0 to
      avoid division by zero; missing entries stay missing instead of being
      silently turned into 0.0.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is copied, not modified in place.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the numeric columns rescaled.
    """
    out = df.copy()
    num_cols = out.select_dtypes(include=[np.number]).columns
    for c in num_cols:
        col = out[c]
        # min()/max() skip NaN, so they are NaN only when the column has
        # no valid values at all; such a column cannot be scaled.
        mn, mx = col.min(), col.max()
        if pd.isna(mn) or pd.isna(mx):
            continue
        if mx > mn:
            out[c] = (col - mn) / (mx - mn)
        else:
            # Constant column: zero the observed values but keep NaNs as NaN
            # so that missing data is not disguised as a real measurement.
            out[c] = np.where(col.isna(), np.nan, 0.0)
    return out


In [10]:
# Cell 3 — Apply fill_missing_median() and save

import os
from pathlib import Path

# Create the output folder (and any missing parents) if it is not there yet
os.makedirs(Path(processed_path).parent, exist_ok=True)

# 1) Snapshot of the raw frame before cleaning
print("Before — shape:", df_raw.shape)
print("Before — NA counts per column:\n", df_raw.isnull().sum())

# 2) Impute numeric NaNs with each column's median
df_fillmed = fill_missing_median(df_raw)

# 3) Same statistics after cleaning
print("\nAfter (fill median) — shape:", df_fillmed.shape)
print("After (fill median) — NA counts per column:\n", df_fillmed.isnull().sum())

# 4) Write the result into the same folder as the configured processed file
out_fillmed = str(Path(processed_path).parent / "cleaned_fillmedian.csv")
df_fillmed.to_csv(out_fillmed, index=False)
print("\nSaved to:", out_fillmed)

# Preview the first five rows
df_fillmed.head(5)


Before — shape: (10, 3)
Before — NA counts per column:
 category    0
value       0
date        0
dtype: int64

After (fill median) — shape: (10, 3)
After (fill median) — NA counts per column:
 category    0
value       0
date        0
dtype: int64

Saved to: /Users/seanice/Desktop/Bootcamp/bootcamp_Linxi_Pan/homework/homework6/processed/cleaned_fillmedian.csv


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [11]:
# Cell 4 — Apply drop_missing() and save

from pathlib import Path

# 1) Raw frame: dimensions and overall number of missing cells
print("Before — shape:", df_raw.shape)
print("Before — total NA:", int(df_raw.isnull().to_numpy().sum()))

# 2) Keep only the rows that have no missing value at all
df_dropna = drop_missing(df_raw)

# 3) Same statistics on the reduced frame
print("\nAfter (drop rows with any NaN) — shape:", df_dropna.shape)
print("After — total NA:", int(df_dropna.isnull().to_numpy().sum()))

# 4) Write the result into the same folder as the configured processed file
out_dropna = str(Path(processed_path).parent / "cleaned_dropna.csv")
df_dropna.to_csv(out_dropna, index=False)
print("\nSaved to:", out_dropna)

# Preview the first five rows
df_dropna.head(5)


Before — shape: (10, 3)
Before — total NA: 0

After (drop rows with any NaN) — shape: (10, 3)
After — total NA: 0

Saved to: /Users/seanice/Desktop/Bootcamp/bootcamp_Linxi_Pan/homework/homework6/processed/cleaned_dropna.csv


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [12]:
# Cell 5 — Apply normalize_data() and save

from pathlib import Path

# 1) Scale the median-filled frame rather than the raw one (safer for scaling)
print("Input for normalization — shape:", df_fillmed.shape)

# 2) Rescale every numeric column to the [0, 1] range
df_norm = normalize_data(df_fillmed)

# 3) Report the numeric columns (at most ten names, then an ellipsis)
num_cols = list(df_norm.select_dtypes(include=[np.number]).columns)
print("Numeric columns normalized:", num_cols[:10], "" if len(num_cols) <= 10 else "...")
print("After (normalize) — shape:", df_norm.shape)

# 4) Write the result into the same folder as the configured processed file
out_norm = str(Path(processed_path).parent / "cleaned_normalized.csv")
df_norm.to_csv(out_norm, index=False)
print("\nSaved to:", out_norm)

# Preview the first five rows
df_norm.head(5)


Input for normalization — shape: (10, 3)
Numeric columns normalized: ['value'] 
After (normalize) — shape: (10, 3)

Saved to: /Users/seanice/Desktop/Bootcamp/bootcamp_Linxi_Pan/homework/homework6/processed/cleaned_normalized.csv


Unnamed: 0,category,value,date
0,A,0.0,2025-08-01
1,B,0.25,2025-08-02
2,A,0.1,2025-08-03
3,B,0.4,2025-08-04
4,C,0.75,2025-08-05


## Data Cleaning Documentation

### Cleaning Functions
- **fill_missing_median()**  
  Filled NaN values in numeric columns with the median.  
  Advantage: keeps all rows, but may distort distribution slightly.  

- **drop_missing()**  
  Dropped rows with any missing values.  
  Advantage: simple, no imputation bias.  
  Risk: may lose too many rows if dataset has lots of NaNs.  

- **normalize_data()**  
  Min-max normalization of numeric columns to [0, 1].  
  Advantage: keeps scale consistent for modeling.  
  Assumption: features are comparable after scaling.  

### Output Files
- `cleaned_fillmedian.csv`  
- `cleaned_dropna.csv`  
- `cleaned_normalized.csv`  

Saved in: the `processed/` folder of this homework (the directory that contains `processed_path`, next to `raw/`).  

### Assumptions & Tradeoffs
- Median filling is reasonable for skewed numeric data.  
- Dropping rows reduces sample size; it avoids imputation bias, but can itself bias results if values are not missing at random.  
- Normalization assumes linear scaling is sufficient; other methods (z-score, robust scaling) could be used.  
- Categorical columns were not touched.  
