# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path

# Treat the parent of the kernel's working directory as the project root and
# put it first on sys.path so the project-local `src` package can be imported.
# NOTE(review): this assumes the notebook is launched from a direct subfolder
# of the project (e.g. notebooks/) — confirm, since it depends on the launch directory.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from src import cleaning



# --- Locate the project folders ---
# DIR is the parent of the kernel's working directory (the same location as
# project_root), even though the printed label below says "CWD".
DIR = Path.cwd().parent
print("CWD:", DIR)

# data/raw and data/processed sit directly under DIR; create whichever is missing.
RAW_DIR = DIR.joinpath("data", "raw")
PROC_DIR = DIR.joinpath("data", "processed")
for _data_dir in (RAW_DIR, PROC_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

csv_path = RAW_DIR.joinpath("sample_data.csv")
print("Will write to:", csv_path)

# Sample records; np.nan marks the missing values in the numeric columns.
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}
df = pd.DataFrame(data)

# to_csv replaces any existing file, so re-running this cell is safe.
print("Writing to:", csv_path.resolve())
df.to_csv(csv_path, index=False)
print("Wrote:", csv_path.resolve())

CWD: /Users/r.z/bootcamp_runze_zhu/project
Will write to: /Users/r.z/bootcamp_runze_zhu/project/data/raw/sample_data.csv
Writing to: /Users/r.z/bootcamp_runze_zhu/project/data/raw/sample_data.csv
Wrote: /Users/r.z/bootcamp_runze_zhu/project/data/raw/sample_data.csv


## Load Raw Dataset

In [3]:
# Load the raw sample file written above. zipcode is an identifier, not a
# quantity: read it as text so it keeps its original string form (and any
# leading zeros) instead of being inferred as an integer column.
df = pd.read_csv(csv_path, dtype={"zipcode": str})
df.head()

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,


## Apply Cleaning Functions

In [4]:
# Cleaning parameters, named so they are easy to spot and tune.
MISSING_THRESHOLD = 0.67
MEDIAN_FILL_COLUMNS = ['age', 'income', 'score', 'extra_data']

# Run the two cleaning steps from src/cleaning.py in order; each .pipe call
# passes the current frame as the first positional argument.
# NOTE(review): the semantics of drop_missing's threshold are defined in
# src/cleaning.py and are not visible here — confirm them. If it drops sparse
# columns, 'extra_data' (missing in 5 of 7 rows) may no longer exist when
# fill_missing_median runs.
df = (
    df
    .pipe(cleaning.drop_missing, threshold=MISSING_THRESHOLD)
    .pipe(cleaning.fill_missing_median, columns=MEDIAN_FILL_COLUMNS)
)

## Save Cleaned Dataset

In [5]:
# Save the cleaned frame under data/processed. Use a dedicated name for the
# output path: rebinding csv_path here would make the "Load Raw Dataset" cell
# read the cleaned file if it were re-run later in the same kernel session.
cleaned_csv_path = PROC_DIR / "sample_data_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

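Before filling null values with the median, we should first verify that the median is a reasonable substitute for the missing values in each column — for example, by checking how skewed the column is and what share of its values are missing. In this sample, `extra_data` is missing in 5 of 7 rows, so its median rests on only two observations.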