In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def fill_missing_median(df, columns=None):
    """Return a copy of ``df`` with missing values replaced by column medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left untouched.
    columns : iterable of column labels, optional
        Columns to fill. When omitted, every numeric column is processed.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each selected column has its missing entries
        replaced by that column's own median.
    """
    filled = df.copy()
    targets = (
        df.select_dtypes(include=np.number).columns
        if columns is None
        else columns
    )
    for name in targets:
        series = filled[name]
        # The median is taken per column, from the values present in it.
        filled[name] = series.fillna(series.median())
    return filled

def drop_missing(df, columns=None, threshold=None):
    """Return a new frame with rows containing missing values removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : label or list of labels, optional
        When given, only these columns are inspected and a row is dropped if
        any of them is missing. Takes precedence over ``threshold``, which is
        ignored in that case.
    threshold : float, optional
        Fraction of the frame's columns that must be non-null for a row to be
        kept. The required count is ``threshold * n_columns`` rounded down.
        Only consulted when ``columns`` is None.

    Returns
    -------
    pandas.DataFrame
        With neither ``columns`` nor ``threshold``, every row containing at
        least one missing value is dropped.
    """
    # dropna() already returns a new frame, so no defensive copy is needed.
    if columns is not None:
        return df.dropna(subset=columns)
    if threshold is not None:
        # Round away binary floating-point noise before truncating: e.g.
        # 0.29 * 100 evaluates to 28.999999999999996, which a bare int()
        # would turn into 28 instead of the intended 29.
        min_non_null = int(round(threshold * df.shape[1], 9))
        return df.dropna(thresh=min_non_null)
    return df.dropna()

def normalize_data(df, columns=None, method='minmax'):
    """Return a copy of ``df`` with the selected columns rescaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : label or list of labels, optional
        Columns to scale. Defaults to every numeric column. A single column
        name may be passed as a plain string.
    method : str, default 'minmax'
        ``'minmax'`` selects scikit-learn's ``MinMaxScaler``; any other value
        falls back to ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        A new frame whose selected columns hold the scaler's output. When no
        columns are selected the copy is returned unchanged.
    """
    df_copy = df.copy()
    if columns is None:
        columns = df_copy.select_dtypes(include=np.number).columns
    elif isinstance(columns, str):
        # A bare string would select a 1-D Series, which the scalers reject;
        # wrap it so a single-column frame is passed instead.
        columns = [columns]

    # The scalers raise on input with zero features, so a frame without any
    # column to scale is returned as-is instead of failing.
    if np.size(columns) == 0:
        return df_copy

    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()
    df_copy[columns] = scaler.fit_transform(df_copy[columns])
    return df_copy

In [3]:
# Load the starter dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine with this exact directory layout.
raw_csv_path = '/Users/billqiu/bootcamp_Zhiang_Qiu/homework/homework3/data/starter_data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [4]:
# Cleaning pipeline: median-fill the numeric columns, drop rows that fall
# below the 0.5 non-null threshold, then rescale the numeric columns.
df_cleaned = (
    df.copy()
    .pipe(fill_missing_median)
    .pipe(drop_missing, threshold=0.5)
    .pipe(normalize_data)
)

# Inspect dtypes and non-null counts of the result
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  10 non-null     object 
 1   value     10 non-null     float64
 2   date      10 non-null     object 
dtypes: float64(1), object(2)
memory usage: 368.0+ bytes


In [5]:
# Preview the first rows of the cleaned frame (shown as the cell's output).
df_cleaned.head()

Unnamed: 0,category,value,date
0,A,0.0,2025-08-01
1,B,0.25,2025-08-02
2,A,0.1,2025-08-03
3,B,0.4,2025-08-04
4,C,0.75,2025-08-05


In [6]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): this is an absolute, machine-specific output path; the target
# directory must already exist on the machine running the notebook.
cleaned_csv_path = '/Users/billqiu/bootcamp_Zhiang_Qiu/project/data/processed/combined_cleaned_data.csv'
df_cleaned.to_csv(cleaned_csv_path, index=False)