In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def fill_missing_median(df, columns=None):
    """Return a copy of ``df`` whose missing values are replaced by column medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied up front and never modified.
    columns : iterable of column labels, optional
        Columns to impute. When omitted, every numeric column of ``df``
        (``np.number`` dtypes) is imputed.

    Returns
    -------
    pandas.DataFrame
        New frame in which each selected column has its missing entries
        filled with that column's own median.
    """
    result = df.copy()
    targets = df.select_dtypes(include=np.number).columns if columns is None else columns
    for name in targets:
        series = result[name]
        # The median is taken from the working copy, one column at a time.
        result[name] = series.fillna(series.median())
    return result

def drop_missing(df, columns=None, threshold=None):
    """Return a copy of ``df`` with rows containing missing values removed.

    Exactly one strategy is applied, chosen in this order:

    1. ``columns`` given: drop rows that have a missing value in any of
       those columns (``threshold`` is ignored in this case).
    2. ``threshold`` given: keep only rows that have at least
       ``int(threshold * number_of_columns)`` non-missing values.
    3. Neither given: drop every row that contains any missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : column label or list of labels, optional
        Passed to ``DataFrame.dropna`` as ``subset``.
    threshold : float, optional
        Fraction of the column count used to derive ``thresh``.

    Returns
    -------
    pandas.DataFrame
    """
    dropna_kwargs = {}
    if columns is not None:
        dropna_kwargs['subset'] = columns
    elif threshold is not None:
        n_columns = df.shape[1]
        # int() truncates, so a fractional requirement is rounded down.
        dropna_kwargs['thresh'] = int(threshold * n_columns)
    return df.copy().dropna(**dropna_kwargs)

def normalize_data(df, columns=None, method='minmax'):
    """Return a copy of ``df`` with the selected columns rescaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied up front and never modified.
    columns : list-like of column labels, optional
        Columns to scale. When omitted, every numeric column
        (``np.number`` dtypes) is scaled.
    method : str, default 'minmax'
        ``'minmax'`` selects ``MinMaxScaler``; any other value selects
        ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns replaced by their scaled
        values. If there is nothing to scale (no selected columns, or the
        frame has no rows) the copy is returned unchanged.
    """
    df_copy = df.copy()
    if columns is None:
        columns = df_copy.select_dtypes(include=np.number).columns
    # The scalers reject input with zero features or zero samples, so an
    # empty selection or an empty frame is passed through untouched
    # instead of raising from inside fit_transform.
    if len(columns) == 0 or df_copy.empty:
        return df_copy
    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()
    df_copy[columns] = scaler.fit_transform(df_copy[columns])
    return df_copy

In [14]:
# Data folders, expressed relative to this notebook
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Make sure both folders are present before anything is written
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Sample records; several numeric fields contain NaN entries
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan],
}

# Build the DataFrame from the sample records
df = pd.DataFrame(data)

# Write the raw CSV only when it is not already on disk, so an existing
# file is never overwritten
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if os.path.exists(csv_path):
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')
else:
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')


Sample dataset created and saved to ../data/raw/sample_data.csv


In [17]:
# Display the sample DataFrame in full (7 rows) to check the raw values and NaNs
df

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,
5,,,0.65,12345,Unknown,5.0
6,41.0,49000.0,0.79,94105,San Francisco,


In [15]:
# Clean the data.
# Order matters: rows are screened for completeness on the raw values first.
# Imputing before this step would fill every numeric NaN, so the threshold
# would be judged on imputed values instead of on what was actually observed.
# Each helper returns a new frame, so `df` itself is left untouched.
df_cleaned = (
    df
    .pipe(drop_missing, threshold=0.5)  # keep rows with >= int(0.5 * n_columns) non-null values
    .pipe(fill_missing_median)          # impute remaining numeric NaNs with column medians
    .pipe(normalize_data)               # min-max scale the numeric columns (default method)
)

# Inspect
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         7 non-null      float64
 1   income      7 non-null      float64
 2   score       7 non-null      float64
 3   zipcode     7 non-null      object 
 4   city        7 non-null      object 
 5   extra_data  7 non-null      float64
dtypes: float64(4), object(2)
memory usage: 464.0+ bytes


In [16]:
# Preview the first five rows of the cleaned frame (numeric columns are scaled, text columns unchanged)
df_cleaned.head()

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,0.238095,0.8125,0.653846,90210,Beverly,0.5
1,0.761905,0.625,1.0,10001,New York,1.0
2,0.0,0.0,0.596154,60614,Chicago,0.5
3,1.0,1.0,0.423077,94103,SF,0.5
4,0.428571,0.625,0.884615,73301,Austin,0.5


In [18]:
# Save the cleaned data under the processed-data folder defined earlier in
# this notebook. A path relative to the notebook keeps the export portable
# across machines and avoids embedding a user-specific absolute path.
cleaned_csv_path = os.path.join(processed_dir, 'combined_cleaned_data.csv')
df_cleaned.to_csv(cleaned_csv_path, index=False)