# Setup: Generate Sample Dataset

This cell creates the required folder structure (`../data/raw/` and `../data/processed/`, i.e. the project's `data/` folder one level above the notebook) and generates the sample CSV dataset with missing values.
It saves the dataset to `../data/raw/sample_data.csv` so it is ready for the cleaning functions; if that file already exists, it is left untouched.

In [30]:
import os
import pandas as pd
import numpy as np

# Locations of the raw and processed data folders, relative to this notebook
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Make sure both folders are present (a no-op when they already exist)
for folder in (raw_dir, processed_dir):
    os.makedirs(folder, exist_ok=True)

# Sample records, one list per column; np.nan marks a deliberately missing value
data = dict(
    age=[34, 45, 29, 50, 38, np.nan, 41],
    income=[55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    score=[0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    zipcode=['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    city=['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    extra_data=[np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan],
)
df = pd.DataFrame(data)

# Write the sample CSV only when it is not already on disk, so that an
# existing file is never overwritten
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if os.path.exists(csv_path):
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')
else:
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')

File already exists at ../data/raw/sample_data.csv. Skipping CSV creation to avoid overwrite.


# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [31]:
import pandas as pd
import sys
# Add the parent directory to the module search path so that the `src`
# package can be imported below (presumably the project root sits one level
# above this notebook — confirm). This must run before the `src` import.
sys.path.append('..')
# Project-local cleaning helpers applied in the "Apply Cleaning Functions" section.
from src import cleaning

## Kaggle Dataset Download
- Source: Kaggle API via `kagglehub`
- Target: Credit Card Fraud Detection dataset (mlg-ulb/creditcardfraud)
- Notes: Download the latest version; the dataset contains 284,807 transactions (~285K) with 31 columns (30 features plus the `Class` fraud label) for fraud classification.


## Load Raw Dataset

In [32]:
# Read the raw sample dataset from the location the setup cell writes to.
# ZIP codes are identifiers, not quantities: read them as strings so pandas
# does not infer an integer column and silently strip leading zeros
# (e.g. '02134' -> 2134). The setup cell also defines them as strings.
df = pd.read_csv('../data/raw/sample_data.csv', dtype={'zipcode': str})
df.head(100)

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,
5,,,0.65,12345,Unknown,5.0
6,41.0,49000.0,0.79,94105,San Francisco,


## Apply Cleaning Functions

In [33]:
# Cleaning pipeline: each step takes the current frame and returns the next one.
cleaning_steps = [
    # age, income and score contain NaN values, so fill them with the column median
    lambda frame: cleaning.fill_missing_median(frame, ['age', 'income', 'score']),
    # applied to the frame as a whole rather than to any specific column
    lambda frame: cleaning.drop_missing(frame, threshold=0.5),
    # age and income are both numeric, but age cannot be properly normalized
    # without further modifications and assumptions, so only income is scaled
    lambda frame: cleaning.normalize_data(frame, ['income']),
]

# Show the frame before cleaning, then again after every step
print(df.head(100))
for apply_step in cleaning_steps:
    df = apply_step(df)
    print(df.head(100))


    age   income  score  zipcode           city  extra_data
0  34.0  55000.0   0.82    90210        Beverly         NaN
1  45.0      NaN   0.91    10001       New York        42.0
2  29.0  42000.0    NaN    60614        Chicago         NaN
3  50.0  58000.0   0.76    94103             SF         NaN
4  38.0      NaN   0.88    73301         Austin         NaN
5   NaN      NaN   0.65    12345        Unknown         5.0
6  41.0  49000.0   0.79    94105  San Francisco         NaN
    age   income  score  zipcode           city  extra_data
0  34.0  55000.0  0.820    90210        Beverly         NaN
1  45.0  52000.0  0.910    10001       New York        42.0
2  29.0  42000.0  0.805    60614        Chicago         NaN
3  50.0  58000.0  0.760    94103             SF         NaN
4  38.0  52000.0  0.880    73301         Austin         NaN
5  39.5  52000.0  0.650    12345        Unknown         5.0
6  41.0  49000.0  0.790    94105  San Francisco         NaN
    age   income  score  zipcode        

## Save Cleaned Dataset

In [34]:
# Persist the cleaned frame, then read it back to confirm what was written.
cleaned_csv_path = '../data/processed/sample_data_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
df2 = pd.read_csv(cleaned_csv_path)
df2.head(100)

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,0.8125,0.82,90210,Beverly,
1,45.0,0.625,0.91,10001,New York,42.0
2,29.0,0.0,0.805,60614,Chicago,
3,50.0,1.0,0.76,94103,SF,
4,38.0,0.625,0.88,73301,Austin,
5,39.5,0.625,0.65,12345,Unknown,5.0
6,41.0,0.4375,0.79,94105,San Francisco,


## Compare Datasets

In [None]:
# Reload the raw and cleaned datasets from disk and show, for each one,
# its first rows followed by its shape.
df1 = pd.read_csv('../data/raw/sample_data.csv')
df2 = pd.read_csv('../data/processed/sample_data_cleaned.csv')
for frame in (df1, df2):
    print(frame.head(100))
    print(frame.shape)
# The shapes match because the 0.5 threshold used during cleaning dropped no rows.

    age   income  score  zipcode           city  extra_data
0  34.0  55000.0   0.82    90210        Beverly         NaN
1  45.0      NaN   0.91    10001       New York        42.0
2  29.0  42000.0    NaN    60614        Chicago         NaN
3  50.0  58000.0   0.76    94103             SF         NaN
4  38.0      NaN   0.88    73301         Austin         NaN
5   NaN      NaN   0.65    12345        Unknown         5.0
6  41.0  49000.0   0.79    94105  San Francisco         NaN
(7, 6)
    age  income  score  zipcode           city  extra_data
0  34.0  0.8125  0.820    90210        Beverly         NaN
1  45.0  0.6250  0.910    10001       New York        42.0
2  29.0  0.0000  0.805    60614        Chicago         NaN
3  50.0  1.0000  0.760    94103             SF         NaN
4  38.0  0.6250  0.880    73301         Austin         NaN
5  39.5  0.6250  0.650    12345        Unknown         5.0
6  41.0  0.4375  0.790    94105  San Francisco         NaN
(7, 6)
