# Setup: Generate Sample Dataset

This cell creates the required folder structure (`../data/raw/` and `../data/processed/`, i.e. one level above the notebook's working directory) and builds a small sample dataset that deliberately contains missing values.

The dataset is saved to `../data/raw/sample_data.csv` so that it is ready for the cleaning functions. If that file already exists, it is left untouched and CSV creation is skipped.

In [None]:
import os
import pandas as pd
import numpy as np

# Folder layout: raw inputs and processed outputs live under ../data,
# resolved against the kernel's current working directory.
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Create both folders up front; exist_ok=True keeps the cell safe to re-run.
for folder in (raw_dir, processed_dir):
    os.makedirs(folder, exist_ok=True)

# Short alias so the deliberately injected gaps stand out in the table below.
NAN = np.nan

# Seven sample rows; the numeric columns carry NaN gaps for the cleaning step.
data = {
    'gold_price': [2650.30, 2645.80, NAN, 2672.10, 2668.50, NAN, 2655.90],
    'volume': [15000, NAN, 12800, 18500, NAN, NAN, 14200],
    'volatility': [0.025, 0.032, NAN, 0.019, 0.028, 0.021, 0.024],
    'market': ['COMEX', 'LBMA', 'COMEX', 'LBMA', 'COMEX', 'LBMA', 'COMEX'],
    'currency': ['USD'] * 7,
    'extra_data': [NAN, 42, NAN, NAN, NAN, 5, NAN],
}

df = pd.DataFrame(data)

# Persist the sample only when no file is present, so an existing CSV is
# never overwritten by re-running this cell.
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if os.path.exists(csv_path):
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')
else:
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')

# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [None]:
import pandas as pd
from src import cleaning

## Load Raw Dataset

In [None]:
# Load the raw sample dataset. The path is relative (the same ../data/raw
# location the setup cell writes to), so the notebook is not tied to one
# machine's absolute drive layout.
raw_csv_path = '../data/raw/sample_data.csv'
df = pd.read_csv(raw_csv_path)

# Preview the first rows as the cell's rich output.
df.head()

## Apply Cleaning Functions

In [None]:
# Columns handed to both the median-fill and the normalization helpers.
numeric_cols = ['gold_price', 'volume', 'volatility']

# Report the state of the raw frame before any cleaning is applied.
print("Original data shape:", df.shape)
print("Missing values per column:")
print(df.isna().sum())
print()

# Cleaning pipeline, one named stage per helper call:
#   median-fill -> drop_missing(threshold=0.5) -> normalize.
# NOTE(review): the exact semantics of each helper live in src/cleaning and
# are not visible here — confirm there (e.g. what the 0.5 threshold applies to).
df_filled = cleaning.fill_missing_median(df, numeric_cols)
df_pruned = cleaning.drop_missing(df_filled, threshold=0.5)
df_cleaned = cleaning.normalize_data(df_pruned, numeric_cols)

# Report the same summary for the cleaned frame, then preview it.
print("Cleaned data shape:", df_cleaned.shape)
print("Missing values per column after cleaning:")
print(df_cleaned.isna().sum())
df_cleaned.head()

## Save Cleaned Dataset

In [None]:
# Write the cleaned frame to the project's processed-data folder. A single
# relative path variable feeds both the write and the message, so the
# reported location always matches where the file actually went.
cleaned_csv_path = '../data/processed/sample_data_cleaned.csv'
df_cleaned.to_csv(cleaned_csv_path, index=False)
print(f'Cleaned dataset saved to {cleaned_csv_path}')