In [1]:
import sys
import os
sys.path.append(os.path.abspath("../"))



from pathlib import Path
import pandas as pd
import tarfile
import urllib.request



def load_housing_data():
    """
    Fetch, unpack and load the housing dataset as a DataFrame.

    Work is only done when its result is missing on disk:

    * the tarball is downloaded to ``datasets/housing.tgz`` only if neither
      the extracted CSV nor the tarball is present;
    * the tarball is unpacked into ``datasets/`` only if
      ``datasets/housing/housing.csv`` is not present.

    Repeated calls therefore just re-read the CSV instead of re-extracting
    (and overwriting) it every time.

    Returns
    -------
    pd.DataFrame
        The contents of ``datasets/housing/housing.csv``.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    if not csv_path.is_file():
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # The archive comes from the network, so refuse members that would
            # escape the target directory whenever this Python version ships
            # extraction filters; older interpreters fall back to the plain call.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path=datasets_dir, filter="data")
            else:
                housing_tarball.extractall(path=datasets_dir)

    return pd.read_csv(csv_path)

# Load the housing dataset into `housing`; the loader reads
# datasets/housing/housing.csv and downloads the tarball first if it is absent.
housing = load_housing_data()

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

def load_data(path):
    """Read the CSV found at `path` and hand it back as a DataFrame."""
    frame = pd.read_csv(path)
    return frame

def _is_categorical_target(target, max_unique_for_stratify):
    """Decide whether `target` should be treated as a class label for stratification."""
    dtype_api = pd.api.types
    if (
        dtype_api.is_object_dtype(target)
        or dtype_api.is_string_dtype(target)
        or isinstance(target.dtype, pd.CategoricalDtype)
    ):
        return True
    # Low-cardinality numeric columns are treated as class labels as well.
    return len(target.unique()) < max_unique_for_stratify


def split_data(data, target_col, test_size=0.2, random_state=42, max_unique_for_stratify=20):
    """
    Split `data` into train and test sets, choosing the strategy automatically.

    A stratified split on `target_col` is used when that column looks
    categorical: object, string or ``category`` dtype, or fewer than
    `max_unique_for_stratify` distinct values. Otherwise a plain random split
    is used. The chosen strategy is printed.

    Parameters
    ----------
    data : pd.DataFrame
        The full dataset.
    target_col : str
        Column inspected to pick the strategy and used as the stratification key.
    test_size : float, default 0.2
        Passed through to the scikit-learn splitter.
    random_state : int, default 42
        Seed passed through to the scikit-learn splitter.
    max_unique_for_stratify : int, default 20
        Columns with fewer distinct values than this are stratified on even
        when their dtype is numeric.

    Returns
    -------
    train_set, test_set : pd.DataFrame
        The two partitions of `data`.
    """
    target = data[target_col]

    if _is_categorical_target(target, max_unique_for_stratify):
        print("Using Stratified Split ✅")
        splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=test_size, random_state=random_state
        )
        # n_splits=1, so the generator yields exactly one pair of index arrays.
        train_idx, test_idx = next(splitter.split(data, target))
        train_set = data.iloc[train_idx]
        test_set = data.iloc[test_idx]
    else:
        print("Using Random Split ✅")
        train_set, test_set = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

    return train_set, test_set

def save_splits(train_set, test_set, train_path, test_path):
    """Write the train partition, then the test partition, to CSV without the index."""
    outputs = ((train_set, train_path), (test_set, test_path))
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)


# Split the full dataset with the default 80/20 ratio and seed.
# NOTE(review): `total_bedrooms` is used as the split key here, whereas
# `median_house_value` is the default label column elsewhere in this
# notebook — confirm this column choice is intentional.
train_set, test_set = split_data(housing, target_col="total_bedrooms")

Using Random Split ✅


In [4]:
# Report the dimensions of both partitions, then preview the training rows.
print(f"Train set shape: {train_set.shape}")
print(f"Test set shape: {test_set.shape}")

train_set.head()



Train set shape: (16512, 10)
Test set shape: (4128, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.22,32.75,34.0,6001.0,1111.0,2654.0,1072.0,4.5878,291000.0,NEAR OCEAN
8267,-117.03,32.69,10.0,901.0,163.0,698.0,167.0,4.6648,156100.0,NEAR OCEAN
17445,-122.27,37.74,28.0,6909.0,1554.0,2974.0,1484.0,3.6875,353900.0,NEAR BAY
14265,-121.82,37.25,25.0,4021.0,634.0,2178.0,650.0,5.1663,241200.0,<1H OCEAN
2271,-115.98,33.32,8.0,240.0,46.0,63.0,24.0,1.4688,53800.0,INLAND


In [5]:
# Preview the first rows of the test partition.
test_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-122.38,40.67,10.0,2281.0,444.0,1274.0,438.0,2.212,65600.0,INLAND
3024,-118.37,33.83,35.0,1207.0,207.0,601.0,213.0,4.7308,353400.0,<1H OCEAN
15663,-117.24,32.72,39.0,3089.0,431.0,1175.0,432.0,7.5925,466700.0,NEAR OCEAN
20484,-118.44,34.05,18.0,4780.0,1192.0,1886.0,1036.0,4.4674,500001.0,<1H OCEAN
9814,-118.44,34.18,33.0,2127.0,414.0,1056.0,391.0,4.375,286100.0,<1H OCEAN


In [6]:
import pandas as pd

def separate_features_and_labels(data, target_col="median_house_value"):
    """
    Separate a dataset into its predictors and the column to predict.

    Parameters
    ----------
    data : pd.DataFrame
        The dataset (train/test set).
    target_col : str
        Name of the label column; defaults to "median_house_value".

    Returns
    -------
    X : pd.DataFrame
        Every column of `data` except `target_col`.
    y : pd.Series
        An independent copy of the `target_col` column.
    """
    features = data.drop(columns=target_col)
    return features, data[target_col].copy()


In [7]:
# NOTE: these imports rebind `split_data` and `load_housing_data`, so from
# here on the versions from the `src` package are used instead of the
# functions defined in earlier cells of this notebook.
from src.data_preparation.split_data import split_data

from src.data_preparation.download_data import load_housing_data

# Reload the full dataset (overwrites the earlier `housing`).
housing = load_housing_data()

# Split into train/test using `ocean_proximity` as the split key; this
# overwrites the `train_set` / `test_set` produced by the earlier split.
train_set, test_set = split_data(housing, target_col="ocean_proximity")


Using Stratified Split ✅


In [8]:
from src.data_preparation.data_preparation import separate_features_and_labels

# Separate predictors from the label column of the training partition.
housing_features, housing_labels = separate_features_and_labels(train_set)

# Print the feature-matrix shape and the label-vector shape, one per line.
print(housing_features.shape, housing_labels.shape, sep="\n")


(16512, 9)
(16512,)


In [9]:

from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
def handle_missing_values(data):
    """
    Fill missing values in the numeric columns with each column's median.

    Only numeric columns are processed; non-numeric columns are not part of
    the returned frame. The result keeps the row index of `data`, so it stays
    aligned with the original frame (and with any label series taken from it)
    even when that index is not 0..n-1, e.g. after a shuffled train/test split.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset that may contain missing values.

    Returns
    -------
    transformed_data : pd.DataFrame
        Imputed numeric columns, with the same column names and row index as
        the numeric part of `data`.
    imputer : SimpleImputer
        The fitted imputer, reusable on other data with the same columns.
    """
    # Select only numerical columns
    num_data = data.select_dtypes(include=[np.number])

    # Create and fit imputer
    imputer = SimpleImputer(strategy="median")
    imputer.fit(num_data)

    # transform() returns a bare array; restore both the column names and the
    # original row labels when wrapping it back into a DataFrame.
    transformed_data = pd.DataFrame(
        imputer.transform(num_data),
        columns=num_data.columns,
        index=num_data.index,
    )

    return transformed_data, imputer