# Data cleaning notebook

In [75]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [38]:
# Read the raw train and test splits from the project's Raw data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/Raw/train.csv", "../Data/Raw/test.csv")
)

## Train dataset (the data the model is trained on)

In [18]:
# Preview the first rows of the raw training frame to sanity-check the load.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [58]:
# Split the training frame into the feature matrix and the target column.
# `drop` returns a new frame, so `train` itself is left unchanged.
X = train.drop(columns=["SalePrice"])
y = train.loc[:, "SalePrice"]

In [59]:
# Partition the feature columns by dtype.
# Selecting with include/exclude "number" (instead of listing int64/float64 and
# object explicitly) puts every column in exactly one group. With the explicit
# lists, a column of any other dtype (int32, nullable, category, pandas string
# dtype, ...) would belong to neither group and would then be dropped silently
# by the ColumnTransformer built from these lists (its default is
# remainder="drop").
# Note: bool columns are not "number" and therefore land in `cat_cols`.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

print("Numeric columns:", len(num_cols))
print("Categorical columns:", len(cat_cols))

Numeric columns: 37
Categorical columns: 43


In [64]:
# Missing values
# Per-column null counts on the training frame, largest first (top 20).
null_counts = train.isna().sum()
null_counts.sort_values(ascending=False).iloc[:20]

LotFrontage     259
GarageYrBlt      81
MasVnrArea        8
Id                0
MSSubClass        0
Street            0
Alley             0
MSZoning          0
LotArea           0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
LotShape          0
LandContour       0
HouseStyle        0
BldgType          0
YearBuilt         0
dtype: int64

## Cleaning Columns with KNNImputer and OneHotEncoder

In [65]:
# Numeric features: standardise first, then KNN-impute.
# KNNImputer chooses neighbours by a NaN-aware Euclidean distance, so imputing
# on raw values lets large-magnitude columns dominate the neighbour search.
# StandardScaler ignores NaNs when fitting and passes them through on
# transform, so it can safely run ahead of the imputer. The output is still
# scaled and fully imputed; only the order of the two steps changed, and the
# step names are kept so `named_steps` look-ups continue to work.
numeric_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("imputer", KNNImputer(n_neighbors=5, weights="distance")),
])

# Categorical features: a missing value becomes its own "None" category, then
# each column is one-hot encoded. handle_unknown="ignore" encodes categories
# that were not seen during fit as all-zero rows instead of raising.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its pipeline. The output columns follow the
# order of `transformers`: numeric block first, one-hot block second.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

In [66]:
# Fit the preprocessing statistics on the training features only, then
# apply the fitted transformer to both splits.
preprocessor.fit(X)
X_train_processed = preprocessor.transform(X)
X_test_processed = preprocessor.transform(test)

In [87]:

# Get feature names
# The order mirrors the ColumnTransformer output: numeric columns first, then
# the one-hot columns reported by the fitted encoder.
num_features = list(num_cols)

ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
cat_features = list(ohe.get_feature_names_out(cat_cols))

feature_names = num_features + cat_features


def _to_dense(matrix):
    """Return ``matrix`` unchanged, or its ``toarray()`` result if it has one."""
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


# Convert to DataFrames
X_train_df = pd.DataFrame(_to_dense(X_train_processed), columns=feature_names)
X_test_df = pd.DataFrame(_to_dense(X_test_processed), columns=feature_names)

# Add back target to train (useful for later)
train_processed = X_train_df.copy()
train_processed["SalePrice"] = y.values

# Save
# Create the output folder first so a fresh checkout (where it may not exist
# yet) does not fail inside to_csv.
Path("../Data/Processed").mkdir(parents=True, exist_ok=True)
train_processed.to_csv("../Data/Processed/train_ohe_knn.csv", index=False)
X_test_df.to_csv("../Data/Processed/test_ohe_knn.csv", index=False)

In [73]:
# Dimensions of the transformed training matrix.
X_train_processed.shape

(1460, 304)

In [74]:
# Missing cells left after preprocessing: count per column, then total.
nulls_per_column = X_train_df.isna().sum()
nulls_per_column.sum()  # -> No more null values

np.int64(0)

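What this pipeline does:

- Numeric NaNs are imputed with KNN (each missing value is estimated from the 5 nearest rows), and the numeric columns are standardized.

- Categorical NaNs are filled with the label "None", so "missing" becomes its own category.

- Categorical columns are one-hot encoded.

- Train and test end up with the same columns because both go through the same preprocessor, which is fitted on train only; categories that appear only in test are ignored.

In short: missing numbers are estimated, missing categories are labeled, text is converted to numbers, and train and test share one feature layout.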

In [76]:
# Count one-hot columns by original feature
# The text before the first "_" is the source column name. A name without an
# underscore is kept whole, so such columns each count once.
feature_prefixes = [name.partition("_")[0] for name in X_train_df.columns]
prefix_counts = Counter(feature_prefixes)

prefix_counts.most_common(10)

[('Neighborhood', 25),
 ('Exterior2nd', 16),
 ('Exterior1st', 15),
 ('Condition1', 9),
 ('SaleType', 9),
 ('Condition2', 8),
 ('HouseStyle', 8),
 ('RoofMatl', 8),
 ('BsmtFinType1', 7),
 ('BsmtFinType2', 7)]

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

## Saving Processed Data without OneHotEncoder

In [None]:
# Separate target
y = train["SalePrice"]
X = train.drop(columns=["SalePrice"])

# Split columns by dtype. include/exclude "number" guarantees every column is
# in exactly one group, which the column re-ordering further down relies on
# (a column in neither group would raise a KeyError there).
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

# Label missing categories as "None".
# `X` was created just above, so it can be updated directly. `test` is the raw
# frame loaded at the top of the notebook, so work on a copy: the raw data
# stays intact and this cell can be re-run without side effects on other cells.
X[cat_cols] = X[cat_cols].fillna("None")
test_filled = test.copy()
test_filled[cat_cols] = test_filled[cat_cols].fillna("None")

# KNN impute numeric columns (fit on train, transform test)
# NOTE(review): neighbours are searched on unscaled values here, so
# large-magnitude columns dominate the distance - consider scaling before
# imputing if imputation quality matters for this export.
imputer = KNNImputer(n_neighbors=5)
X_num_imputed = imputer.fit_transform(X[num_cols])
test_num_imputed = imputer.transform(test_filled[num_cols])

# Wrap the imputed arrays back into frames that keep the original labels.
X_num_df = pd.DataFrame(X_num_imputed, columns=num_cols, index=X.index)
test_num_df = pd.DataFrame(test_num_imputed, columns=num_cols, index=test_filled.index)

X_processed = pd.concat([X_num_df, X[cat_cols]], axis=1)
test_processed = pd.concat([test_num_df, test_filled[cat_cols]], axis=1)

# Restore the original column order of each split.
X_processed = X_processed[X.columns]
test_processed = test_processed[test_filled.columns]

# Save
train_knn_clean = X_processed.copy()
train_knn_clean["SalePrice"] = y.values

# Create the output folder first so a fresh checkout (where it may not exist
# yet) does not fail inside to_csv.
Path("../Data/Processed").mkdir(parents=True, exist_ok=True)
train_knn_clean.to_csv("../Data/Processed/train_knn_clean.csv", index=False)
test_processed.to_csv("../Data/Processed/test_knn_clean.csv", index=False)