# Preprocessing
---
1. Convert categorical variables ('type', 'locality') using one-hot encoding

2. Convert categorical variables ('zipcode') using target encoding

3. Split data into training and testing sets (80/20 split; no separate validation set is created in this notebook)

---

In [85]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [86]:
# Read the cleaned listings, then separate the target ("price") from the predictors.
df = pd.read_csv("../data/cleaned/NY-House-Cleaned-Dataset.csv")
y = df["price"]
X = df.drop(columns="price")

In [87]:
# Columns routed to the one-hot encoder.
one_hot_encoder_features = [
    "type",
    "locality",
]
# Columns routed to the target encoder.
target_encoder_features = [
    "zipcode",
]

In [88]:
# One-hot encode the listed categoricals, dropping the first level of each
# feature so it acts as the baseline (avoids a redundant, collinear column).
#
# handle_unknown="ignore" keeps transform() from raising when a category that
# was absent at fit time shows up later (e.g. a rare locality that landed only
# in the test split, or a new value seen by the saved preprocessor). With
# drop="first" such rows are encoded as all zeros for that feature, i.e. the
# same as the dropped baseline level, and scikit-learn emits a warning.
# Requires scikit-learn >= 1.0 for the ignore + drop combination.
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore", drop="first"))
])

# Target-encode zipcode. This encoder is supervised (it is fitted with y), so
# it should only be fitted on the training split to avoid target leakage.
zipcode_transformer = Pipeline(steps=[
    ("target_enc", TargetEncoder())
])

In [89]:
# Route each column group to its encoder; every column not listed is passed
# through untouched and appended after the encoded outputs.
#
# sparse_threshold=0 forces a dense ndarray result. The one-hot step emits a
# sparse matrix, and with the default threshold (0.3) the combined output
# switches between sparse and dense depending on the density of the batch
# being transformed. Later cells build pandas DataFrames directly from the
# result, which needs a dense array every time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, one_hot_encoder_features),
        ('zip', zipcode_transformer, target_encoder_features)
        ],
    remainder='passthrough',
    sparse_threshold=0
)

In [90]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [91]:
# Fit the preprocessor on the training split only (y_train is supplied because
# the pipeline contains a target encoder), then reuse the fitted preprocessor
# on the test split so no test-set information enters the fitted encodings.
X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)



In [92]:
# Sanity check: (rows, columns) of the encoded training matrix.
X_train_transformed.shape

(3166, 14)

In [93]:
# Rebuild readable column labels for the transformed arrays. The label order
# follows the ColumnTransformer definition: one-hot outputs first, then the
# target-encoded column(s), then the passthrough remainder in its original order.
onehot_step = preprocessor.named_transformers_['cat'].named_steps['onehot']
ohe_columns = onehot_step.get_feature_names_out(one_hot_encoder_features)
te_columns = target_encoder_features

# Every input column that went through an encoder; the rest were passed through.
encoded_inputs = one_hot_encoder_features + target_encoder_features
passthrough_columns = [col for col in X_train.columns if col not in encoded_inputs]

all_columns = [*ohe_columns, *te_columns, *passthrough_columns]

# Wrap both transformed arrays in DataFrames that share the same labels.
X_train_final, X_test_final = (
    pd.DataFrame(matrix, columns=all_columns)
    for matrix in (X_train_transformed, X_test_transformed)
)

In [94]:
# Persist the fitted preprocessor so the same encodings can be re-applied later.
# The target directory is created first so the cell also works on a fresh
# checkout where ../models does not exist yet.
# (joblib and Path are imported in the imports cell at the top of the notebook.)
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(preprocessor, "../models/house_preprocessor.joblib")

['../models/house_preprocessor.joblib']

In [95]:
# Write each encoded split to CSV with the target appended as the last column.
# The target index is reset so it lines up row-for-row with the freshly built
# feature frames (which carry a default 0..n-1 index).
# The output directory is created first so the export does not fail when
# ../data/processed is missing.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

for split_name, features, target in (
    ("train", X_train_final, y_train),
    ("test", X_test_final, y_test),
):
    split_frame = pd.concat([features, target.reset_index(drop=True)], axis=1)
    split_frame.to_csv(processed_dir / f"{split_name}.csv", index=False)