In [128]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [129]:
# import model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

### Load Data

In [130]:
# load data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

### Data Preprocessing

In [131]:
# cat column
cat_col = train.select_dtypes(include=['object']).columns

# NA column
na_col = train.columns[train.isna().sum() > 0]

# NA column with meaning (from the description)
na_col_meaning = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"]

# NA column meaning - all NA col
na_col_no_meaning = [col for col in na_col if col not in na_col_meaning]

In [132]:
# drop Utilities column (only 1 value)
train.drop(columns=["Utilities"], inplace=True)
test.drop(columns=["Utilities"], inplace=True)

In [133]:
# by hand split categorical columns into 3 types
ordinal_cols = ["LotShape","LandContour","LandSlope","HouseStyle","ExterQual","ExterCond","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","HeatingQC","KitchenQual","Functional","FireplaceQu","GarageFinish","GarageQual","GarageCond","PavedDrive","PoolQC","Fence"]
cat_cols = ["Street","Alley","CentralAir","BldgType","SaleType","SaleCondition","MSSubClass","MSZoning","Neighborhood","Condition1","Condition2","LotConfig","RoofStyle","RoofMatl","Exterior1st","Exterior2nd","MasVnrType","Foundation","Heating","Electrical","GarageType","MiscFeature"]

threshold = 4 
one_hot_cols = []
freq_cols = []
for col in cat_cols:
    if col in train.columns and train[col].nunique() <= threshold:
        one_hot_cols.append(col)
    else:
        freq_cols.append(col)

In [134]:
categorical_na = ['MasVnrType', 'Electrical']
from sklearn.impute import SimpleImputer
if categorical_na:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    train[categorical_na] = cat_imputer.fit_transform(train[categorical_na])
    test[categorical_na] = cat_imputer.transform(test[categorical_na])

In [135]:
from sklearn.preprocessing import OrdinalEncoder

# define categories in order (best to worst, NA last if present)
ordinal_categories = [
    ['Reg', 'IR1', 'IR2', 'IR3'],  # LotShape
    ['Lvl', 'Bnk', 'HLS', 'Low'],  # LandContour
    # ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'],  # Utilities
    ['Gtl', 'Mod', 'Sev'],  # LandSlope
    ['1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'],  # HouseStyle
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # ExterQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # ExterCond
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],  # BsmtQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],  # BsmtCond
    ['Gd', 'Av', 'Mn', 'No', 'NA'],  # BsmtExposure
    ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'],  # BsmtFinType1
    ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'],  # BsmtFinType2
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # HeatingQC
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # KitchenQual
    ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],  # Functional
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],  # FireplaceQu
    ['Fin', 'RFn', 'Unf', 'NA'],  # GarageFinish
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],  # GarageQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],  # GarageCond
    ['Y', 'P', 'N'],  # PavedDrive
    ['Ex', 'Gd', 'TA', 'Fa', 'NA'],  # PoolQC
    ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA']  # Fence
]

# fit encoder on train (handle NA by filling with 'NA' first)
for col in ordinal_cols:
    if col in train.columns:
        cat_list = ordinal_categories[ordinal_cols.index(col)]
        if 'NA' in cat_list:
            train[col] = train[col].fillna('NA')
            test[col] = test[col].fillna('NA')
        else:
            mode = train[col].mode()[0]
            train[col] = train[col].fillna(mode)
            test[col] = test[col].fillna(mode)

encoder = OrdinalEncoder(categories=ordinal_categories)
train[ordinal_cols] = encoder.fit_transform(train[ordinal_cols])
test[ordinal_cols] = encoder.transform(test[ordinal_cols])

In [136]:
train = pd.get_dummies(train, columns=one_hot_cols, drop_first=True)
test = pd.get_dummies(test, columns=one_hot_cols, drop_first=True)

missing_cols = set(train.columns) - set(test.columns)
for col in missing_cols:
    test[col] = 0
extra_cols = set(test.columns) - set(train.columns)
test = test.drop(columns=extra_cols)
test = test[train.columns]

# align test columns to train
missing_cols = set(train.columns) - set(test.columns)
for col in missing_cols:
    test[col] = 0
extra_cols = set(test.columns) - set(train.columns)
test = test.drop(columns=extra_cols)

# frequency encode frequency_cols
for col in freq_cols:
    if col in train.columns:
        freq_map = train[col].value_counts(normalize=True)
        train[col + '_freq'] = train[col].map(freq_map)
        test[col + '_freq'] = test[col].map(freq_map).fillna(0)
        train.drop(columns=[col], inplace=True)
        test.drop(columns=[col], inplace=True)

# example: check shape
print("train shape after encoding:", train.shape)
print("test shape after encoding:", test.shape)

train shape after encoding: (1460, 83)
test shape after encoding: (1459, 83)


In [137]:
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer(n_neighbors=5)  # adjust k as needed
train = knn_imputer.fit_transform(train)
test = knn_imputer.transform(test)