### Imports

In [55]:
from sklearn.feature_selection import (
    SelectKBest,
    f_regression
)

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

### Data load

In [46]:
# Locations of the raw train / test CSV files (relative to the working directory).
data_path_train = "data/train_init.csv"
data_path_test = "data/test_init.csv"

# Read both splits with the same reader, in (train, test) order.
df_train_init, df_test_init = map(pd.read_csv, (data_path_train, data_path_test))

# Quick sanity check of the loaded frame dimensions.
print("Train shape:", df_train_init.shape)
print("Test shape:", df_test_init.shape)

Train shape: (1460, 81)
Test shape: (1459, 80)


### Data inspection

In [3]:
def categorical_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Count the distinct values of every categorical column in ``df``.

    Columns are selected by dtype (``object`` or ``category``). Missing values
    are counted as a distinct value of their own (``nunique(dropna=False)``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose categorical columns should be summarised.

    Returns
    -------
    pd.DataFrame
        Two columns, ``feature`` and ``unique_values``, one row per
        categorical column, sorted by ``unique_values`` in descending order
        and re-indexed from 0.
    """
    cat_cols = df.select_dtypes(include=["object", "category"]).columns

    # rename_axis pins the label column to "feature" even when df.columns
    # carries a name (e.g. after a pivot). A bare reset_index() would use that
    # name for the column, and a rename from "index" would silently not apply.
    cardinality_df = (
        df[cat_cols]
        .nunique(dropna=False)
        .rename_axis("feature")
        .reset_index(name="unique_values")
    )

    return cardinality_df.sort_values(
        by="unique_values",
        ascending=False
    ).reset_index(drop=True)


In [35]:
# Show how many distinct values each categorical column of the raw training frame has.
categorical_cardinality(df_train_init)

Unnamed: 0,feature,unique_values
0,Neighborhood,25
1,Exterior2nd,16
2,Exterior1st,15
3,Condition1,9
4,SaleType,9
5,HouseStyle,8
6,RoofMatl,8
7,Condition2,8
8,BsmtFinType2,7
9,BsmtFinType1,7


In [21]:
def _smoothed_target_means(keys, target, prior, smoothing):
    """Return the smoothed mean of ``target`` for each distinct value of ``keys``.

    Every category mean is shrunk towards ``prior`` with weight ``smoothing``:
    ``(mean * count + prior * smoothing) / (count + smoothing)``.
    The result is a Series indexed by category value.
    """
    # Grouping the target Series directly (instead of assigning a temporary
    # "target" column) avoids clobbering a feature that happens to be named "target".
    stats = target.groupby(keys).agg(['mean', 'count'])
    return ((stats['mean'] * stats['count'] + prior * smoothing) /
            (stats['count'] + smoothing))


def encode_train_test(df_train, df_test, target_col, ohe_threshold=5, smoothing=10):
    """Encode the categorical columns of a train/test pair.

    Every ``object``/``category`` column of ``df_train`` (except ``target_col``)
    is encoded in both frames, using statistics taken from the training frame:

    * fewer than ``ohe_threshold`` distinct non-null training values:
      one-hot encoding with the first level dropped; the original column is
      replaced by ``<col>_<level>`` integer columns appended at the end;
    * otherwise: smoothed target encoding. Training rows receive out-of-fold
      values from a 5-fold split (shuffled, ``random_state=42``); test rows
      receive the encoding fitted on the whole training frame. Values that
      have no encoding (unseen or missing categories) get the global target mean.

    One progress line per encoded column is printed.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame
        Input frames; neither is modified (copies are encoded).
    target_col : str
        Name of the target column in ``df_train``.
    ohe_threshold : int, default 5
        Columns with fewer distinct values than this are one-hot encoded.
    smoothing : float, default 10
        Weight of the global mean in the smoothed target encoding.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(train_encoded, test_encoded)``.
    """
    train_encoded = df_train.copy()
    test_encoded = df_test.copy()

    y_train = train_encoded[target_col]
    # NOTE(review): this prior is computed on the full training target and is
    # also used inside each fold below, so out-of-fold encodings still see the
    # held-out targets through the prior. Kept as-is to preserve the values.
    global_mean = y_train.mean()

    cat_cols = train_encoded.select_dtypes(include=['object', 'category']).columns.tolist()
    if target_col in cat_cols:
        cat_cols.remove(target_col)

    # The split depends only on the number of rows and the fixed seed, so it is
    # built once (on first use) and shared by all target-encoded columns.
    folds = None

    for col in cat_cols:
        unique_vals = train_encoded[col].nunique()

        if unique_vals < ohe_threshold:
            print(f"OHE: {col} ({unique_vals} unique values)")
            # NOTE(review): get_dummies ignores NaN by default, so missing
            # values share the all-zero pattern of the dropped first level.
            train_dummies = pd.get_dummies(train_encoded[col], prefix=col, drop_first=True, dtype=int)
            # Build the full set of test dummies and align it to the training
            # columns. Using drop_first on the test frame would drop *test's*
            # first level, which is a real (non-baseline) category whenever the
            # training baseline does not occur in the test data.
            test_dummies = (
                pd.get_dummies(test_encoded[col], prefix=col, dtype=int)
                .reindex(columns=train_dummies.columns, fill_value=0)
            )
            train_encoded = pd.concat([train_encoded.drop(columns=[col]), train_dummies], axis=1)
            test_encoded = pd.concat([test_encoded.drop(columns=[col]), test_dummies], axis=1)

        else:
            print(f"Target Encoding: {col} ({unique_vals} unique values)")
            if folds is None:
                kf = KFold(n_splits=5, shuffle=True, random_state=42)
                folds = list(kf.split(train_encoded))

            # Raw category values; df_train is never mutated by this function.
            train_keys = df_train[col]
            cv_series = pd.Series(index=train_encoded.index, dtype=float)

            for train_idx, val_idx in folds:
                fold_te = _smoothed_target_means(
                    train_keys.iloc[train_idx], y_train.iloc[train_idx], global_mean, smoothing
                )
                # astype(float) guards against a categorical result from map();
                # to_numpy() makes the positional assignment explicit.
                cv_series.iloc[val_idx] = (
                    train_keys.iloc[val_idx]
                    .map(fold_te)
                    .astype(float)
                    .fillna(global_mean)
                    .to_numpy()
                )

            train_encoded[col] = cv_series

            # Test rows use the encoding fitted on the complete training frame.
            full_te = _smoothed_target_means(train_keys, y_train, global_mean, smoothing)
            test_encoded[col] = test_encoded[col].map(full_te).astype(float).fillna(global_mean)

    return train_encoded, test_encoded

In [47]:
# Encode the categorical columns of both raw frames, with 'SalePrice' as the target column.
df_train, df_test = encode_train_test(df_train_init, df_test_init, 'SalePrice')

Target Encoding: MSZoning (5 unique values)
OHE: Street (2 unique values)
OHE: Alley (2 unique values)
OHE: LotShape (4 unique values)
OHE: LandContour (4 unique values)
OHE: Utilities (2 unique values)
Target Encoding: LotConfig (5 unique values)
OHE: LandSlope (3 unique values)
Target Encoding: Neighborhood (25 unique values)
Target Encoding: Condition1 (9 unique values)
Target Encoding: Condition2 (8 unique values)
Target Encoding: BldgType (5 unique values)
Target Encoding: HouseStyle (8 unique values)
Target Encoding: RoofStyle (6 unique values)
Target Encoding: RoofMatl (8 unique values)
Target Encoding: Exterior1st (15 unique values)
Target Encoding: Exterior2nd (16 unique values)
OHE: MasVnrType (3 unique values)
OHE: ExterQual (4 unique values)
Target Encoding: ExterCond (5 unique values)
Target Encoding: Foundation (6 unique values)
OHE: BsmtQual (4 unique values)
OHE: BsmtCond (4 unique values)
OHE: BsmtExposure (4 unique values)
Target Encoding: BsmtFinType1 (6 unique value

In [48]:
# Per-column missing-value report for the encoded training frame:
# absolute count plus percentage of rows, most incomplete columns first.
null_summary = (
    df_train
    .isnull()
    .sum()
    .to_frame(name="missing_count")
    .assign(missing_pct=lambda counts: counts["missing_count"] / len(df_train) * 100)
    .sort_values("missing_pct", ascending=False)
)

display(null_summary)

Unnamed: 0,missing_count,missing_pct
LotFrontage,259,17.739726
GarageYrBlt,81,5.547945
MasVnrArea,8,0.547945
MSSubClass,0,0.000000
LotArea,0,0.000000
...,...,...
Fence_MnPrv,0,0.000000
Fence_MnWw,0,0.000000
MiscFeature_Othr,0,0.000000
MiscFeature_Shed,0,0.000000


In [38]:
def high_correlation_pairs(
    df: pd.DataFrame,
    threshold: float = 0.8,
    method: str = "pearson"
) -> pd.DataFrame:
    """List pairs of numeric columns whose absolute correlation reaches ``threshold``.

    Only numeric columns of ``df`` are considered. Each unordered pair is
    reported once (upper triangle of the correlation matrix, diagonal
    excluded). Pairs whose correlation is undefined (NaN) are omitted.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame.
    threshold : float, default 0.8
        Minimum absolute correlation for a pair to be reported.
    method : str, default "pearson"
        Correlation method, forwarded to ``DataFrame.corr``.

    Returns
    -------
    pd.DataFrame
        Columns ``feature_1``, ``feature_2``, ``correlation``; sorted by
        absolute correlation in descending order and re-indexed from 0.
        Empty when fewer than two numeric columns are present.
    """
    df_num = df.select_dtypes(include=[np.number])
    corr_matrix = df_num.corr(method=method)

    # Enumerate the strict upper triangle positionally (row-major order).
    # Unlike where(mask).stack().reset_index(), this does not depend on the
    # axis names of the correlation matrix nor on whether stack() drops NaN.
    names = corr_matrix.columns.to_numpy()
    row_pos, col_pos = np.triu_indices(len(names), k=1)
    corr_pairs = pd.DataFrame({
        "feature_1": names[row_pos],
        "feature_2": names[col_pos],
        "correlation": corr_matrix.to_numpy()[row_pos, col_pos],
    })

    # NaN correlations compare False here and are therefore dropped as well.
    corr_pairs = corr_pairs[
        corr_pairs["correlation"].abs() >= threshold
    ]

    return corr_pairs.sort_values(
        by="correlation",
        key=lambda x: x.abs(),
        ascending=False
    ).reset_index(drop=True)


In [49]:
# List numeric column pairs of the encoded training frame using the function's
# default settings (threshold=0.8, method="pearson").
high_correlation_pairs(
    df_train
)

Unnamed: 0,feature_1,feature_2,correlation
0,SaleType,SaleCondition,0.953788
1,Exterior1st,Exterior2nd,0.938908
2,ExterQual_Gd,ExterQual_TA,-0.906121
3,GarageCars,GarageArea,0.882475
4,YearBuilt,GarageYrBlt,0.825667
5,GrLivArea,TotRmsAbvGrd,0.825489
6,KitchenQual_Gd,KitchenQual_TA,-0.824457
7,TotalBsmtSF,1stFlrSF,0.81953


In [50]:
# Remove the listed columns from the training frame and replace every remaining
# missing value with 0.
# NOTE(review): only df_train is modified here; df_test keeps these columns and
# its missing values — confirm that downstream code aligns the two frames.
df_train = (
    df_train
    # errors="ignore" keeps this cell re-runnable: without it a second execution
    # raises KeyError because the columns are already gone. The trade-off is that
    # a misspelled column name is skipped silently.
    .drop(
        columns=['GarageCars', 'GarageYrBlt', 'TotRmsAbvGrd', 'SaleCondition', 'Id'],
        errors="ignore",
    )
    .fillna(0)
)

In [52]:
TARGET = 'SalePrice'

# Split the training frame into the target vector and the feature matrix.
y = df_train[TARGET]
X = df_train.drop(columns=[TARGET])

# X_num is simply another name for X (no column filtering happens here).
X_num = X

# Score every feature against the target with the univariate F-test;
# k='all' keeps all features, so the selector is only used for its scores.
# Remaining NaNs (if any) are replaced by the column median before fitting.
selector_f = SelectKBest(score_func=f_regression, k='all').fit(
    X_num.fillna(X_num.median()), y
)

# One row per feature, strongest F-score first.
feature_scores = (
    pd.DataFrame({'feature': X.columns, 'f_score': selector_f.scores_})
    .sort_values('f_score', ascending=False)
)

display(feature_scores)

Unnamed: 0,feature,f_score
10,OverallQual,2436.770591
5,Neighborhood,1580.066684
33,GrLivArea,1470.585010
44,GarageArea,926.951287
26,TotalBsmtSF,880.341282
...,...,...
35,BsmtHalfBath,0.413789
90,PoolQC_Fa,0.379275
65,Utilities_NoSeWa,0.298804
91,PoolQC_Gd,0.211326


In [53]:
def get_low_correlation_features(df, target_col, threshold=0.1):
    """Return the features whose absolute correlation with the target is below ``threshold``.

    Only numeric and boolean columns are considered, so a frame that still
    contains text columns does not make the correlation computation fail.
    The target column itself is never part of the result, and features whose
    correlation is undefined (NaN, e.g. constant columns) are not reported.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the features and the target.
    target_col : str
        Name of the (numeric) target column.
    threshold : float, default 0.1
        Features with ``abs(correlation) < threshold`` are returned.

    Returns
    -------
    list
        Column names ordered by absolute correlation, highest first.

    Raises
    ------
    KeyError
        If ``target_col`` is not among the numeric/boolean columns of ``df``.
    """
    numeric = df.select_dtypes(include=[np.number, "bool"])

    target_corr = (
        numeric.corr()[target_col]
        .drop(labels=target_col)  # the target trivially correlates with itself
        .abs()
        .sort_values(ascending=False)
    )

    # NaN < threshold is False, so undefined correlations are filtered out here.
    return target_corr[target_corr < threshold].index.tolist()

# Cut-off used for this run (stricter than the function's default of 0.1).
threshold = 0.02

# Features of the prepared training frame that barely correlate with 'SalePrice'.
to_drop = get_low_correlation_features(
    df=df_train,
    target_col='SalePrice',
    threshold=threshold,
)

print(f"Weak correlation variables: {to_drop}")

Weak correlation variables: ['BsmtHalfBath', 'PoolQC_Fa', 'Utilities_NoSeWa', 'PoolQC_Gd', 'BsmtFinSF2']


In [54]:
# Persist the prepared frames as CSV without writing the row index.
# NOTE(review): df_train has had columns dropped and NaNs filled in an earlier
# cell, while df_test is written exactly as returned by encode_train_test —
# confirm the consumer of these files expects that difference.
df_train.to_csv('./data/df_train.csv', index = False)
df_test.to_csv('./data/df_test.csv', index = False)