In [1]:
import pandas as pd
# Load the Ames housing table, then separate the features from the target.
target_name = "SalePrice"
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")

data = ames_housing.drop(columns=target_name)
# Turn the price into a binary label: 1 when it is above 200_000, else 0.
target = (ames_housing[target_name] > 200_000).astype(int)

In [8]:
# Columns handled as numerical features by the models in this notebook.
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1",
    "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
    "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BedroomAbvGr",
    "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars",
    "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

In [12]:
# Every column that is not listed in `numerical_features` is treated as categorical.
categorical_features = list(data.columns.difference(numerical_features))

In [10]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression fitted on the standardized numerical columns only.
numerical_data = data[numerical_features]
model = make_pipeline(StandardScaler(), LogisticRegression())

# Cross-validate with cv=10; error_score="raise" surfaces any fitting error
# immediately instead of recording a placeholder score for that split.
cv_results = cross_validate(
    model,
    numerical_data,
    target,
    cv=10,
    error_score="raise",
)
score = cv_results["test_score"]
score.mean()

0.891780821917808

In [11]:
# Per-split test scores of the numerical-only model (one entry per CV split).
score

array([0.9109589 , 0.89041096, 0.9109589 , 0.88356164, 0.90410959,
       0.88356164, 0.88356164, 0.87671233, 0.89726027, 0.87671233])

In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Full-feature model: standardize the numerical columns and one-hot encode the
# categorical ones before fitting a logistic regression.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", StandardScaler(), numerical_features),
        (
            "categorical",
            # Categories not seen during fit are ignored instead of raising.
            OneHotEncoder(handle_unknown="ignore"),
            categorical_features,
        ),
    ]
)
model_w_all_features = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=1000),
)

cv_results_all_features = cross_validate(model_w_all_features, data, target, cv=10)
all_features_score = cv_results_all_features["test_score"]
all_features_score

array([0.95890411, 0.90410959, 0.89041096, 0.92465753, 0.9109589 ,
       0.93835616, 0.90410959, 0.91780822, 0.92465753, 0.89726027])

In [26]:
# Side-by-side comparison of the two models, one row per CV split:
# (split number starting at 1, all-features score, numerical-only score),
# with both scores rounded to 4 decimals.
# The split number is derived from the score arrays themselves rather than a
# hard-coded range, so the table stays complete if the number of splits changes.
[
    (fold, round(full_score, 4), round(numerical_score, 4))
    for fold, (full_score, numerical_score) in enumerate(
        zip(all_features_score, score), start=1
    )
]

[(1, 0.9589, 0.911),
 (2, 0.9041, 0.8904),
 (3, 0.8904, 0.911),
 (4, 0.9247, 0.8836),
 (5, 0.911, 0.9041),
 (6, 0.9384, 0.8836),
 (7, 0.9041, 0.8836),
 (8, 0.9178, 0.8767),
 (9, 0.9247, 0.8973),
 (10, 0.8973, 0.8767)]