In [1]:
# Import necessary libraries for preprocessing
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer

# Read the Ames housing data from the CSV file in the working directory
Ames = pd.read_csv("Ames.csv")

# These columns are stored as numbers but are to be handled as categories,
# so cast each of them to object dtype
for code_column in ("MSSubClass", "YrSold", "MoSold"):
    Ames[code_column] = Ames[code_column].astype("object")

# Numeric predictors: every int64/float64 column except "PID" and "SalePrice"
numeric_columns = Ames.select_dtypes(include=["int64", "float64"])
numeric_features = numeric_columns.drop(columns=["PID", "SalePrice"]).columns

# Categorical predictors: every object column except "Electrical",
# which is kept in its own single-item list
object_columns = Ames.select_dtypes(include=["object"]).columns
categorical_features = object_columns.difference(["Electrical"])
electrical_feature = ["Electrical"]

# Category orderings for the ordinal columns, written out by hand following the
# data dictionary. OrdinalEncoder assigns codes by list position, so the first
# entry of each list is encoded as 0 and the last entry gets the highest code.
five_level_quality = ["Po", "Fa", "TA", "Gd", "Ex"]
quality_or_none = ["None"] + five_level_quality
basement_finish = ["None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]

# Each key receives its own list object (list(...) copies the shared scale)
ordinal_order = {
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "ExterQual": list(five_level_quality),
    "ExterCond": list(five_level_quality),
    "BsmtQual": list(quality_or_none),
    "BsmtCond": list(quality_or_none),
    "BsmtExposure": ["None", "No", "Mn", "Av", "Gd"],
    "BsmtFinType1": list(basement_finish),
    "BsmtFinType2": list(basement_finish),
    "HeatingQC": list(five_level_quality),
    "KitchenQual": list(five_level_quality),
    "Functional": ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
    "FireplaceQu": list(quality_or_none),
    "GarageFinish": ["None", "Unf", "RFn", "Fin"],
    "GarageQual": list(quality_or_none),
    "GarageCond": list(quality_or_none),
    "PavedDrive": ["N", "P", "Y"],
    "PoolQC": ["None", "Fa", "TA", "Gd", "Ex"],
    "Fence": ["None", "MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Names of ALL ordinal columns, in the order they appear in the dictionary
ordinal_features = [*ordinal_order]

# The same names without "Electrical", which is handled separately
ordinal_except_electrical = list(ordinal_features)
ordinal_except_electrical.remove("Electrical")

# Helper used by the categorical pipelines to mark missing entries explicitly
def fill_none(X):
    """Replace every missing value in ``X`` with the string "None".

    ``X`` is a pandas object; its object-dtype columns are first re-inferred
    to a more specific dtype where pandas can do so, and the missing entries
    of the result are then filled. The filled object is returned.
    """
    inferred = X.infer_objects(copy=False)
    return inferred.fillna("None")

# "Electrical" pipeline: a missing entry takes the column's most frequent value,
# then the labels are encoded in the order given by ordinal_order["Electrical"]
electrical_imputer = SimpleImputer(strategy="most_frequent")
electrical_encoder = OrdinalEncoder(categories=[ordinal_order["Electrical"]])
electrical_transformer = Pipeline([
    ("impute_electrical", electrical_imputer),
    ("ordinal_electrical", electrical_encoder),
])

# Numeric pipeline: a missing value is replaced by the mean of its column
mean_imputer = SimpleImputer(strategy="mean")
numeric_transformer = Pipeline([("impute_mean", mean_imputer)])

# Ordinal pipeline (all ordinal columns except "Electrical"): missing values become
# the string "None", then each column is encoded using its list from ordinal_order.
# The category lists are collected in the same order as ordinal_except_electrical.
ordinal_categories = [ordinal_order[column] for column in ordinal_except_electrical]
ordinal_transformer = Pipeline([
    ("fill_none", FunctionTransformer(fill_none, validate=False)),
    ("ordinal", OrdinalEncoder(categories=ordinal_categories)),
])

# Nominal columns are the categorical columns that have no entry in ordinal_features
ordinal_lookup = set(ordinal_features)
nominal_features = [column for column in categorical_features if column not in ordinal_lookup]

# Nominal pipeline: missing values become the string "None", then the column is
# one-hot encoded; categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ("fill_none", FunctionTransformer(fill_none, validate=False)),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# One (name, transformer, columns) entry per column group. The output columns are
# stacked in this order: "Electrical", numeric, remaining ordinal, one-hot nominal.
# Columns that are not listed here (ColumnTransformer's default remainder="drop")
# do not reach the output.
column_transformers = [
    ("electrical", electrical_transformer, electrical_feature),
    ("num", numeric_transformer, numeric_features),
    ("ordinal", ordinal_transformer, ordinal_except_electrical),
    ("nominal", categorical_transformer, nominal_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Apply the preprocessing pipeline to Ames
transformed_data = preprocessor.fit_transform(Ames)

# ColumnTransformer only returns a SciPy sparse matrix when the stacked result is
# sparse enough (controlled by its `sparse_threshold`); otherwise it already
# returns a dense NumPy array, which has no .toarray(). Densify only when needed
# so this cell works in both cases.
if hasattr(transformed_data, "toarray"):
    transformed_data = transformed_data.toarray()

# Generate column names for the one-hot encoded features
onehot_encoder = preprocessor.named_transformers_["nominal"].named_steps["onehot"]
onehot_features = onehot_encoder.get_feature_names_out()

# Combine all feature names in the same order as the transformers are listed
# in the ColumnTransformer: Electrical, numeric, ordinal, one-hot nominal
all_feature_names = (
    ["Electrical"]
    + list(numeric_features)
    + list(ordinal_except_electrical)
    + list(onehot_features)
)

# Guard against the hand-built name list drifting out of sync with the output
assert transformed_data.shape[1] == len(all_feature_names), (
    f"expected {len(all_feature_names)} columns, got {transformed_data.shape[1]}"
)

# Convert the transformed array to a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=all_feature_names)

In [3]:
# Show the preprocessed feature matrix; pandas abbreviates large frames when printing
print(transformed_df)

      Electrical  GrLivArea  LotFrontage  LotArea  OverallQual  OverallCond  \
0            4.0      856.0    68.510628   7890.0          6.0          6.0   
1            4.0     1049.0    42.000000   4235.0          5.0          5.0   
2            4.0     1001.0    60.000000   6060.0          5.0          9.0   
3            4.0     1039.0    80.000000   8146.0          4.0          8.0   
4            4.0     1665.0    70.000000   8400.0          8.0          6.0   
...          ...        ...          ...      ...          ...          ...   
2574         2.0      952.0    68.510628   8854.0          6.0          6.0   
2575         3.0     1733.0    68.510628  13680.0          3.0          5.0   
2576         3.0     2002.0    82.000000   6270.0          5.0          6.0   
2577         4.0     1842.0    68.510628   8826.0          7.0          5.0   
2578         4.0     1911.0    80.000000   9554.0          8.0          5.0   

      YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinS

In [13]:
# Expected number of output columns: one per numeric feature, one per ordinal
# feature (including "Electrical"), and one per distinct value of each nominal
# feature after missing values are filled.
# fill_none is reused so the distinct values are counted on exactly what the
# one-hot step receives; calling .fillna("None") directly on these object
# columns can make pandas emit a downcasting FutureWarning.
nominal_filled = fill_none(Ames[nominal_features])
print(len(numeric_features) + len(ordinal_features) + nominal_filled.nunique().sum())

2819


  print(len(numeric_features) + len(ordinal_features) + Ames[nominal_features].fillna("None").nunique().sum())


In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Chain the preprocessing and a single decision tree into one estimator so the
# preprocessing is re-fitted inside every cross-validation split
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", DecisionTreeRegressor(random_state=42)),
])

# Cross-validate with cross_val_score's default splitting and the regressor's
# default score
features = Ames.drop(columns="SalePrice")
target = Ames["SalePrice"]
scores = cross_val_score(model_pipeline, features, target)

# Report the mean score over the folds, rounded to four decimals
mean_score = round(scores.mean(), 4)
print("Decision Tree Regressor Mean CV R^2:", mean_score)

Decision Tree Regressor Mean CV R^2: 0.7663


In [19]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Candidate regressors: a single tree and a bagged ensemble of ten such trees
models = {
    "Decision Tree (1 Tree)": DecisionTreeRegressor(random_state=42),
    "Bagging Regressor (10 Trees)": BaggingRegressor(
        estimator=DecisionTreeRegressor(random_state=42),
        n_estimators=10,
        random_state=42
    )
}

# The predictors and target are the same for every model, so build them once
features = Ames.drop(columns="SalePrice")
target = Ames["SalePrice"]

results = {}

for name, model in models.items():
    # Define the full model pipeline for each model
    model_pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    # Perform cross-validation
    scores = cross_val_score(model_pipeline, features, target)
    # Store the mean of the scores as a plain Python float so the dictionary
    # prints as 0.7663 rather than np.float64(0.7663)
    results[name] = round(float(scores.mean()), 4)

# Output the cross-validation scores
print("Cross-validation scores:", results)

Cross-validation scores: {'Decision Tree (1 Tree)': np.float64(0.7663), 'Bagging Regressor (10 Trees)': np.float64(0.8781)}


In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Ensemble sizes to compare: 10, 20, ..., 100 trees
n_trees = list(range(10, 101, 10))

# Baseline model: one decision tree
models = {'Decision Tree (1 Tree)': DecisionTreeRegressor(random_state=42)}

# One bagging ensemble of decision trees per ensemble size
models.update({
    f'Bagging Regressor {n} Trees': BaggingRegressor(
        estimator=DecisionTreeRegressor(random_state=42),
        n_estimators=n,
        random_state=42,
    )
    for n in n_trees
})

# Predictors and target shared by every evaluation
features = Ames.drop(columns='SalePrice')
target = Ames['SalePrice']

results = {}
for name, model in models.items():
    # Preprocessing and regressor are cross-validated together as one pipeline
    model_pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', model)])
    scores = cross_val_score(model_pipeline, features, target)
    # Keep the mean fold score, rounded to four decimals
    results[name] = round(scores.mean(), 4)

# Report one line per model, in the order the models were defined
print("Cross-validation scores:")
for label, mean_score in results.items():
    print(f"{label}: {mean_score}")

In [None]:
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Ensemble sizes to evaluate: 10 through 100 in steps of 10
n_trees = [size for size in range(10, 110, 10)]

# Start from the single-tree baseline, then register a bagging ensemble and a
# random forest for every size (bagging first, so the two alternate in the dict)
models = {}
models["Decision Tree (1 Tree)"] = DecisionTreeRegressor(random_state=42)
for n in n_trees:
    bagging_label = f"Bagging Regressor {n} Trees"
    forest_label = f"Random Forest {n} Trees"
    models[bagging_label] = BaggingRegressor(
        estimator=DecisionTreeRegressor(random_state=42),
        n_estimators=n,
        random_state=42,
    )
    models[forest_label] = RandomForestRegressor(n_estimators=n, random_state=42)

# Inputs common to all evaluations
features = Ames.drop(columns="SalePrice")
target = Ames["SalePrice"]

results = {}
for name, model in models.items():
    # Wrap the regressor with the shared preprocessing step
    pipeline_steps = [("preprocessor", preprocessor), ("regressor", model)]
    model_pipeline = Pipeline(pipeline_steps)
    # Cross-validate the whole pipeline and record the rounded mean score
    scores = cross_val_score(model_pipeline, features, target)
    results[name] = round(scores.mean(), 4)

# Print the results, one model per line
print("Cross-validation scores:")
for label, mean_score in results.items():
    print(f"{label}: {mean_score}")