In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

# Read the Ames dataset from the CSV file in the working directory
ames_csv_path = "Ames.csv"
Ames = pd.read_csv(ames_csv_path)

# Category levels for every ordinal feature, taken from the data dictionary.
# Each list is written in encoding order: the first entry is encoded as 0.
# Rating scales shared by several features are spelled out once here and
# copied with list(...) so that every feature still owns an independent list.
_quality_scale = ("Po", "Fa", "TA", "Gd", "Ex")
_quality_scale_with_none = ("None",) + _quality_scale
_basement_finish_scale = ("None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ")

ordinal_order = {
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],  # Electrical system
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],  # General shape of property
    "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],  # Type of utilities available
    "LandSlope": ["Sev", "Mod", "Gtl"],  # Slope of property
    "ExterQual": list(_quality_scale),  # Quality of the material on the exterior
    "ExterCond": list(_quality_scale),  # Present condition of the material on the exterior
    "BsmtQual": list(_quality_scale_with_none),  # Height of the basement
    "BsmtCond": list(_quality_scale_with_none),  # General condition of the basement
    "BsmtExposure": ["None", "No", "Mn", "Av", "Gd"],  # Walkout or garden level basement walls
    "BsmtFinType1": list(_basement_finish_scale),  # Quality of basement finished area
    "BsmtFinType2": list(_basement_finish_scale),  # Quality of second basement finished area
    "HeatingQC": list(_quality_scale),  # Heating quality and condition
    "KitchenQual": list(_quality_scale),  # Kitchen quality
    "Functional": ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],  # Home functionality
    "FireplaceQu": list(_quality_scale_with_none),  # Fireplace quality
    "GarageFinish": ["None", "Unf", "RFn", "Fin"],  # Interior finish of the garage
    "GarageQual": list(_quality_scale_with_none),  # Garage quality
    "GarageCond": list(_quality_scale_with_none),  # Garage condition
    "PavedDrive": ["N", "P", "Y"],  # Paved driveway
    "PoolQC": ["None", "Fa", "TA", "Gd", "Ex"],  # Pool quality (this scale has no "Po" level)
    "Fence": ["None", "MnWw", "GdWo", "MnPrv", "GdPrv"],  # Fence quality
}

# Names of ALL ordinal features, in dictionary order
ordinal_features = list(ordinal_order)

# Every ordinal feature apart from "Electrical", which is imputed differently
ordinal_except_electrical = [name for name in ordinal_features if name != "Electrical"]

# "Electrical" gets its own imputer: missing entries are replaced by the
# column's most frequent value (the mode)
electrical_mode_step = ("impute_electrical", SimpleImputer(strategy="most_frequent"))
electrical_imputer = Pipeline(steps=[electrical_mode_step])

# Helper used for the remaining ordinal features: missing -> "None"
def fill_none(X):
    """Return ``X`` with every missing value replaced by the string "None".

    ``X`` must provide a pandas-style ``fillna`` method. The input itself is
    not modified; the filled result is a new object.
    """
    filled = X.fillna(value="None")
    return filled

# Wrap fill_none as a transformer so it can act as a pipeline step;
# validate=False hands the input to fill_none without array conversion
none_filler = FunctionTransformer(func=fill_none, validate=False)
ordinal_imputer = Pipeline(steps=[("fill_none", none_filler)])

# Preprocessor for filling missing values: "Electrical" goes through the mode
# imputer, every other ordinal feature through the "None" filler
fill_transformers = [
    ("electrical", electrical_imputer, ["Electrical"]),
    ("cat", ordinal_imputer, ordinal_except_electrical),
]
preprocessor_fill = ColumnTransformer(transformers=fill_transformers)

# Apply preprocessor for filling missing values.
# ColumnTransformer lays out its output in transformer order ("electrical"
# first, then "cat"), which is NOT necessarily the key order of ordinal_order.
# Record that column order once and reuse it for everything below.
filled_columns = ["Electrical"] + ordinal_except_electrical
filled_values = preprocessor_fill.fit_transform(Ames[ordinal_features])

# Convert back to DataFrame to apply OrdinalEncoder
Ames_ordinal = pd.DataFrame(filled_values, columns=filled_columns)

# Apply Ordinal Encoding.
# OrdinalEncoder pairs categories[i] with the i-th input column, so the list is
# built from the frame's actual column order. Building it from the dict's key
# order only works while "Electrical" happens to be the first key.
categories = [ordinal_order[feature] for feature in filled_columns]
ordinal_encoder = OrdinalEncoder(categories=categories)
Ames_ordinal_encoded = ordinal_encoder.fit_transform(Ames_ordinal)
Ames_ordinal_encoded = pd.DataFrame(Ames_ordinal_encoded, columns=filled_columns)

# Ames dataset of ordinal features prior to ordinal encoding
print(Ames_ordinal)

     Electrical LotShape Utilities LandSlope ExterQual ExterCond BsmtQual  \
0         SBrkr      Reg    AllPub       Gtl        TA        TA       TA   
1         SBrkr      Reg    AllPub       Gtl        Gd        TA       Gd   
2         SBrkr      Reg    AllPub       Gtl        Gd        TA       TA   
3         SBrkr      Reg    AllPub       Gtl        Gd        Gd       Fa   
4         SBrkr      Reg    AllPub       Gtl        Gd        TA       Gd   
...         ...      ...       ...       ...       ...       ...      ...   
2574      FuseF      Reg    AllPub       Gtl        TA        TA       TA   
2575      FuseA      IR1    AllPub       Gtl        TA        TA     None   
2576      FuseA      Reg    AllPub       Gtl        TA        TA       TA   
2577      SBrkr      Reg    AllPub       Gtl        Gd        TA       Gd   
2578      SBrkr      IR1    AllPub       Gtl        Gd        TA       Gd   

     BsmtCond BsmtExposure BsmtFinType1  ... HeatingQC KitchenQual Function

In [4]:
# Show the per-column category lists that were passed to OrdinalEncoder
print(categories)

[['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'], ['IR3', 'IR2', 'IR1', 'Reg'], ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'], ['Sev', 'Mod', 'Gtl'], ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ['None', 'No', 'Mn', 'Av', 'Gd'], ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ['Po', 'Fa', 'TA', 'Gd', 'Ex'], ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'], ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ['None', 'Unf', 'RFn', 'Fin'], ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ['N', 'P', 'Y'], ['None', 'Fa', 'TA', 'Gd', 'Ex'], ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']]


In [8]:
# Show the ordinal features after ordinal encoding
print(Ames_ordinal_encoded)

      Electrical  LotShape  Utilities  LandSlope  ExterQual  ExterCond  \
0            4.0       3.0        3.0        2.0        2.0        2.0   
1            4.0       3.0        3.0        2.0        3.0        2.0   
2            4.0       3.0        3.0        2.0        3.0        2.0   
3            4.0       3.0        3.0        2.0        3.0        3.0   
4            4.0       3.0        3.0        2.0        3.0        2.0   
...          ...       ...        ...        ...        ...        ...   
2574         2.0       3.0        3.0        2.0        2.0        2.0   
2575         3.0       2.0        3.0        2.0        2.0        2.0   
2576         3.0       3.0        3.0        2.0        2.0        2.0   
2577         4.0       3.0        3.0        2.0        3.0        2.0   
2578         4.0       2.0        3.0        2.0        3.0        2.0   

      BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  ...  HeatingQC  \
0          3.0       3.0           1.0 

In [12]:
# Import the necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import dtreeviz

# Read the Ames dataset from the CSV file in the working directory
ames_csv_path = "Ames.csv"
Ames = pd.read_csv(ames_csv_path)

# Category levels for every ordinal feature, taken from the data dictionary.
# Each list is written in encoding order: the first entry is encoded as 0.
# Rating scales shared by several features are spelled out once here and
# copied with list(...) so that every feature still owns an independent list.
_quality_scale = ("Po", "Fa", "TA", "Gd", "Ex")
_quality_scale_with_none = ("None",) + _quality_scale
_basement_finish_scale = ("None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ")

ordinal_order = {
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "ExterQual": list(_quality_scale),
    "ExterCond": list(_quality_scale),
    "BsmtQual": list(_quality_scale_with_none),
    "BsmtCond": list(_quality_scale_with_none),
    "BsmtExposure": ["None", "No", "Mn", "Av", "Gd"],
    "BsmtFinType1": list(_basement_finish_scale),
    "BsmtFinType2": list(_basement_finish_scale),
    "HeatingQC": list(_quality_scale),
    "KitchenQual": list(_quality_scale),
    "Functional": ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
    "FireplaceQu": list(_quality_scale_with_none),
    "GarageFinish": ["None", "Unf", "RFn", "Fin"],
    "GarageQual": list(_quality_scale_with_none),
    "GarageCond": list(_quality_scale_with_none),
    "PavedDrive": ["N", "P", "Y"],
    # This scale has no "Po" level, so it is written out in full
    "PoolQC": ["None", "Fa", "TA", "Gd", "Ex"],
    "Fence": ["None", "MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Names of all ordinal features, in dictionary order
ordinal_features = list(ordinal_order)

# Every ordinal feature apart from "Electrical", which is imputed differently
ordinal_except_electrical = [name for name in ordinal_features if name != "Electrical"]

# "Electrical" gets its own imputer: missing entries are replaced by the
# column's most frequent value (the mode)
electrical_mode_step = ("impute_electrical", SimpleImputer(strategy="most_frequent"))
electrical_imputer = Pipeline(steps=[electrical_mode_step])

# Helper used for the remaining ordinal features: missing -> "None"
def fill_none(X):
    """Return ``X`` with every missing value replaced by the string "None".

    ``X`` must provide a pandas-style ``fillna`` method. The input itself is
    not modified; the filled result is a new object.
    """
    filled = X.fillna(value="None")
    return filled

# Wrap fill_none as a transformer so it can act as a pipeline step;
# validate=False hands the input to fill_none without array conversion
none_filler = FunctionTransformer(func=fill_none, validate=False)
ordinal_imputer = Pipeline(steps=[("fill_none", none_filler)])

# Preprocessor for filling missing values: "Electrical" goes through the mode
# imputer, every other ordinal feature through the "None" filler
fill_transformers = [
    ("electrical", electrical_imputer, ["Electrical"]),
    ("cat", ordinal_imputer, ordinal_except_electrical),
]
preprocessor_fill = ColumnTransformer(transformers=fill_transformers)

# Apply preprocessor for filling missing values.
# ColumnTransformer lays out its output in transformer order ("electrical"
# first, then "cat"), which is NOT necessarily the key order of ordinal_order.
# Record that column order once and reuse it for everything below.
filled_columns = ["Electrical"] + ordinal_except_electrical
filled_values = preprocessor_fill.fit_transform(Ames[ordinal_features])

# Convert back to DataFrame to apply OrdinalEncoder
Ames_ordinal = pd.DataFrame(filled_values, columns=filled_columns)

# Apply Ordinal Encoding.
# OrdinalEncoder pairs categories[i] with the i-th input column, so the list is
# built from the frame's actual column order. Building it from the dict's key
# order only works while "Electrical" happens to be the first key.
categories = [ordinal_order[feature] for feature in filled_columns]
ordinal_encoder = OrdinalEncoder(categories=categories)
Ames_ordinal_encoded = ordinal_encoder.fit_transform(Ames_ordinal)
Ames_ordinal_encoded = pd.DataFrame(Ames_ordinal_encoded, columns=filled_columns)

# Load and split the data
# NOTE(review): the "Electrical" mode imputation above was fitted on all rows
# before this split, so the held-out rows influenced it — confirm that is
# acceptable for this visualisation-only example.
X_ordinal = Ames_ordinal_encoded  # Use only the ordinal features
y = Ames["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(
    X_ordinal, y, test_size=0.2, random_state=42
)

# Initialize and fit the decision tree.
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, equally good splits can be resolved differently from
# run to run and the plotted tree would not be reproducible.
tree_model = DecisionTreeRegressor(max_depth=3, random_state=42)
# Fitted on the bare array, so the model itself carries no column names
tree_model.fit(X_train.values, y_train)

# Build the dtreeviz adaptor for the fitted tree; the model was trained on a
# bare array, so the column labels are supplied explicitly here
feature_labels = list(X_train.columns)
viz = dtreeviz.model(
    tree_model,
    X_train,
    y_train,
    feature_names=feature_labels,
    target_name="SalePrice",
)

# Render the tree, then display the rendering
v = viz.view()
v.show()