In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, FunctionTransformer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV

# Linear regression

In [2]:
def train_linear_regression(
    data: pd.DataFrame, 
    target: str, 
    alphas: list[int] = [0.1, 1, 10, 100],
    num_transformer: str = 'robust',
    regularization: str = None,
    test_size: float = 0.2, 
    random_state: int = 42
):
    """
    Train a linear regression model with optional regularization.
    Calculates R² correctly for log-transformed targets.
    Handles ordinal features with ordinal encoding.
    """
    X = data.drop(target, axis=1)
    
    ordinal_columns = [
        'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 
        'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ]
    
    
    # Separate nominal categorical columns 
    nominal_columns = [col for col in X.select_dtypes(include=['object', 'category']).columns 
                       if col not in ordinal_columns]
    
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
    
    # Define ordinal mappings
    ordinal_mappings = {
        # Quality features (Ex, Gd, TA, Fa, Po)
        'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'ExterCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'PoolQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        
        # Overall quality and condition (1-10)
        'OverallQual': {i: i for i in range(1, 11)},
        'OverallCond': {i: i for i in range(1, 11)},
        
        # Basement exposure
        'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
        
        # Basement finish types
        'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
        'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}
    }
    
    log_transformer = FunctionTransformer(np.log1p, validate=True)
    
    # transformations
    use_log_transform = num_transformer in ['log', 'log+robust']
    y = data[target]
    
    if num_transformer == 'robust': 
        num_pipeline = RobustScaler()
    elif num_transformer == 'standard': 
        num_pipeline = StandardScaler()
    elif num_transformer == 'log+robust':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', RobustScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log+standard':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', StandardScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log':
        num_pipeline = log_transformer
        y = np.log1p(y)
    else:
        num_pipeline = 'passthrough'
    
    # Create transformers for each ordinal feature
    ordinal_transformers = []
    for col in ordinal_columns:
        ordinal_transformers.append(
            (f'ord_{col}', 
                OrdinalEncoder(
                    categories=[list(ordinal_mappings[col].keys())],
                    handle_unknown='use_encoded_value',
                    unknown_value=-1
                ), 
                [col])
        )
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), nominal_columns),
            ('num', num_pipeline, numerical_columns)
        ] + ordinal_transformers
    )

    # Select appropriate regressor
    if regularization == 'ridge':
        regressor = Ridge()
        param_grid = {'regressor__alpha': alphas}
    elif regularization == 'lasso':
        regressor = Lasso(max_iter=50000, tol=0.001, selection='random') # for better stability reduced tolerance and increase max_iter
        param_grid = {'regressor__alpha': alphas}
    else:
        regressor = LinearRegression()
        param_grid = {}

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    if regularization in ['ridge', 'lasso']:
        grid_search = GridSearchCV(
            model, 
            param_grid, 
            cv=10, 
            scoring='neg_root_mean_squared_error'
        )
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        y_pred = best_model.predict(X_test)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        best_model = model
        best_params = {}

    if use_log_transform:
        # Apply safe expm1 by clipping extreme values
        max_safe_value = 30  # log(max_safe_value) is about 1.3e13
        y_pred_clipped = np.clip(y_pred, -max_safe_value, max_safe_value)
        y_test_clipped = np.clip(y_test, -max_safe_value, max_safe_value)
        y_pred = np.expm1(y_pred_clipped)
        y_test = np.expm1(y_test_clipped)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Get feature names from the preprocessor
    # This requires a bit more complex handling now with ordinal features
    feature_names = []
    
    # Add one-hot encoded nominal features
    if nominal_columns:
        feature_names.extend(
            best_model.named_steps['preprocessor']
            .named_transformers_['cat']
            .get_feature_names_out(nominal_columns)
        )
    
    # Add numerical features
    feature_names.extend(numerical_columns)
    
    # Add ordinal features
    feature_names.extend(ordinal_columns)
    
    # Create coefficients DataFrame
    # We need to handle the extraction of coefficients differently now
    coefficients = pd.DataFrame({
        'feature': feature_names,
        'importance': np.abs(best_model.named_steps['regressor'].coef_),
        'value': best_model.named_steps['regressor'].coef_,
        'type': ['Numerical' if col in numerical_columns else 
                 'Ordinal' if col in ordinal_columns else 
                 'Categorical' for col in feature_names]
    }).sort_values('importance', ascending=False)

    return {
        'model': best_model,
        'performance': {
            'root_mean_squared_error': rmse,
            'r2_score': r2
        },
        'best_params': best_params,
        'feature_importances': coefficients,
        'train_data': (X_train, y_train),
        'test_data': (X_test, y_test)
    }

For only cleaned data we necessarily need to apply log, so either log or log+robust

In [None]:

df = pd.read_csv('data/train_cleaned.csv')

# df = pd.read_csv('data/train_rm_OL.csv')
trans_options = ['log', 'log+robust', 'log+standard']
regularization_options = [None, 'ridge', 'lasso']

best_r2 = float('-inf')
best_result = None

for trans in trans_options:
    for regularization in regularization_options:
        alphas = [1, 10, 100, 200] if regularization == 'ridge' else [0.0001, 0.001, 0.01, 0.1]
        results = train_linear_regression(df, target='SalePrice', alphas=alphas, num_transformer=trans, regularization=regularization)
        
        print(f'Results with preprocessing {trans} and regularization {regularization}:')
        for metric, value in results['performance'].items():
            print(f"{metric}: {value}")
        if regularization:
            print('The best alpha=', results['best_params']['regressor__alpha'])
        print("=" * 20)
        
        # Store the best model
        if results['performance']['r2_score'] > best_r2:
            best_r2 = results['performance']['r2_score']
            best_result = (trans, regularization, results)



Results with preprocessing log and regularization None:
root_mean_squared_error: 639782490519.0297
r2_score: -76277740073152.56
Results with preprocessing log and regularization ridge:
root_mean_squared_error: 22546.73928045989
r2_score: 0.9052671500501497
The best alpha= 10
Results with preprocessing log and regularization lasso:
root_mean_squared_error: 22740.54692476231
r2_score: 0.9036315379428266
The best alpha= 0.0001
Results with preprocessing log+robust and regularization None:
root_mean_squared_error: 639782490519.0297
r2_score: -76277740073152.56
Results with preprocessing log+robust and regularization ridge:
root_mean_squared_error: 21916.078600935864
r2_score: 0.9104926260164716
The best alpha= 10
Results with preprocessing log+robust and regularization lasso:
root_mean_squared_error: 22458.212215640728
r2_score: 0.9060096039283818
The best alpha= 0.0001
Results with preprocessing log+standard and regularization None:
root_mean_squared_error: 415829751.3847926
r2_score: -1.

In [4]:
# Print the best model at the end
if best_result:
    trans, regularization, results = best_result
    print("Best model:")
    print(f"Preprocessing: {trans}, Regularization: {regularization}")
    for metric, value in results['performance'].items():
        print(f"{metric}: {value}")
    if regularization:
        print('The best alpha=', results['best_params']['regressor__alpha'])
        
    print(results['feature_importances'].head(10))
        


Best model:
Preprocessing: log+robust, Regularization: ridge
root_mean_squared_error: 21916.078600935864
r2_score: 0.9104926260164716
The best alpha= 10
                   feature  importance     value         type
205              GrLivArea    0.134737  0.134737    Numerical
0         MSZoning_C (all)    0.121066 -0.121066  Categorical
33    Neighborhood_Crawfor    0.097950  0.097950  Categorical
49    Neighborhood_StoneBr    0.080126  0.080126  Categorical
195              YearBuilt    0.064764  0.064764    Numerical
151         Functional_Typ    0.062604  0.062604  Categorical
95     Exterior1st_BrkFace    0.062251  0.062251  Categorical
184  SaleCondition_Abnorml    0.060802 -0.060802  Categorical
226            OverallQual    0.058921  0.058921    Numerical
146        Functional_Maj2    0.051361 -0.051361  Categorical


### Linear regression conclusions:

This result is pretty impresive for linear regression.

From above we can see, that the best R2 is obtained for cleaned-only data using no regularization. Of course, given that we have almost 400 predictors (most from one-hot-encoding) and 1560 rows, there is no problem with overfitting to the data. Linear models tend to have relativily large bias due to strong assumptions, and regularization only enhances the bias without lowering variance. Probably non-linear models with low bias will perform better.

# Decision tree

In [5]:

from sklearn.tree import DecisionTreeRegressor

def train_decision_tree_regression(
    data: pd.DataFrame, 
    target: str, 
    max_depth: list[int] = [2, 3, 5, 7, 10],
    min_samples_splits: list[int] = [2, 5, 10],
    min_samples_leaf: list[int] = [1, 2, 4],
    max_features: list[int, float, str] = [None, 'sqrt', 'log2'],
    num_transformer: str = 'robust',
    test_size: float = 0.2, 
    random_state: int = 42
):
    """
    Train a decision tree regression model with pruning and advanced preprocessing.
    Supports various numerical transformations and handles categorical features.
    
    Parameters:
    -----------
    data : pd.DataFrame
        Input dataset
    target : str
        Name of the target column
    pruning_depths : list
        Maximum depths to test for pruning
    min_samples_splits : list
        Minimum number of samples required to split an internal node
    num_transformer : str
        Numerical feature transformation method
        Options: 'robust', 'standard', 'log', 'log+robust', 'log+standard'
    test_size : float
        Proportion of the dataset to include in the test split
    random_state : int
        Random seed for reproducibility
    
    Returns:
    --------
    dict : A dictionary containing model, performance metrics, 
           best parameters, and feature importances
    """
    # Separate features and target
    X = data.drop(target, axis=1)
    
    # Identify column types
    categorical_columns = X.select_dtypes(include=['object', 'category']).columns
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
    
    # Log transformer for skewed features
    log_transformer = FunctionTransformer(np.log1p, validate=True)
    
    # Determine scaling and target transformation
    y = data[target]
    use_log_transform = num_transformer in ['log', 'log+robust', 'log+standard']
    
    # Create numerical preprocessing pipeline
    if num_transformer == 'robust': 
        num_pipeline = RobustScaler()
    elif num_transformer == 'standard': 
        num_pipeline = StandardScaler()
    elif num_transformer == 'log+robust':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', RobustScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log+standard':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', StandardScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log':
        num_pipeline = log_transformer
        y = np.log1p(y)
    else:
        num_pipeline = 'passthrough'

    # Create preprocessor with one-hot encoder for categories
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
            ('num', num_pipeline, numerical_columns)
        ])


    param_grid = {
    'regressor__max_depth': max_depth,
    'regressor__min_samples_split': min_samples_splits,
    'regressor__min_samples_leaf': min_samples_leaf,
    'regressor__max_features': max_features
}

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', DecisionTreeRegressor(random_state=random_state))
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    grid_search = GridSearchCV(
        model, 
        param_grid, 
        cv=10, 
        scoring='neg_root_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    y_pred = best_model.predict(X_test)

    if use_log_transform:
        # Inverse transform predictions and actual values
        y_pred = np.expm1(y_pred)
        y_test = np.expm1(y_test)
        

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Get feature names from model because one-hot encoding
    feature_names = (
        list(best_model.named_steps['preprocessor']
             .named_transformers_['cat']
             .get_feature_names_out(categorical_columns)) + 
        list(numerical_columns)
    )
    
    # Extract feature importances
    importances = best_model.named_steps['regressor'].feature_importances_
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    return {
        'model': best_model,
        'performance': {
            'root_mean_squared_error': rmse,
            'r2_score': r2
        },
        'best_params': best_params,
        'feature_importances': feature_importances,
        'train_data': (X_train, y_train),
        'test_data': (X_test, y_test)
    }

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder

def train_decision_tree_regression(
    data: pd.DataFrame, 
    target: str, 
    max_depth: list[int] = [2, 3, 5, 7, 10],
    min_samples_splits: list[int] = [2, 5, 10],
    min_samples_leaf: list[int] = [1, 2, 4],
    max_features: list[int, float, str] = [None, 'sqrt', 'log2'],
    num_transformer: str = 'robust',
    test_size: float = 0.2, 
    random_state: int = 42
):
    """
    Train a decision tree regression model with pruning and advanced preprocessing.
    Supports various numerical transformations and handles categorical features.
    Properly handles ordinal features using ordinal encoding.
    
    Parameters:
    -----------
    data : pd.DataFrame
        Input dataset
    target : str
        Name of the target column
    pruning_depths : list
        Maximum depths to test for pruning
    min_samples_splits : list
        Minimum number of samples required to split an internal node
    num_transformer : str
        Numerical feature transformation method
        Options: 'robust', 'standard', 'log', 'log+robust', 'log+standard'
    test_size : float
        Proportion of the dataset to include in the test split
    random_state : int
        Random seed for reproducibility
    
    Returns:
    --------
    dict : A dictionary containing model, performance metrics, 
           best parameters, and feature importances
    """
    # Separate features and target
    X = data.drop(target, axis=1)
    
    # Define feature types
    ordinal_columns = [
        'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 
        'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ]
    
    # Keep only ordinal columns that actually exist in the dataset
    ordinal_columns = [col for col in ordinal_columns if col in X.columns]
    
    # Separate remaining categorical columns (nominal)
    nominal_columns = [col for col in X.select_dtypes(include=['object', 'category']).columns 
                       if col not in ordinal_columns]
    
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
    
    # Define ordinal mappings
    ordinal_mappings = {
        # Quality features (Ex, Gd, TA, Fa, Po)
        'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'ExterCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        'PoolQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
        
        # Overall quality and condition (1-10)
        'OverallQual': {i: i for i in range(1, 11)},
        'OverallCond': {i: i for i in range(1, 11)},
        
        # Basement exposure
        'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
        
        # Basement finish types
        'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
        'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}
    }
    
    # Log transformer for skewed features
    log_transformer = FunctionTransformer(np.log1p, validate=True)
    
    # Determine scaling and target transformation
    y = data[target]
    use_log_transform = num_transformer in ['log', 'log+robust', 'log+standard']
    
    # Create numerical preprocessing pipeline
    if num_transformer == 'robust': 
        num_pipeline = RobustScaler()
    elif num_transformer == 'standard': 
        num_pipeline = StandardScaler()
    elif num_transformer == 'log+robust':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', RobustScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log+standard':
        num_pipeline = Pipeline([
            ('log', log_transformer),
            ('scaler', StandardScaler())
        ])
        y = np.log1p(y)
    elif num_transformer == 'log':
        num_pipeline = log_transformer
        y = np.log1p(y)
    else:
        num_pipeline = 'passthrough'

    # Create transformers for each ordinal feature
    ordinal_transformers = []
    for col in ordinal_columns:
        if col in ordinal_mappings:
            ordinal_transformers.append(
                (f'ord_{col}', 
                 OrdinalEncoder(
                     categories=[list(ordinal_mappings[col].keys())],
                     handle_unknown='use_encoded_value',
                     unknown_value=-1
                 ), 
                 [col])
            )

    # Create preprocessor with one-hot encoder for nominal categories
    # and ordinal encoder for ordinal categories
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), nominal_columns),
            ('num', num_pipeline, numerical_columns)
        ] + ordinal_transformers
    )

    param_grid = {
        'regressor__max_depth': max_depth,
        'regressor__min_samples_split': min_samples_splits,
        'regressor__min_samples_leaf': min_samples_leaf,
        'regressor__max_features': max_features
    }

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', DecisionTreeRegressor(random_state=random_state))
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    grid_search = GridSearchCV(
        model, 
        param_grid, 
        cv=10, 
        scoring='neg_root_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    y_pred = best_model.predict(X_test)

    if use_log_transform:
        # Inverse transform predictions and actual values
        y_pred = np.expm1(y_pred)
        y_test = np.expm1(y_test)
        
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Get feature names from model - needs to be handled differently with ordinal features
    feature_names = []
    
    # Add one-hot encoded nominal features
    if nominal_columns:
        feature_names.extend(
            best_model.named_steps['preprocessor']
            .named_transformers_['cat']
            .get_feature_names_out(nominal_columns)
        )
    
    # Add numerical features
    feature_names.extend(numerical_columns)
    
    # Add ordinal features
    feature_names.extend(ordinal_columns)
    
    # Extract feature importances
    importances = best_model.named_steps['regressor'].feature_importances_
    
    # Create a DataFrame for feature importances
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': importances,
        'type': ['Numerical' if col in numerical_columns else 
                 'Ordinal' if col in ordinal_columns else 
                 'Categorical' for col in feature_names]
    }).sort_values('importance', ascending=False)

    return {
        'model': best_model,
        'performance': {
            'root_mean_squared_error': rmse,
            'r2_score': r2
        },
        'best_params': best_params,
        'feature_importances': feature_importances,
        'train_data': (X_train, y_train),
        'test_data': (X_test, y_test)
    }

In [7]:
from sklearn.tree import plot_tree

In [None]:
# df = pd.read_csv('data/train_cleaned.csv')
df = pd.read_csv('data/train_rm_OL.csv')
# df = pd.read_csv('data/train_full_EDA.csv')
# trans_options = ['robust'] # this is the best preprocessing
trans_options = ['robust', 'standard', 'log', 'log+robust', 'log+standard']

best_r2 = float('-inf')
best_result = None

for trans in trans_options:
    results_tree = train_decision_tree_regression(
        df, target='SalePrice', min_samples_splits=[10, 20, 30], max_depth=[ 7, 9, 11, 13, 15, 20], num_transformer=trans
    )
    
    print(f'Results with preprocessing {trans}:')
    for metric, value in results_tree['performance'].items():
        print(f"{metric}: {value}")
    print(results_tree['best_params'])
    
    # Store the best model
    if results_tree['performance']['r2_score'] > best_r2:
        best_r2 = results_tree['performance']['r2_score']
        best_result = (trans, results_tree)

# Print the best model at the end and visualize it
if best_result:
    trans, results_tree = best_result
    best_model = results_tree['model'].named_steps['regressor']  # Extract regressor from pipeline
    preprocessor = results_tree['model'].named_steps['preprocessor']  # Extract preprocessor from pipeline
    
    # Get transformed feature names
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).drop('SalePrice', axis=1).columns
    feature_names = (
        list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns)) +
        list(numerical_columns)
    )

    print("Best model:")
    print(f"Preprocessing: {trans}")
    for metric, value in results_tree['performance'].items():
        print(f"{metric}: {value}")
    print(results_tree['best_params'])

    # Visualizing the tree
    plt.figure(figsize=(20, 10))
    plot_tree(best_model, filled=True, feature_names=feature_names)
    plt.show()
