In [1]:
import pandas as pd
from sklearn import datasets, linear_model, metrics
import numpy as np
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt


In [2]:
# Load the pre-processed train and test splits from disk.
# NOTE(review): later cells apply np.exp to SalePrice, so the target is
# presumably stored log-transformed in these files — confirm.
TRAIN_CSV = '../data/processed/train_data.csv'
TEST_CSV = '../data/processed/test_data.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# Separate the feature columns (X_*) from the SalePrice target (y_*) for each split.
X_train, y_train = train.drop(columns=['SalePrice']), train['SalePrice']
X_test, y_test = test.drop(columns=['SalePrice']), test['SalePrice']

In [None]:
# One-hot encode the categorical columns so the scikit-learn models get numeric input.
# Train and test are encoded separately, which can produce different dummy
# columns. The test frame is therefore re-aligned to the training columns:
# dummies absent from test are added and zero-filled, dummies unseen in
# training are dropped. Note that fillna(0) also zero-fills any NaN that was
# already present in the test features.
X_train_one_hot = pd.get_dummies(X_train, drop_first=True)
cols = list(X_train_one_hot.columns)
X_test_one_hot = (
    pd.get_dummies(X_test, drop_first=True)
    .reindex(columns=cols)
    .fillna(0)
)


In [None]:
# Linear regression on the one-hot encoded features.
# get_dummies(..., drop_first=True) already drops one level per categorical
# column, and that is what removes the collinearity between the dummies and the
# intercept. The intercept itself must stay in the model: with
# fit_intercept=False the dropped baseline levels have no constant term and the
# fit is forced through the origin.
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train_one_hot, y_train)

In [None]:
# Score the test features with the fitted linear model.
# The output is on the same scale as the y_train values the model was fitted on.
predictions_lm = reg.predict(X=X_test_one_hot)

In [None]:
# Apply the inverse (exp) transformation to bring the linear-model predictions
# back to the original price scale.
# The exponential is taken of a fresh reg.predict(...) result rather than of
# predictions_lm itself, so re-running this cell cannot exponentiate twice.
predictions_lm = np.exp(reg.predict(X_test_one_hot))

In [None]:
# RMSE of the linear model on the original price scale.
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated in scikit-learn 1.4 and removed in 1.6. This also
# matches how the gradient-boosted RMSE is computed further down.
rms = np.sqrt(mean_squared_error(np.exp(y_test), predictions_lm))
rms

In [None]:
# TensorFlow Decision Forests (TF-DF) models
#!pip install tensorflow_decision_forests

In [4]:
# Wrap both pandas frames as TensorFlow datasets for TF-DF, using SalePrice as
# the regression label in each.
_tfdf_dataset_kwargs = dict(label="SalePrice", task=tfdf.keras.Task.REGRESSION)
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train, **_tfdf_dataset_kwargs)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test, **_tfdf_dataset_kwargs)

In [None]:
# Random-search tuner for the TF-DF Random Forest: 20 trials drawn from the
# candidate values below.
rf_num_trees_options = [100, 200, 500]
rf_max_depth_options = [-1, 10, 30]
rf_min_examples_options = [2, 5, 10]

tuner_rf = tfdf.tuner.RandomSearch(num_trials=20)
tuner_rf.choice("num_trees", rf_num_trees_options)
tuner_rf.choice("max_depth", rf_max_depth_options)
tuner_rf.choice("min_examples", rf_min_examples_options)

In [None]:
# TF-DF Random Forest regressor whose hyper-parameters are chosen by tuner_rf.
model = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION, tuner=tuner_rf)
# compile() must actually be called: the bare attribute `model.compile` did
# nothing except echo the bound method. "mse" is a metric that is meaningful
# for a regression model.
model.compile(metrics=["mse"])

In [None]:
# Train (and tune) the Random Forest on the training dataset.
model.fit(x=train_ds)

In [None]:
# Tuning trials recorded while training; display the row flagged as best.
tuning_logs_rf = model.make_inspector().tuning_logs()
best_trial_mask_rf = tuning_logs_rf["best"]
tuning_logs_rf[best_trial_mask_rf].iloc[0]

In [None]:
# Evaluate the Random Forest on the test dataset.
# "accuracy" is a classification metric and is meaningless for a regression
# target, so mean squared error is reported instead. The value is on the scale
# of the SalePrice column as stored in the data.
model.compile(metrics=["mse"])
# return_dict=True labels each value with its metric name in the printed output.
print(model.evaluate(test_ds, return_dict=True))

In [None]:
# Show the evaluation stored in the trained model's inspector.
inspector = model.make_inspector()
inspector_evaluation = inspector.evaluation()
inspector_evaluation

In [None]:
# Note: this RMSE is measured on the log-transformed SalePrice, not on the original price scale!

In [None]:
# First step towards the RMSE on the original price scale: raw model
# predictions for the test dataset.
preds = model.predict(x=test_ds)

In [None]:
# Keep the test target for scoring and back-transform the Random Forest
# predictions with exp.
y = test['SalePrice']
predics = np.exp(preds)

In [None]:
# RMSE of the TF-DF Random Forest on the original price scale.
# Both sides must be back-transformed: the previous version compared exp(y)
# with the raw `preds` instead of the exponentiated `predics`.
# np.sqrt(MSE) is used because the `squared=False` keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rms = np.sqrt(mean_squared_error(np.exp(y), predics))
rms

In [None]:

# Next activity: explore every model class listed below and compare the RMSE of all of them.
available_tfdf_models = tfdf.keras.get_all_models()
available_tfdf_models


In [None]:
#GradientBoostedTreesModel - work in progress

In [None]:
# Random-search tuner for the Gradient Boosted Trees model: 10 trials drawn
# from the candidate values below.
gb_num_trees_options = [50, 100, 200, 500, 1000]
gb_shrinkage_options = [0.01, 0.05, 0.1, 0.3, 0.5]
gb_max_depth_options = [3, 4, 5, 6, 8, 10]
gb_min_examples_options = [2, 5, 10, 15]

tuner_gb = tfdf.tuner.RandomSearch(num_trials=10)
tuner_gb.choice("num_trees", gb_num_trees_options)
tuner_gb.choice("shrinkage", gb_shrinkage_options)
tuner_gb.choice("max_depth", gb_max_depth_options)
tuner_gb.choice("min_examples", gb_min_examples_options)



In [None]:
# Build the Gradient Boosted Trees regressor (tuned by tuner_gb) and train it
# on the training dataset.
gb_model_kwargs = {"task": tfdf.keras.Task.REGRESSION, "tuner": tuner_gb}
model_gb_regressor = tfdf.keras.GradientBoostedTreesModel(**gb_model_kwargs)

model_gb_regressor.fit(train_ds)
print("Model training complete.")

In [None]:
# Tuning trials recorded for the Gradient Boosted Trees model; display the row
# flagged as best.
tuning_logs_gb = model_gb_regressor.make_inspector().tuning_logs()
# The mask must come from this same frame. The previous `tuning_logs.best`
# referred to a name that does not exist at this point on a fresh kernel.
tuning_logs_gb[tuning_logs_gb.best].iloc[0]

In [None]:
# Predict on the test dataset, then undo the log transform on both the
# predictions and the true target so they can be compared in price units.
predictions_gb_log = model_gb_regressor.predict(test_ds)
y_pred_gb, y_true_gb = np.exp(predictions_gb_log), np.exp(y_test)

In [None]:
# Score the Gradient Boosted Trees model: RMSE is the square root of the MSE
# between the back-transformed targets and predictions.
mse_gb = mean_squared_error(y_true_gb, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
print(f"Root Mean Squared Error (RMSE): {rmse_gb:.2f}")

In [None]:
#Cart model

In [None]:
# Fit a CART regression model; only the task is specified, everything else is
# left at the library defaults.
model_cm_regression = tfdf.keras.CartModel(task=tfdf.keras.Task.REGRESSION)
model_cm_regression.fit(x=train_ds)

In [None]:
# Make predictions on the held-out test dataset.
# Predicting on train_ds would yield one row per training example, which can
# be neither scored nor plotted against the test targets used for every other
# model in this notebook.
predictions_log_cm = model_cm_regression.predict(test_ds)

# Reverse the log transformation (np.exp) to get back to the original price scale
predictions_original_scale_cm = np.exp(predictions_log_cm)


In [None]:
# RMSE of the CART model on the original price scale, scored on the test set.
# The targets are built from y_test here because `y_true` is not defined at
# this point on a fresh kernel. The CART predictions are taken on test_ds in
# this cell so they line up row-for-row with y_test.
cart_test_prices = np.exp(model_cm_regression.predict(test_ds)).ravel()
# np.sqrt(MSE) is used because the `squared=False` keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rms = np.sqrt(mean_squared_error(np.exp(y_test), cart_test_prices))
rms

In [None]:
#basic RF

In [None]:
# Baseline scikit-learn Random Forest on the one-hot encoded features:
# 100 trees, fixed random seed, out-of-bag scoring enabled.
regressor = RandomForestRegressor(oob_score=True, random_state=0, n_estimators=100)
regressor.fit(X=X_train_one_hot, y=y_train)

In [None]:
# Predict with the scikit-learn Random Forest and report the MSE on the
# original price scale (both sides are back-transformed with exp).
predictions_rf = regressor.predict(X_test_one_hot)
# Score the predictions computed just above; the previous `predictions` was an
# undefined name.
mse = mean_squared_error(np.exp(y_test), np.exp(predictions_rf))
print(f'Mean Squared Error: {mse}')

In [None]:
    # Predicted vs. actual sale price on the test set, one scatter series per model.
    # All series share the same x values: the test targets back-transformed with exp.
    actual_prices = np.exp(y_test)
    # CART is predicted on test_ds here so that its points are aligned with y_test.
    cart_test_prices = np.ravel(np.exp(model_cm_regression.predict(test_ds)))

    fig,ax = plt.subplots(figsize=(6,6))
    ax.set_title('House price predictions',fontsize=20)
    ax.set_ylabel('predicted',fontsize=12)
    ax.set_xlabel('actual',fontsize=12)
    ax.scatter(actual_prices, np.exp(predictions_rf),label='Random Forest')
    ax.scatter(actual_prices, cart_test_prices,label='Cart model')
    # y_pred_gb holds the back-transformed Gradient Boosted Trees predictions;
    # the previous `predictions_original_scale` was an undefined name.
    ax.scatter(actual_prices, np.ravel(y_pred_gb),label='GradientBoostedTreesModel')
    # predictions_lm is already on the original price scale at this point.
    ax.scatter(actual_prices, predictions_lm,label='Linear model')
    ax.legend()


In [20]:
# Lookup from a model name to the TF-DF class that implements it
models = dict(
    gradient_boosted_trees=tfdf.keras.GradientBoostedTreesModel,
    random_forest=tfdf.keras.RandomForestModel,
)

# Candidate values the random search samples from, per model and hyper-parameter
params = dict(
    gradient_boosted_trees=dict(
        num_trees=[50, 100, 200, 500, 1000],
        shrinkage=[0.01, 0.05, 0.1, 0.3, 0.5],
        max_depth=[4, 6, 8, 10],
        min_examples=[2, 5, 10, 15],
    ),
    random_forest=dict(
        num_trees=[100, 200, 500],
        max_depth=[-1, 10, 30],
        min_examples=[2, 5, 10],
    ),
)

# Number of random-search trials to run for each model
num_trials = dict(gradient_boosted_trees=10, random_forest=20)

# Collects the log lines produced while the models are trained
output_logs = list()

In [21]:
# Tune and train each configured model, then score it on the test set.
# The log list is emptied first so that re-running this cell does not append a
# second copy of every entry.
output_logs.clear()

# The true prices do not depend on the model, so back-transform them once
# instead of on every iteration.
y_true = np.exp(y_test)

# NOTE: `model`, `tuner` and `tuning_logs` are rebound on every iteration; after
# the loop they refer to the last model trained here.
for model_name, config in params.items():
    output_logs.append(f"Training and tuning {model_name}...")

    # Random search over this model's candidate values; 10 trials if the model
    # has no entry in num_trials.
    num_trial = num_trials.get(model_name, 10)
    tuner = tfdf.tuner.RandomSearch(num_trials=num_trial)
    for param, values in config.items():
        tuner.choice(param, values)

    model_class = models[model_name]  # TF-DF class registered under this name
    model = model_class(task=tfdf.keras.Task.REGRESSION, tuner=tuner)
    model.fit(train_ds)

    # Row of the tuning log that is flagged as best
    tuning_logs = model.make_inspector().tuning_logs()
    best_params = tuning_logs[tuning_logs.best].iloc[0]

    # RMSE on the original price scale: predictions are exponentiated back
    # before being compared with the exponentiated targets.
    predictions_log = model.predict(test_ds)
    y_pred = np.exp(predictions_log)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    output_logs.append(f"Best parameters for {model_name}: {best_params}")
    output_logs.append(f"Best RMSE for {model_name}: {rmse:.6f}")


Use /var/folders/dw/bq4phr0s4yz1_3vy2_gppjcm0000gp/T/tmplu8yr3q4 as temporary training directory
Reading training dataset...


2025-01-06 21:27:37.533819: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1840] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2025-01-06 21:27:37.533850: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1850] "goss_beta" set but "sampling_method" not equal to "GOSS".
2025-01-06 21:27:37.533855: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1864] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".


Training dataset read in 0:00:00.323039. Found 1165 examples.
Training model...


I0000 00:00:1736216857.863395 35735652 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1736216857.863412 35735652 kernel.cc:783] Collect training examples
I0000 00:00:1736216857.863418 35735652 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1736216857.863561 35735652 kernel.cc:401] Number of batches: 2
I0000 00:00:1736216857.863565 35735652 kernel.cc:402] Number of examples: 1165
I0000 00:00:1736216857.864672 35735652 data_spec_inference.cc:354] 1 item(s) have been pruned (i.e. they are considered out of dictionary) for the column BsmtCond (3 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
I0000 00:00:1736216857.864684 35735652 data_spec_inference.cc:354] 1 item(s) have be

Model trained in 0:00:04.972681
Compiling model...


I0000 00:00:1736216862.683145 35797271 early_stopping.cc:54] Early stop of the training because the validation loss does not decrease anymore. Best valid-loss: 0.160102
2025-01-06 21:27:42.683174: I external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:277] Truncates the model to 432 tree(s) i.e. 432  iteration(s).
2025-01-06 21:27:42.683377: I external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:339] Final model num-trees:432 valid-loss:0.160102 valid-rmse:0.160102
2025-01-06 21:27:42.693349: I external/ydf/yggdrasil_decision_forests/learner/hyperparameters_optimizer/hyperparameters_optimizer.cc:578] [9/10] Score: -0.160102 / -0.140887 HParams: fields { name: "num_trees" value { integer: 500 } } fields { name: "shrinkage" value { real: 0.01 } } fields { name: "max_depth" value { integer: 10 } } fields { name: "min_examples" value { integer: 10 } }
2025-01-06 21:27:42.693651: I external/ydf/yggdrasil

Model compiled.
Use /var/folders/dw/bq4phr0s4yz1_3vy2_gppjcm0000gp/T/tmpqkkn4nb2 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.307681. Found 1165 examples.
Training model...


I0000 00:00:1736216863.433683 35735652 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1736216863.433699 35735652 kernel.cc:783] Collect training examples
I0000 00:00:1736216863.433705 35735652 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1736216863.433835 35735652 kernel.cc:401] Number of batches: 2
I0000 00:00:1736216863.433840 35735652 kernel.cc:402] Number of examples: 1165
I0000 00:00:1736216863.434875 35735652 data_spec_inference.cc:354] 1 item(s) have been pruned (i.e. they are considered out of dictionary) for the column BsmtCond (3 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
I0000 00:00:1736216863.434886 35735652 data_spec_inference.cc:354] 1 item(s) have be

Model trained in 0:00:37.539297
Compiling model...


I0000 00:00:1736216900.937911 35735652 decision_forest.cc:761] Model loaded with 500 root(s), 448772 node(s), and 79 input feature(s).
I0000 00:00:1736216900.937938 35735652 abstract_model.cc:1404] Engine "RandomForestOptPred" built
2025-01-06 21:28:20.937951: I tensorflow_decision_forests/tensorflow/ops/inference/kernel.cc:1035] Use fast generic engine


Model compiled.


In [22]:
# Print the collected training log, one entry per line, in insertion order.
for log_entry in output_logs:
    print(log_entry)


Training and tuning gradient_boosted_trees...
Best parameters for gradient_boosted_trees: score             -0.140887
evaluation_time    1.378276
best                   True
num_trees               200
shrinkage               0.3
max_depth                 6
min_examples              5
Name: 6, dtype: object
Best RMSE for gradient_boosted_trees: 27632.958822
Training and tuning random_forest...
Best parameters for random_forest: score              -0.135769
evaluation_time    19.173921
best                    True
num_trees                500
max_depth                 30
min_examples               2
Name: 11, dtype: object
Best RMSE for random_forest: 30612.963812
