In [61]:
import pandas as pd
from sklearn import datasets, linear_model, metrics
import numpy as np
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [62]:
# Read the processed train and test splits in one pass (train first, then test).
train, test = map(
    pd.read_csv,
    ('../data/processed/train_data.csv', '../data/processed/test_data.csv'),
)

In [63]:
# Separate the model inputs (every column except the target) from the target column.
# Features first for both splits...
X_train = train.drop(columns=['SalePrice'])
X_test = test.drop(columns=['SalePrice'])

# ...then the matching targets.
y_train = train['SalePrice']
y_test = test['SalePrice']

In [None]:
# One-hot encoding: convert categorical columns to numeric ones for the linear model.
# The training frame defines the reference column layout; drop_first=True removes one
# level per categorical feature so the dummies are not collinear with the intercept.
X_train_one_hot = pd.get_dummies(X_train, drop_first=True)
cols = X_train_one_hot.columns.tolist()

# Encode the test frame WITHOUT drop_first. With drop_first=True pandas drops whichever
# level sorts first *in the test data*; if that is not the level dropped in train, a
# real training column would be removed here and then silently zero-filled by reindex.
# Encoding every level and aligning to the training columns avoids that: levels that
# train dropped (or never saw) are discarded, and columns missing from test become 0.
X_test_one_hot = pd.get_dummies(X_test)
X_test_one_hot = X_test_one_hot.reindex(columns=cols).fillna(0)


In [None]:
# Linear regression on the one-hot encoded features.
# The encoding step already uses drop_first=True, which removes the collinearity between
# the dummy columns and the intercept. Disabling the intercept as well would force the
# fit through the origin, so the intercept is kept (fit_intercept=True is the default,
# spelled out here for clarity).
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train_one_hot, y_train)

In [None]:
# Predict sale prices for the test set with the fitted linear model.
# The values are still on the transformed scale here; exp is applied in the next step.
predictions_lm = reg.predict(X_test_one_hot)

In [None]:
# Apply the inverse transformation (exp) to bring the predictions back to the original scale.
# NOTE(review): this overwrites predictions_lm in place, so re-running this cell without
# first re-running the predict cell applies exp twice.
predictions_lm = np.exp(predictions_lm)

In [None]:
# RMSE of the linear model on the original price scale.
# y_test is back-transformed with exp; predictions_lm was already back-transformed.
# The root is taken explicitly instead of passing squared=False, which newer
# scikit-learn releases deprecate and then remove; this also matches how the
# gradient-boosted model's RMSE is computed in this notebook.
rms = np.sqrt(mean_squared_error(np.exp(y_test), predictions_lm))
rms

In [None]:
#tensorflow decision trees
#!pip install tensorflow_decision_forests

In [64]:
# Turn both pandas frames into TensorFlow datasets for TF-DF.
# The label column and the task are identical for the two splits, so they are declared once.
ds_options = {"label": "SalePrice", "task": tfdf.keras.Task.REGRESSION}
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train, **ds_options)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test, **ds_options)

In [None]:
# TF-DF random forest configured for regression.
model = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION)
# Actually call compile: a bare `model.compile` only references the method and does
# nothing. Mean squared error is a metric that is meaningful for a regression task.
model.compile(metrics=["mse"])

In [None]:
# Train the TF-DF random forest on the training dataset.
model.fit(train_ds)

In [None]:
# Evaluate the random forest on the test dataset.
# "accuracy" is a classification metric and carries no meaning for a continuous target,
# so mean squared error is reported instead. return_dict=True labels each value.
# The target here is still on the log scale, so the reported error is too.
model.compile(metrics=["mse"])
print(model.evaluate(test_ds, return_dict=True))

In [None]:
# Build an inspector for the trained model and display the evaluation it stores.
# NOTE(review): presumably this is the model's own training-time evaluation rather than
# a test-set score — confirm against the TF-DF inspector documentation.
inspector = model.make_inspector()
inspector.evaluation()

In [None]:
#note: this RMSE is on the logarithmic data, not the actual data!

In [None]:
# First step towards the RMSE on the actual price scale: predict on the test dataset.
# The predictions are still on the log scale here; exp is applied in a later cell.
preds = model.predict(test_ds)

In [None]:
# Test-set target column (still on the log scale).
y = test['SalePrice']
# Random-forest predictions mapped back from the log scale with exp.
predics = np.exp(preds)

In [None]:
# RMSE of the TF-DF random forest on the original price scale.
# Both sides must be back-transformed with exp: comparing exp(y) against the raw
# log-scale `preds` would measure the distance between prices and log-prices.
# The root is taken explicitly because squared=False is deprecated and then removed
# in newer scikit-learn releases.
rms = np.sqrt(mean_squared_error(np.exp(y), np.exp(preds)))
rms

In [None]:

# Next activity: explore all of these methods and compare the RMSE of each.

tfdf.keras.get_all_models()


In [None]:
#GradientBoostedTreesModel - work in progress

In [65]:
# Initialize a TF-DF Gradient Boosted Trees model for regression and train it on the
# training dataset. The print is only a completion marker for the cell.
model_gb_regressor = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION)
model_gb_regressor.fit(train_ds)
print("Model training complete.")

Use /var/folders/dw/bq4phr0s4yz1_3vy2_gppjcm0000gp/T/tmpr1zqwhjm as temporary training directory
Reading training dataset...


2024-12-31 12:26:13.967031: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1840] "goss_alpha" set but "sampling_method" not equal to "GOSS".
2024-12-31 12:26:13.967058: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1850] "goss_beta" set but "sampling_method" not equal to "GOSS".
2024-12-31 12:26:13.967065: W external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1864] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".


Training dataset read in 0:00:00.351699. Found 1165 examples.
Training model...


I0000 00:00:1735665974.327336 32980817 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1735665974.327347 32980817 kernel.cc:783] Collect training examples
I0000 00:00:1735665974.327351 32980817 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1735665974.327477 32980817 kernel.cc:401] Number of batches: 2
I0000 00:00:1735665974.327480 32980817 kernel.cc:402] Number of examples: 1165
I0000 00:00:1735665974.328529 32980817 data_spec_inference.cc:354] 1 item(s) have been pruned (i.e. they are considered out of dictionary) for the column BsmtCond (3 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
I0000 00:00:1735665974.328539 32980817 data_spec_inference.cc:354] 1 item(s) have be

Model trained in 0:00:00.945945
Compiling model...


I0000 00:00:1735665975.245539 33084703 early_stopping.cc:54] Early stop of the training because the validation loss does not decrease anymore. Best valid-loss: 0.155743
2024-12-31 12:26:15.245570: I external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:1640] Create final snapshot of the model at iteration 76
2024-12-31 12:26:15.247460: I external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:277] Truncates the model to 47 tree(s) i.e. 47  iteration(s).
2024-12-31 12:26:15.247544: I external/ydf/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.cc:339] Final model num-trees:47 valid-loss:0.155743 valid-rmse:0.155743
I0000 00:00:1735665975.248243 33084703 kernel.cc:926] Export model in log directory: /var/folders/dw/bq4phr0s4yz1_3vy2_gppjcm0000gp/T/tmpr1zqwhjm with prefix caaf8148937b42d4
I0000 00:00:1735665975.249595 33084703 kernel.cc:944] Save model in resources
I0000 00

Model compiled.
Model training complete.


In [68]:
# Ground truth on the original price scale (undo the log transform of the target).
y_true_gb = np.exp(y_test)

# Gradient-boosted predictions for the test dataset, still on the log scale...
predictions_gb_log = model_gb_regressor.predict(test_ds)

# ...and the same predictions mapped back to the original price scale.
y_pred_gb = np.exp(predictions_gb_log)



In [69]:
# Score the gradient-boosted model: mean squared error first, then its square root.
mse_gb = mean_squared_error(y_true_gb, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
print(f"Root Mean Squared Error (RMSE): {rmse_gb:.2f}")

Root Mean Squared Error (RMSE): 27005.18


In [None]:
#Cart model

In [None]:
# Create a TF-DF CART model for regression and train it on the training dataset.
model_cm_regression = tfdf.keras.CartModel(task=tfdf.keras.Task.REGRESSION)

model_cm_regression.fit(train_ds)

In [None]:
# Make predictions on the held-out test set. Predicting on train_ds would give one value
# per training row, which can be neither scored nor plotted against the test-set prices
# that the rest of the notebook uses as ground truth.
predictions_log_cm = model_cm_regression.predict(test_ds)

# Reverse the log transformation (np.exp) to get back to the original price scale.
predictions_original_scale_cm = np.exp(predictions_log_cm)


In [None]:
# RMSE of the CART model on the original price scale.
# The ground truth is derived from y_test directly; `y_true` is not defined by any cell
# in this notebook and only existed as leftover kernel state. Because the truth is the
# test target, predictions_original_scale_cm must hold test-set predictions.
# The root is taken explicitly because squared=False is deprecated and then removed
# in newer scikit-learn releases.
rms = np.sqrt(mean_squared_error(np.exp(y_test), predictions_original_scale_cm))
rms

In [None]:
#basic RF

In [None]:
# scikit-learn random forest baseline: 100 trees with a fixed random_state so the fit is
# repeatable, trained on the one-hot encoded features.
# NOTE(review): oob_score=True is requested, but no visible cell reads the result.
regressor = RandomForestRegressor(n_estimators=100, random_state=0, oob_score=True)
regressor.fit(X_train_one_hot, y_train)

In [None]:
# Predict on the one-hot encoded test features with the scikit-learn random forest.
predictions_rf = regressor.predict(X_test_one_hot)
# Score on the original price scale. The predictions just computed are predictions_rf;
# the name `predictions` is not defined anywhere in this notebook.
# Note this is the MSE (not the RMSE reported for the other models).
mse = mean_squared_error(np.exp(y_test), np.exp(predictions_rf))
print(f'Mean Squared Error: {mse}')

In [None]:
    # Predicted vs. actual sale prices for every model, on the original price scale.
    # All series share the same x values: the test-set target back-transformed with exp.
    # The previously used `y_true` and `predictions_original_scale` are not defined by
    # any cell in this notebook; the gradient-boosted predictions live in `y_pred_gb`.
    # Every prediction array must therefore hold test-set predictions.
    actual_prices = np.exp(y_test)

    fig,ax = plt.subplots(figsize=(6,6))
    ax.set_title('House price predictions',fontsize=20)
    ax.set_ylabel('predicted',fontsize=12)
    ax.set_xlabel('actual',fontsize=12)
    ax.scatter(actual_prices, np.exp(predictions_rf),label='Random Forest')
    ax.scatter(actual_prices, predictions_original_scale_cm,label='Cart model')
    ax.scatter(actual_prices, y_pred_gb,label='GradientBoostedTreesModel')
    ax.scatter(actual_prices, predictions_lm,label='Linear model')
    ax.legend()
    # Render the figure explicitly so the cell output is the plot, not the Legend repr.
    plt.show()
