In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn import datasets, linear_model, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the processed train/test splits.
# keep_default_na=False: strings such as "NA" are kept as literal values
# instead of being parsed as NaN.
train, test = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in (
        '../data/processed/train_data.csv',
        '../data/processed/test_data.csv',
    )
)

In [None]:
# Separate the feature columns from the SalePrice target for both splits.
X_train, y_train = train.drop(columns=['SalePrice']), train['SalePrice']
X_test, y_test = test.drop(columns=['SalePrice']), test['SalePrice']

In [None]:
# One-hot encode the categorical columns so the sklearn models receive numeric input.
# drop_first=True is applied to the TRAINING frame only: it removes one reference
# level per categorical column.
X_train_one_hot = pd.get_dummies(X_train, drop_first=True)
cols = X_train_one_hot.columns.tolist()

# The test frame is encoded with every level kept and is then aligned to the
# training columns. Encoding it with drop_first=True would drop the first level
# *present in the test split*, which is not necessarily the training reference
# level; reindex would then re-create that column as all zeros and those rows
# would silently be treated as the reference level.
# reindex discards columns the training frame does not have and creates the
# training columns missing from the test split, filled with 0.
X_test_one_hot = pd.get_dummies(X_test).reindex(columns=cols, fill_value=0)

In [None]:
#Linear regression

In [None]:
# Ordinary least squares on the one-hot encoded features.
# The intercept is fitted (sklearn's default): the dummy-variable collinearity is
# already removed by drop_first=True in the encoding cell, so disabling the
# intercept is not needed. Without it the fit is forced through the origin and
# depends on which category level happened to be dropped as the reference.
reg = linear_model.LinearRegression()
reg.fit(X_train_one_hot, y_train)

In [None]:
# Predict the target for the held-out test features with the fitted linear model.
# NOTE(review): the next cell applies np.exp to these values, so the target is
# presumably log(SalePrice) — confirm against the preprocessing step.
predictions_lm = reg.predict(X_test_one_hot)

In [None]:
# Apply the inverse transformation to bring predictions back to the original scale.
# Computed from a fresh reg.predict(...) call rather than by overwriting
# predictions_lm with np.exp(predictions_lm), so re-running this cell cannot
# exponentiate the values twice.
predictions_lm = np.exp(reg.predict(X_test_one_hot))

In [None]:
# RMSE of the linear model; np.exp is applied to y_test so that both arguments
# are on the same (exponentiated) scale as predictions_lm.
rmse_lm = np.sqrt(
    mean_squared_error(y_true=np.exp(y_test), y_pred=predictions_lm)
)
print(f'Root Mean Squared Error for Linear Regression Model: {rmse_lm}')

In [None]:
# TensorFlow Decision Forests (TF-DF) models
# Show the model classes that this TF-DF installation provides.
tfdf.keras.get_all_models()

In [None]:
# Wrap both pandas frames as TensorFlow datasets for TF-DF, with SalePrice as
# the regression label.
train_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(
        frame,
        label="SalePrice",
        task=tfdf.keras.Task.REGRESSION,
    )
    for frame in (train, test)
)

In [None]:
#RandomForestModel

In [None]:
# Random-search tuner for the TF-DF random forest: 20 trials drawn from the
# candidate values listed below.
tuner_rf = tfdf.tuner.RandomSearch(num_trials=20)

for param, values in (
    ("num_trees", [100, 200, 500]),
    ("max_depth", [-1, 10, 30]),
    ("min_examples", [2, 5, 10]),
):
    tuner_rf.choice(param, values)

In [None]:
# TF-DF random forest for regression; hyper-parameters are chosen by tuner_rf.
model_rf = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION, tuner=tuner_rf)
# compile must be *called* to take effect (a bare `model_rf.compile` only
# references the method). MSE is registered as a regression metric.
model_rf.compile(metrics=["mse"])

In [None]:
# Train the random forest on the training dataset; the tuner attached to the
# model is passed at construction time, so tuning happens as part of this call.
model_rf.fit(train_ds)

In [None]:
# Table of tuning trials recorded by the random forest model.
tuning_logs_rf = model_rf.make_inspector().tuning_logs()
# Show the first row flagged as best, i.e. the selected hyper-parameters.
tuning_logs_rf.loc[tuning_logs_rf["best"]].iloc[0]

In [None]:
# Evaluate on the held-out test set with a regression metric. "accuracy" is a
# classification metric and carries no meaning for a continuous target.
# The reported values are on the scale of the label stored in test_ds.
model_rf.compile(metrics=["mse"])
# return_dict=True labels each reported value with its metric name.
print(model_rf.evaluate(test_ds, return_dict=True))

In [None]:
# Display the evaluation stored in the trained model itself.
# NOTE(review): for a random forest this is presumably the out-of-bag
# evaluation computed during training — confirm in the TF-DF inspector docs.
inspector = model_rf.make_inspector()
inspector.evaluation()

In [None]:
# Note: this RMSE is computed on the log-transformed target, not on the original price scale.

In [None]:
# Step 1 of computing the RMSE on the original scale: predict on the test
# dataset. The values are on the scale of the training label (np.exp is applied
# to them in the next cell).
predictions_rf_log = model_rf.predict(test_ds)

In [None]:
# Step 2: undo the log transformation of the predictions.
predictions_rf=np.exp(predictions_rf_log)

In [None]:
# RMSE of the TF-DF random forest, with both arguments exponentiated.
rmse_rf = np.sqrt(
    mean_squared_error(y_true=np.exp(y_test), y_pred=predictions_rf)
)
print(f'Root Mean Squared Error for TF-DF Random Forest: {rmse_rf}')

In [None]:
#GradientBoostedTreesModel

In [None]:
# Random-search tuner for the gradient boosted trees: 10 trials drawn from the
# candidate values listed below.
tuner_gb = tfdf.tuner.RandomSearch(num_trials=10)

for param, values in (
    ("num_trees", [50, 100, 200, 500, 1000]),
    ("shrinkage", [0.01, 0.05, 0.1, 0.3, 0.5]),
    ("max_depth", [3, 4, 5, 6, 8, 10]),
    ("min_examples", [2, 5, 10, 15]),
):
    tuner_gb.choice(param, values)

In [None]:
# Gradient boosted trees regressor whose hyper-parameters are picked by tuner_gb,
# trained on the training dataset.
model_gb_regressor = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, tuner=tuner_gb)
model_gb_regressor.fit(train_ds)

print("Model training complete.")

In [None]:
# Table of tuning trials recorded by the gradient boosted trees model.
tuning_logs_gb = model_gb_regressor.make_inspector().tuning_logs()
# Show the first row flagged as best, i.e. the selected hyper-parameters.
tuning_logs_gb.loc[tuning_logs_gb["best"]].iloc[0]

In [None]:
# Make predictions on the test dataset; the outputs are on the scale of the
# training label.
predictions_gb_log = model_gb_regressor.predict(test_ds)

# Reverse the log transformation for the predictions. The true values are
# exponentiated where the RMSE is computed (np.exp(y_test)), so no separate
# copy of the target is created here.
predictions_gb = np.exp(predictions_gb_log)

In [None]:
# RMSE of the TF-DF gradient boosted trees, with both arguments exponentiated.
rmse_gb = np.sqrt(
    mean_squared_error(y_true=np.exp(y_test), y_pred=predictions_gb)
)
print(f'Root Mean Squared Error for TF-DF Gradient Boosted Trees: {rmse_gb}')

In [None]:
#CartModel

In [None]:
# Train a single-tree CART regressor on the training dataset.
# No tuner is passed, so the model's default hyper-parameters are used.
model_cm_regression = tfdf.keras.CartModel(task=tfdf.keras.Task.REGRESSION)
model_cm_regression.fit(train_ds)

In [None]:
# Make predictions on the held-out test dataset (test_ds).
predictions_log_cm = model_cm_regression.predict(test_ds)

# Reverse the log transformation (np.exp) to get the original scale.
predictions_cm = np.exp(predictions_log_cm)

In [None]:
# RMSE of the TF-DF CART model, with both arguments exponentiated.
rmse_cm = np.sqrt(
    mean_squared_error(y_true=np.exp(y_test), y_pred=predictions_cm)
)
print(f'Root Mean Squared Error for TF-DF Cart Model: {rmse_cm}')

In [None]:
#Basic sklearn Random Forest

In [None]:
# Baseline sklearn random forest on the one-hot encoded features:
# 100 trees and a fixed random_state so repeated runs give the same forest.
# oob_score=True requests the out-of-bag score at fit time; no cell shown here
# reads it afterwards.
regressor = RandomForestRegressor(n_estimators=100, random_state=0, oob_score=True)
regressor.fit(X_train_one_hot, y_train)

In [None]:
# Predict on the encoded test features, then undo the log transformation
# with np.exp.
predictions_regressor_log = regressor.predict(X_test_one_hot)
predictions_regressor = np.exp(predictions_regressor_log)

In [None]:
# Test-set error of the sklearn random forest, with both arguments exponentiated.
# The RMSE is reported (not just the MSE) so the number is directly comparable
# with the RMSE values printed for the other models.
mse_rf = mean_squared_error(np.exp(y_test), predictions_regressor)
rmse_sklearn_rf = np.sqrt(mse_rf)
print(f'Root Mean Squared Error for sklearn Random Forest: {rmse_sklearn_rf}')

In [None]:
    # Predicted vs. actual values for every fitted model on one set of axes.
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_title('House price predictions', fontsize=20)
    ax.set_xlabel('actual', fontsize=12)
    ax.set_ylabel('predicted', fontsize=12)
    # The plotting order is kept so each model keeps its colour-cycle position.
    for model_predictions, legend_label in (
        (predictions_regressor, 'Random Forest'),
        (predictions_rf, 'TF-DF random forest'),
        (predictions_cm, 'Cart model'),
        (predictions_gb, 'GradientBoostedTreesModel'),
        (predictions_lm, 'Linear model'),
    ):
        ax.scatter(np.exp(y_test), model_predictions, label=legend_label)
    ax.legend()


In [None]:
# Configuration for the tuning loop in the next cell. The three dictionaries
# below share the same keys (one per model family).

# Map model names to their corresponding classes
models = {
    "gradient_boosted_trees": tfdf.keras.GradientBoostedTreesModel,
    "random_forest": tfdf.keras.RandomForestModel
}

# Define model hyperparameter configurations:
# hyper-parameter name -> list of candidate values offered to the tuner.
params = {
    "gradient_boosted_trees": {
        "num_trees": [50, 100, 200, 500, 1000],
        "shrinkage": [0.01, 0.05, 0.1, 0.3, 0.5],
        "max_depth": [3, 4, 5, 6, 8, 10],
        "min_examples": [2, 5, 10, 15]
    },
    "random_forest": {
        "num_trees": [100, 200, 500],
        "max_depth": [-1, 10, 30],
        "min_examples": [2, 5, 10]
    }
}

# Number of random-search trials per model family.
num_trials = {
    "gradient_boosted_trees": 10,
    "random_forest": 20
}

# Placeholder to store model output (log lines appended by the tuning loop)
output_logs = []

In [None]:
# Tune, fit and score every model family configured in `params`.
for model_name in params:
    config = params[model_name]
    output_logs.append(f"Training and tuning {model_name}...")

    # Random search; falls back to 10 trials when the model has no entry.
    num_trial = num_trials.get(model_name, 10)
    tuner = tfdf.tuner.RandomSearch(num_trials=num_trial)

    # Register every hyper-parameter and its candidate values with the tuner.
    for param, values in config.items():
        tuner.choice(param, values)

    # Instantiate the matching TF-DF class and train it on the training dataset.
    model_class = models[model_name]
    model = model_class(task=tfdf.keras.Task.REGRESSION, tuner=tuner)
    model.fit(train_ds)

    # First tuning-log row flagged as best.
    tuning_logs = model.make_inspector().tuning_logs()
    best_params = tuning_logs.loc[tuning_logs["best"]].iloc[0]

    # Score on the test set after exponentiating predictions and targets.
    predictions_log = model.predict(test_ds)
    y_pred = np.exp(predictions_log)
    y_true = np.exp(y_test)
    rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

    output_logs.extend([
        f"Best parameters for {model_name}: {best_params}",
        f"Best RMSE for {model_name}: {rmse:.6f}",
    ])

In [None]:
# Print the messages collected in output_logs, one per line.
for log in output_logs:
    print(log)

In [None]:
#gradient boosting has the lowest RMSE. That will be used for the final model. 

In [None]:
# Stack the train and test DataFrames (the pandas frames, not the TF datasets
# train_ds/test_ds) so the final model can be fitted on all labelled rows.
total_ds=pd.concat([train,test])

In [None]:
# Build the TF-DF dataset for the final fit from a fresh concatenation of the
# train and test frames. The cell no longer reads the `total_ds` name that it
# also overwrites, so it can be re-run without feeding a TF dataset back into
# pd_dataframe_to_tf_dataset.
total_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    pd.concat([train, test]),
    label="SalePrice",
    task=tfdf.keras.Task.REGRESSION,
)


In [None]:
# Fresh random-search tuner for the final gradient boosted trees model:
# 10 trials drawn from the candidate values listed below.
tuner_gb = tfdf.tuner.RandomSearch(num_trials=10)

for param, values in (
    ("num_trees", [50, 100, 200, 500, 1000]),
    ("shrinkage", [0.01, 0.05, 0.1, 0.3, 0.5]),
    ("max_depth", [3, 4, 5, 6, 8, 10]),
    ("min_examples", [2, 5, 10, 15]),
):
    tuner_gb.choice(param, values)

In [None]:
# Final gradient boosted trees regressor, tuned by tuner_gb and trained on the
# combined dataset (total_ds).
model_gb_regressor = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, tuner=tuner_gb)
model_gb_regressor.fit(total_ds)

print("Model training complete.")

In [None]:
# Table of tuning trials recorded by the final gradient boosted trees model.
tuning_logs_gb = model_gb_regressor.make_inspector().tuning_logs()
# Show the first row flagged as best, i.e. the selected hyper-parameters.
tuning_logs_gb.loc[tuning_logs_gb["best"]].iloc[0]

In [None]:
# Load the raw test file that is used below to produce the submission.
# NOTE(review): this frame is read from ../data/raw with pandas' default NA
# parsing, whereas the training frames come from ../data/processed with
# keep_default_na=False — confirm that the raw file goes through the same
# feature preparation the model was trained on.
testing = pd.read_csv('../data/raw/test.csv')

In [None]:
# Replace missing values in the numeric columns below with the column median,
# then cast the column to int. A single loop replaces nine copy-pasted cells
# that differed only in the column name.
# NOTE(review): the medians are computed from the submission data itself, not
# from the training data — confirm this is intended.
for column in (
    'LotFrontage',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageCars',
    'GarageArea',
):
    median = testing[column].median()
    testing[column] = testing[column].fillna(median).astype(int)

In [None]:
# Convert the submission frame to a TensorFlow dataset. No label argument is
# passed, so the dataset carries features only and is used for prediction.
testing_tf = tfdf.keras.pd_dataframe_to_tf_dataset(testing, task = tfdf.keras.Task.REGRESSION)

In [None]:
# Make predictions on the submission dataset with the final model
predictions_gb_log = model_gb_regressor.predict(testing_tf).flatten() #convert to 1D for submission

# Reverse log transformation for predictions
# (this reuses the name predictions_gb from the earlier test-set evaluation)
predictions_gb = np.exp(predictions_gb_log)


In [None]:
# Display the predicted values as a quick sanity check before export.
predictions_gb

In [None]:
# Two-column submission table: the Id column of the raw test frame plus the
# predicted SalePrice for the same rows.
submission = testing[["Id"]].assign(SalePrice=predictions_gb)

In [None]:
# Make sure the output directory exists so the export cannot fail at the very
# end of the run, then write the submission without the index column.
os.makedirs('../data/results', exist_ok=True)
submission.to_csv('../data/results/predictions_gb.csv', index=False)