In [None]:

# for numerical computing
import numpy as np

# for dataframes
import pandas as pd

In [2]:
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso  # Linear Regression + L1 regularization
from sklearn.linear_model import Ridge  # Linear Regression + L2 regularization
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR # Support Vector Regressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Ignore Warnings
# NOTE(review): with no category/module filter this suppresses every warning
# for the rest of the session, so library deprecation notices are hidden too.
import warnings
warnings.filterwarnings("ignore")


In [None]:
#import xgboost (this is Gradient Boost ML Model)
import os

# Machine-specific MinGW-w64 install directory.
# NOTE(review): presumably required so the xgboost import below can locate its
# runtime DLLs on Windows -- confirm and adjust to the local install.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-7.2.0-posix-seh-rt_v5-rev0\\mingw64\\bin'

# Prepend only when the directory really exists and is not already listed, so that
# (a) non-Windows / differently-configured machines keep a clean PATH,
# (b) re-running this cell does not keep growing PATH, and
# (c) the platform's own separator is used instead of a hard-coded ';'.
_current_path = os.environ.get('PATH', '')
if os.path.isdir(mingw_path) and mingw_path not in _current_path.split(os.pathsep):
    os.environ['PATH'] = mingw_path + os.pathsep + _current_path
from xgboost import XGBRegressor
from xgboost import plot_importance  # to plot feature importance

In [None]:
# Data preparation: load the raw dataset into a DataFrame.
# TODO: replace the placeholder filename below with the project's real CSV
# (or switch to pd.read_json if the data ends up being JSON).

df = pd.read_csv("__________.csv")

In [None]:
# Sanity check: (rows, columns) of the frame that was just loaded.
print(df.shape)

In [None]:
# Separate the prediction target from the input features.
# TODO: update the target column to whatever this project predicts
# (e.g. lat/long, radius, or price).

# Target vector
y = df['tx_price']

# Feature matrix: every column except the target
X = df.drop(columns='tx_price')

In [None]:

# 80/20 train/test split; the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)



In [None]:
# Verify the row/column counts of the four pieces produced by the split.

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


In [None]:
# Data standardization: the per-column statistics are taken from the training
# split only, and are reused later to scale the test split.

train_mean, train_std = X_train.mean(), X_train.std()

In [None]:
# Standardize the training features column by column.
# NOTE: X_train is overwritten, so running this cell a second time would scale
# the already-scaled data again with the same train_mean / train_std.

X_train = X_train.sub(train_mean).div(train_std)

In [None]:
# Check for mean and std dev. of the standardized training features
# (expected to be approximately 0 and 1 per column).
X_train.describe()

In [None]:

# Note: the test split is scaled with train_mean and train_std (the training
# statistics), never with statistics computed from the test data itself.
X_test = X_test.sub(train_mean).div(train_std)



In [None]:
# Check for mean and std dev. - not exactly 0 and 1, because the test split was
# scaled with the training statistics rather than its own.
X_test.describe()

## Model 1 - Baseline Model

In [None]:
## Baseline "model": predict the mean training target for every training row
y_train_pred = np.full(y_train.shape[0], y_train.mean(), dtype=float)


In [None]:
## Predict Test results
# The baseline predicts the *training* mean for every test row; the test
# targets are only used for scoring, never for producing the prediction.
y_pred = np.ones(y_test.shape[0])*y_train.mean()

In [None]:
# Score the constant baseline against the training targets.
print("Train Results for Baseline Model:", "*******************************", sep="\n")
print(
    "Root mean squared error: ",
    sqrt(mse(y_train.values, y_train_pred)),
)
print(
    "R-squared: ",
    r2_score(y_train.values, y_train_pred),
)
print(
    "Mean Absolute Error: ",
    mae(y_train.values, y_train_pred),
)


In [None]:
# Score the constant baseline against the held-out test targets.
# Header says "Test Results" so it is distinguishable from the train report
# and matches the headers used for the other models.
print("Test Results for Baseline Model:")
print("*******************************")
print("Root mean squared error: ", sqrt(mse(y_test, y_pred)))
print("R-squared: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mae(y_test, y_pred))

## Model 2 - Random Forest Regression

In [None]:
## Reference for random search on random forest
## https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Candidate values sampled by the random search (5 * 3 * 3 = 45 combinations, 20 are tried).
tuned_params = {'n_estimators': [100, 200, 300, 400, 500], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
# Seed both the candidate sampler and the forest itself so that a
# Restart & Run All picks the same candidates and grows the same trees.
model = RandomizedSearchCV(
    RandomForestRegressor(random_state=1234),
    tuned_params,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    random_state=1234,
)
model.fit(X_train, y_train)
## This takes around 15 minutes

In [None]:
# Display the best random-forest configuration found by the search above.
model.best_estimator_

In [None]:
## Predict Train results
# Uses the fitted random-forest search object; overwrites the baseline's y_train_pred.
y_train_pred = model.predict(X_train)

In [None]:
## Predict Test results
# Uses the fitted random-forest search object; overwrites the baseline's y_pred.
y_pred = model.predict(X_test)

In [None]:
# Random forest performance on the data it was trained on.
print("Train Results for Random Forest Regression:", "*******************************", sep="\n")
print(
    "Root mean squared error: ",
    sqrt(mse(y_train.values, y_train_pred)),
)
print(
    "R-squared: ",
    r2_score(y_train.values, y_train_pred),
)
print(
    "Mean Absolute Error: ",
    mae(y_train.values, y_train_pred),
)

In [None]:
# Random forest performance on the held-out test split.
print("Test Results for Random Forest Regression:", "*******************************", sep="\n")
print(
    "Root mean squared error: ",
    sqrt(mse(y_test, y_pred)),
)
print(
    "R-squared: ",
    r2_score(y_test, y_pred),
)
print(
    "Mean Absolute Error: ",
    mae(y_test, y_pred),
)

In [None]:

# RFR Feature Importance, do we need this section?
## Building the model again with the best hyperparameters
# NOTE(review): the hyperparameters below are hard-coded, presumably copied from
# an earlier search run -- confirm they still match the best_estimator_ shown above.
# random_state is fixed so the importances are identical on every run.
model = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=1234,
)
model.fit(X_train, y_train)

In [None]:
# Optional section: list the feature names from most to least important.
# Negating the importances makes argsort return a descending order.
ranking = np.argsort(-model.feature_importances_)
print("The features in order of importance are:")
print(50*'-')
for column_name in X.columns[ranking]:
    print(column_name)


## Model 3 - XGBoost Regression (Gradient Boost)

In [None]:
## Reference for random search on xgboost
## https://gist.github.com/wrwr/3f6b66bf4ee01bf48be965f60d14454d
# Candidate values sampled by the random search.
tuned_params = {'max_depth': [1, 2, 3, 4, 5], 'learning_rate': [0.01, 0.05, 0.1], 'n_estimators': [100, 200, 300, 400, 500], 'reg_lambda': [0.001, 0.1, 1.0, 10.0, 100.0]}
# random_state fixes which 20 candidates are drawn, so a re-run evaluates the
# same configurations and reports the same winner.
model = RandomizedSearchCV(
    XGBRegressor(),
    tuned_params,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    random_state=1234,
)
model.fit(X_train, y_train)

In [None]:
# Display the best XGBoost configuration found by the search above.
model.best_estimator_


In [None]:
## Predict Train results
# Uses the fitted XGBoost search object; overwrites the previous model's y_train_pred.
y_train_pred = model.predict(X_train)


In [None]:
## Predict Test results
# Uses the fitted XGBoost search object; overwrites the previous model's y_pred.
y_pred = model.predict(X_test)


In [None]:

# XGBoost performance on the data it was trained on.
print("Train Results for XGBoost Regression:")
print("*******************************")
print("Root mean squared error: ", sqrt(mse(y_train.values, y_train_pred)))
# r2_score is the scorer used by every other report in this notebook;
# the previous name `rs` was never defined and raised NameError.
print("R-squared: ", r2_score(y_train.values, y_train_pred))
print("Mean Absolute Error: ", mae(y_train.values, y_train_pred))


In [None]:

# XGBoost performance on the held-out test split.
print("Test Results for XGBoost Regression:", "*******************************", sep="\n")
print(
    "Root mean squared error: ",
    sqrt(mse(y_test, y_pred)),
)
print(
    "R-squared: ",
    r2_score(y_test, y_pred),
)
print(
    "Mean Absolute Error: ",
    mae(y_test, y_pred),
)


In [None]:
# Optional feature-importance section (may be dropped later).

## Fit a stand-alone XGBoost regressor with explicitly chosen hyperparameters
# NOTE(review): values are hard-coded, presumably taken from an earlier search run.
model = XGBRegressor(
    max_depth=2,
    learning_rate=0.05,
    n_estimators=400,
    reg_lambda=0.001,
)
model.fit(X_train, y_train)



In [None]:
## Wrapper that adds a figsize parameter to xgboost's importance plot
## Reference: https://stackoverflow.com/questions/40081888/xgboost-plot-importance-figure-size
def my_plot_importance(booster, figsize, **kwargs):
    """Draw xgboost's feature-importance plot on a new figure of a given size.

    Args:
        booster: Model handed to ``plot_importance`` as its ``booster`` argument.
        figsize: Value forwarded to ``plt.subplots`` as ``figsize``.
        **kwargs: Extra keyword arguments passed through to ``plot_importance``.

    Returns:
        Whatever ``plot_importance`` returns.
    """
    from matplotlib import pyplot as plt
    from xgboost import plot_importance

    # Create the axes ourselves so the caller controls the figure size;
    # only the axes object is needed afterwards.
    _, axes = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes, **kwargs)

In [None]:
#not sure if we want a feature importance horizontal bar plot
# Plots importances of the XGBoost model fitted above on a figure created with figsize=(10, 10).
my_plot_importance(model, (10,10))

