In [None]:
# Dependencies and Setup
%matplotlib inline
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sqlalchemy import create_engine
from sqlalchemy import inspect

import config

# Single LabelEncoder instance; the encoding cell further down refits it for each object column
label_encoder = preprocessing.LabelEncoder()

In [None]:
# Connect to the RDS database. The connection string is read from the local
# config module so that no personal data is written into the notebook.

rds_connection_string = str(config.connection_string)
engine = create_engine(rds_connection_string)

# Inspector bound to the engine (not used by the other cells visible in this notebook)
insp = inspect(engine)

In [None]:
# Load the rental listings from the database and preview them.
# The query keeps only rows where bathroom <= bhk + 1
# (presumably to drop implausible listings - confirm with the data owner).
df = pd.read_sql_query('select * from rental_info where bathroom<=bhk+1', con=engine)
df.head()

In [None]:
# Create a function that fits a model and prints its train and test scores

def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    plt.show()    

In [None]:
# Separate the target from the predictors: rent is the value to predict, and
# the suburb, floor_level and id columns are left out of the feature matrix

y = df['rent']
X = df.drop(columns=['rent', 'suburb', 'floor_level', 'id'])

In [None]:
# Encode labels in object columns
#
# Every column is encoded from the untouched values in df rather than from X,
# so re-running this cell always gives the same codes and the same
# label -> code mappings. (Encoding X in place would, on a second run, encode
# the integer codes themselves and overwrite the mappings with code -> code.)

categorical_columns = [
    'posted_on',
    'area_type',
    'city',
    'furnishing_status',
    'tenant_preferred',
    'point_of_contact',
]

name_mappings = {}
for column in categorical_columns:
    X[column] = label_encoder.fit_transform(df[column])
    # Capture the mapping straight away: the shared encoder is refitted on the next column
    name_mappings[column] = dict(zip(label_encoder.classes_,
                                     label_encoder.transform(label_encoder.classes_)))

# One named mapping per column; these are the objects saved alongside the model further down
posted_on_name_mapping = name_mappings['posted_on']
area_type_name_mapping = name_mappings['area_type']
city_name_mapping = name_mappings['city']
furnishing_status_name_mapping = name_mappings['furnishing_status']
tenant_preferred_name_mapping = name_mappings['tenant_preferred']
point_of_contact_name_mapping = name_mappings['point_of_contact']

In [None]:
# Preview the first rows of the feature matrix X
X.head()

In [None]:
# Split the data into training and testing sets (random_state fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the standard scaler on the training features only and scale them in the
# same step; the testing features are then scaled with those same statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 'data' bundles the scaled features and the targets in the order test_model unpacks them
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [None]:
# Display the shapes of the training and testing feature sets

X_train.shape, X_test.shape

In [None]:
# Display the scaled training features produced by scaler.transform

X_train_scaled

In [None]:
# Run several models through the test_model function for a quick analysis.
# The randomised ensemble models are seeded (same seed as the train/test split)
# so that the scores discussed below can be reproduced on a re-run.

candidate_models = [
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(random_state=1),
    ExtraTreesRegressor(random_state=1),
    AdaBoostRegressor(random_state=1),
    SVR(C=1.0, epsilon=0.2),
]

for candidate in candidate_models:
    test_model(candidate, data)

From the results above, RandomForestRegressor, AdaBoostRegressor and ExtraTreesRegressor all have good training scores. However, the test scores show that ExtraTreesRegressor is overfitting the training data, which results in a test score of -0.047. We will therefore continue with hyperparameter tuning for RandomForestRegressor and AdaBoostRegressor.

### RandomForestRegressor hyperparameter tuning

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 300, num = 10)]

# Number of features to consider at every split.
# 1.0 means "all features"; it replaces the old 'auto' option, which meant the
# same thing for regressors but has been removed from scikit-learn and now raises.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree
max_depth = [10,20,30,40,50]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4, 5]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Gather the candidate values into one search space, keyed by the
# RandomForestRegressor parameter each list belongs to
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(param_grid)

In [None]:
# Create the base RandomForestRegressor that the randomized search will tune.
# It is seeded (same seed as the AdaBoost model below) so that the search,
# which is itself seeded, gives the same best parameters on every run.
model = RandomForestRegressor(random_state=42)

In [None]:
# Randomized search over param_grid: 10-fold cross-validation, 4 parallel jobs,
# and a fixed random_state so the same parameter combinations are sampled each run
rf_RandomGrid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    cv=10,
    verbose=2,
    n_jobs=4,
    random_state=30,
)

In [None]:
# Run the randomized search for the RandomForestRegressor on the scaled training data
rf_RandomGrid.fit(X_train_scaled, y_train)

In [None]:
# Display the parameter combination that scored best in the randomized search
print(rf_RandomGrid.best_params_)

In [None]:
# Print the tuned search's score on both splits.
# NOTE: for a regressor the default score is R^2, not a classification accuracy.
rf_train_score = rf_RandomGrid.score(X_train_scaled, y_train)
rf_test_score = rf_RandomGrid.score(X_test_scaled, y_test)
print(f'Train Accuracy - : {rf_train_score:.3f}')
print(f'Test Accuracy - : {rf_test_score:.3f}')

### AdaBoostRegressor hyperparameter tuning

In [None]:
# Search space for AdaBoostRegressor: ensemble sizes 2-12 plus 20, and eight
# learning rates starting at 0.97 and rising in steps of 0.01
params = dict(
    n_estimators=[*range(2, 13), 20],
    learning_rate=[0.97 + step / 100 for step in range(8)],
)

In [None]:
# Create the base AdaBoostRegressor to tune; random_state=42 fixes its random seed
ab_clf = AdaBoostRegressor(random_state=42)

In [None]:
# Exhaustive grid search over params: 10-fold cross-validation on 4 parallel jobs
clf = GridSearchCV(
    estimator=ab_clf,
    param_grid=params,
    cv=10,
    verbose=2,
    n_jobs=4,
)

In [None]:
# Run the grid search for the AdaBoostRegressor on the scaled training data
clf.fit(X_train_scaled, y_train)

In [None]:
# Display the parameter combination that scored best in the grid search
print(clf.best_params_)

In [None]:
# Print the tuned grid search's score on both splits.
# NOTE: for a regressor the default score is R^2, not a classification accuracy.
ada_train_score = clf.score(X_train_scaled, y_train)
ada_test_score = clf.score(X_test_scaled, y_test)
print(f'Train Accuracy - : {ada_train_score:.3f}')
print(f'Test Accuracy - : {ada_test_score:.3f}')

## Modelling with best params
<hr>

### RandomForestRegressor
<hr>

In [None]:
# Build the final RandomForestRegressor with fixed hyperparameters
# (presumably the best parameters reported by the randomized search above).
# random_state is set so the fitted model, its score and the exported file
# are the same on every run.
rf_model = RandomForestRegressor(
    n_estimators=42,
    min_samples_split=2,
    min_samples_leaf=3,
    max_features='sqrt',
    max_depth=50,
    bootstrap=True,
    random_state=42,
)

In [None]:
# Fit the final random forest on the scaled training features
rf_model.fit(X_train_scaled, y_train)

In [None]:
# Display the random forest's score on the testing data
# NOTE: for a regressor .score() is R^2, even though the label printed below says "Accuracy"
print (f'Test Accuracy - : {rf_model.score(X_test_scaled,y_test):.3f}')

### AdaBoostRegressor
<hr>

In [None]:
# Build the final AdaBoostRegressor with fixed hyperparameters
# (presumably the best parameters reported by the grid search above); seeded for reproducibility
ada_model = AdaBoostRegressor(learning_rate=1.01, n_estimators=3, random_state=42)

In [None]:
# Fit the final AdaBoost model on the scaled training features
ada_model.fit(X_train_scaled, y_train)

In [None]:
# Display the AdaBoost model's score on the testing data
# NOTE: for a regressor .score() is R^2, even though the label printed below says "Accuracy"
print (f'Test Accuracy - : {ada_model.score(X_test_scaled,y_test):.3f}')

## Final observations
While both the RandomForestRegressor and AdaBoostRegressor models have good training scores, the RandomForestRegressor model has the higher test score. Although this is not a particularly strong result, a test score (R², labelled "accuracy" in the output above) of 0.71 is sufficient for our case study.

## Saving Model and Scaler

In [None]:
import joblib

In [None]:
# Save the model, the scaler and the encoder mappings to files.
# The mappings are saved so the .py file can apply the same label encoding.

MODEL_DIR = Path('model')

# joblib.dump does not create missing folders, so make sure the target exists first
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# File stem -> object to persist; each one is written to model/<stem>.joblib
artifacts = {
    'rental_trained': rf_model,
    'rental_scaler': scaler,
    'posted_on_name_mapping': posted_on_name_mapping,
    'area_type_name_mapping': area_type_name_mapping,
    'city_name_mapping': city_name_mapping,
    'furnishing_status_name_mapping': furnishing_status_name_mapping,
    'tenant_preferred_name_mapping': tenant_preferred_name_mapping,
    'point_of_contact_name_mapping': point_of_contact_name_mapping,
}

for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, MODEL_DIR / f'{artifact_name}.joblib')

## Check Model Export is working

In [None]:
# Load the exported random forest back from its file.
# NOTE: this rebinds the name 'model' that was used for the search's base estimator earlier on.
# NOTE(review): joblib files are pickle-based - only load files this notebook (or a trusted source) wrote.
model = joblib.load('model/rental_trained.joblib')

In [None]:
# Display the reloaded model's score on the testing data, to compare with the score printed before export
# NOTE: for a regressor .score() is R^2, even though the label printed below says "Accuracy"
print (f'Test Accuracy - : {model.score(X_test_scaled,y_test):.3f}')

## Making a prediction with the model

In [None]:
# Take the first row of the training features, and its rent, as a sample to predict.
# Both stay one-row pandas objects (DataFrame / Series) rather than scalars.
# NOTE: the row comes from the training split, so the model has already seen it.

test = X_train.iloc[:1]
test_result = y_train.iloc[:1]

In [None]:
# Display test data
# test_result is a one-row Series, so the scalar is taken with .iloc[0];
# calling int() directly on a Series is deprecated in recent pandas versions.

print(f"Test data: {test.iloc[0].to_numpy()}, Rental based on test data: {int(test_result.iloc[0])}")

In [None]:
# Scale the sample row with the scaler that was fitted on the training data, then display it

test_scaled = scaler.transform(test)
test_scaled

In [None]:
# Predict and display prediction
# predict() returns an array with one value per input row, so the single
# prediction is read with [0]; float() on a whole array is deprecated in NumPy.

prediction = model.predict(test_scaled).round(2)
print(float(prediction[0]))

In [None]:
# Compare prediction vs actual
# prediction is a one-element array and test_result a one-row Series, so the
# scalars are extracted explicitly instead of calling float() on the containers
# (that implicit conversion is deprecated in both NumPy and pandas).

predicted_rent = float(prediction[0])
actual_rent = float(test_result.iloc[0])
rent_difference = float((test_result - prediction).round(2).iloc[0])

print(f"The predicted rental, for the test property, is {predicted_rent} and the actual rental is {actual_rent}")
print(f"Which is a difference of {rent_difference}.")