In [1]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sqlalchemy import create_engine
from sqlalchemy import inspect
import numpy as np

label_encoder = preprocessing.LabelEncoder()

In [2]:
#Connecting to RDS database using config for all personal data

rds_connection_string = f'{config.connection_string}'
engine = create_engine(rds_connection_string)
insp = inspect(engine)

In [3]:
#Checking if Rental data has been loaded into the database
df = pd.read_sql_query('select * from rental_info where bathroom<=bhk+1', con=engine)
df.head()

Unnamed: 0,id,posted_on,bhk,rent,size,floor_level,area_type,suburb,city,furnishing_status,tenant_preferred,bathroom,point_of_contact
0,1,May,2,10000,1100,Ground,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2,May,2,20000,800,1,Super Area,Kankurgachi,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,3,May,2,17000,1000,1,Super Area,Sector II,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,4,July,2,10000,800,1,Super Area,Dum Dum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,5,April,2,7000,600,Ground,Super Area,Thakurpukur,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner


In [4]:
# create funcion to process different models and data

def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    plt.show()    

In [5]:
# Define the X (features) and y (target) sets

X = df.drop(['rent','suburb','floor_level', 'id'], axis=1)
y = df.rent

In [6]:
# Encode labels in object columns 

X['posted_on']= label_encoder.fit_transform(X['posted_on']) 
X['area_type']= label_encoder.fit_transform(X['area_type']) 
X['city']= label_encoder.fit_transform(X['city']) 
X['furnishing_status']= label_encoder.fit_transform(X['furnishing_status']) 
X['tenant_preferred']= label_encoder.fit_transform(X['tenant_preferred'])
X['point_of_contact']= label_encoder.fit_transform(X['point_of_contact'])

In [7]:
# Display X
X.head()

Unnamed: 0,posted_on,bhk,size,area_type,city,furnishing_status,tenant_preferred,bathroom,point_of_contact
0,3,2,1100,2,4,2,1,2,2
1,3,2,800,2,4,1,1,1,2
2,3,2,1000,2,4,1,1,1,2
3,1,2,800,2,4,2,1,1,2
4,0,2,600,2,4,2,1,2,2


In [8]:
# Split data into test and training
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Using standard scaler we scale and fit to the X_train
scaler = StandardScaler().fit(X_train)

# using scaler create 
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# created varaible 'data' to hold x and y training and testing data
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [9]:
#Display shape

X_train.shape, X_test.shape

((2710, 9), (904, 9))

In [10]:
# Display X_train_scaled array

X_train_scaled

array([[-0.06244293, -1.32281518, -0.6923144 , ..., -1.76962839,
         0.04698461, -1.41313797],
       [-1.21360165,  1.09691424,  1.17631   , ...,  0.16646081,
         1.2259504 , -1.41313797],
       [-0.06244293, -0.11295047, -0.12360262, ...,  0.16646081,
        -1.13198117,  0.70793868],
       ...,
       [-0.06244293, -1.32281518, -0.6923144 , ...,  0.16646081,
        -1.13198117,  0.70793868],
       [-0.06244293, -1.32281518, -1.09853709, ..., -1.76962839,
        -1.13198117, -1.41313797],
       [-0.06244293, -1.32281518, -0.77355894, ...,  0.16646081,
        -1.13198117,  0.70793868]])

In [11]:
# Run several models through the test_model function for a quick analysis

test_model(LinearRegression(), data)
test_model(KNeighborsRegressor(), data)
test_model(RandomForestRegressor(), data)
test_model(ExtraTreesRegressor(), data)
test_model(AdaBoostRegressor(), data)
test_model(SVR(C=1.0, epsilon=0.2), data)

Model: LinearRegression
Train score: 0.2129697748381839
Test Score: 0.45730170270617143

Model: KNeighborsRegressor
Train score: 0.40966046678950496
Test Score: 0.6326564969276245

Model: RandomForestRegressor
Train score: 0.8587816519156364
Test Score: 0.5005026676116218

Model: ExtraTreesRegressor
Train score: 0.9982251110226689
Test Score: 0.07017299124631537

Model: AdaBoostRegressor
Train score: 0.8201256524853813
Test Score: 0.5955805003683705

Model: SVR
Train score: -0.043539967464448104
Test Score: -0.09437898207837647



From the results above RandomForestRegressor, AdaBoostRegressor and ExtraTresregessor have good training scores. Upon reviewing the test scores for each it appears that ExtraTreesRegressor is overfitting to the training data which is resulting in a testing score of -0.047. We will continue forward with RandomForestRegressor and AdaBoostRegressor hyperparameter tuning.

### RandomForestRegressor hyperparameter tuning

In [12]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 300, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [10,20,30,40,50]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4, 5]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [13]:
# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [10, 42, 74, 106, 138, 171, 203, 235, 267, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 3, 4, 5], 'bootstrap': [True, False]}


In [14]:
# Import the model
model = RandomForestRegressor()

In [15]:
rf_RandomGrid = RandomizedSearchCV(estimator = model, param_distributions = param_grid, cv = 10, verbose=2, n_jobs = 4, random_state=30)

In [16]:
# Fit the model by using the Randomized search classifier.
rf_RandomGrid.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=4,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [10, 42, 74, 106, 138,
                                                         171, 203, 235, 267,
                                                         300]},
                   random_state=30, verbose=2)

In [17]:
# Display the params with the best results
print(rf_RandomGrid.best_params_)

{'n_estimators': 106, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': True}


In [18]:
# Print accuracy scores
print (f'Train Accuracy - : {rf_RandomGrid.score(X_train_scaled,y_train):.3f}')
print (f'Test Accuracy - : {rf_RandomGrid.score(X_test_scaled,y_test):.3f}')

Train Accuracy - : 0.484
Test Accuracy - : 0.737


### AdaBoostRegressor hyperparameter tuning

In [19]:
# Create the params
params = {
    'n_estimators': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 20],
    'learning_rate': [(0.97 + x / 100) for x in range(0, 8)]
}

In [20]:
# Import the model
ab_clf = AdaBoostRegressor(random_state=42)

In [21]:
clf = GridSearchCV(ab_clf, params, cv = 10, verbose=2, n_jobs = 4)

In [22]:
# Fit the model by using the Randomized search classifier.
clf.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


GridSearchCV(cv=10, estimator=AdaBoostRegressor(random_state=42), n_jobs=4,
             param_grid={'learning_rate': [0.97, 0.98, 0.99, 1.0, 1.01, 1.02,
                                           1.03, 1.04],
                         'n_estimators': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                          20]},
             verbose=2)

In [23]:
# Display the params with the best results
print(clf.best_params_)

{'learning_rate': 1.01, 'n_estimators': 3}


In [24]:
# Print accuracy scores
print (f'Train Accuracy - : {clf.score(X_train_scaled,y_train):.3f}')
print (f'Test Accuracy - : {clf.score(X_test_scaled,y_test):.3f}')

Train Accuracy - : 0.551
Test Accuracy - : 0.514


## Modelling with best params
<hr>

### RandomForestRegressor
<hr>

In [25]:
# Import the model
model = RandomForestRegressor(n_estimators= 42, min_samples_split= 2, min_samples_leaf= 3, max_features= 'sqrt', max_depth= 50, bootstrap= True)

In [26]:
# Fit the model to the training data
model.fit(X_train_scaled, y_train)

RandomForestRegressor(max_depth=50, max_features='sqrt', min_samples_leaf=3,
                      n_estimators=42)

In [27]:
# Display the model score using the testing data
print (f'Test Accuracy - : {model.score(X_test_scaled,y_test):.3f}')

Test Accuracy - : 0.756


### AdaBoostRegressor
<hr>

In [28]:
model = AdaBoostRegressor(learning_rate=1.01, n_estimators=3, random_state=42)

In [29]:
# Fit the model to the training data
model.fit(X_train_scaled, y_train)

AdaBoostRegressor(learning_rate=1.01, n_estimators=3, random_state=42)

In [30]:
# Display the model score using the testing data
print (f'Test Accuracy - : {model.score(X_test_scaled,y_test):.3f}')

Test Accuracy - : 0.514
