In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import pickle 
import sqlite3

In [None]:
def create_connection(db_file, delete_db=False):
    """Open a SQLite connection to ``db_file`` with foreign keys enforced.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database file, passed to ``sqlite3.connect``.
    delete_db : bool, default False
        When True and ``db_file`` already exists, the file is removed first
        so the connection starts from an empty database.

    Returns
    -------
    sqlite3.Connection or None
        The connection, or ``None`` when ``sqlite3.connect`` itself failed
        (the error is printed rather than raised).
    """
    import os

    if delete_db and os.path.exists(db_file):
        os.remove(db_file)

    conn = None
    try:
        conn = sqlite3.connect(db_file)
        conn.execute("PRAGMA foreign_keys = 1")
    except sqlite3.Error as e:
        # Must be `sqlite3.Error`: a bare `Error` is not defined in this
        # notebook, so any SQLite failure would surface as a NameError
        # instead of being reported here.
        print(e)

    return conn

# Load the whole HousingList table into memory as a DataFrame.
conn = create_connection('database/USRent.db')
sql_statement = "select * from HousingList;"
try:
    data = pd.read_sql_query(sql_statement, conn)
finally:
    # The query result is fully materialised in `data`, so release the
    # database handle instead of leaving it open for the kernel's lifetime.
    if conn is not None:
        conn.close()

In [None]:
# Work on an independent copy (DataFrame.copy is deep by default) so the
# frame loaded from the database stays untouched.
df = data.copy()

In [None]:
# Preview the first five rows of the working frame.
display(df.head(n=5))

Remove the columns that are not needed by the regression models.

In [None]:
# Build the modelling frame from the untouched `data` frame instead of
# dropping in place: an in-place drop makes this cell fail with a KeyError
# when it is run a second time, because the columns are already gone.
df = data.drop(columns=["listing_id", "location_id"])

Remove outlier values, as they greatly affect the slope of the regression line and result in a poor model.

In [None]:
# Keep only rows with 100 < price < 5000 and sqfeet >= 120; everything
# outside those bounds is treated as an outlier and discarded.
is_plausible = (
    (df["price"] > 100)
    & (df["sqfeet"] >= 120)
    & (df["price"] < 5000)
)
df = df[is_plausible]

Split the data into training and test sets.

In [None]:
# The target is the price column; every other remaining column is a feature.
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Run various regression models.

In [None]:
# Ordinary least-squares baseline: fit on the training split, score on the
# held-out test split.
linear = LinearRegression().fit(X_train, y_train)
y_pred = linear.predict(X_test)

lr_r2 = metrics.r2_score(y_test, y_pred)
lr_MSE = metrics.mean_squared_error(y_test, y_pred)
lr_RMSE = np.sqrt(lr_MSE)  # RMSE is the square root of the MSE computed above
print(lr_r2, lr_MSE, lr_RMSE)

In [None]:
from sklearn import tree

# Decision-tree regressor. A fixed seed makes the fitted tree (and therefore
# the model pickled later) the same on every run, matching the seeded
# train/test split and gradient-boosting model elsewhere in the notebook.
clf = tree.DecisionTreeRegressor(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Metrics live under a `dt_` prefix so they neither overwrite nor get
# mistaken for the linear-regression (`lr_`) results.
dt_r2 = metrics.r2_score(y_test, y_pred)
dt_MSE = metrics.mean_squared_error(y_test, y_pred)
dt_RMSE = np.sqrt(dt_MSE)
print(dt_r2, dt_MSE, dt_RMSE)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting over depth-1 trees, seeded for reproducibility.
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)

# Metrics live under a `gbr_` prefix so they neither overwrite nor get
# mistaken for the linear-regression (`lr_`) results.
gbr_r2 = metrics.r2_score(y_test, y_pred)
gbr_MSE = metrics.mean_squared_error(y_test, y_pred)
gbr_RMSE = np.sqrt(gbr_MSE)
print(gbr_r2, gbr_MSE, gbr_RMSE)

Save the best model so that it can be used for inference. 

In [None]:
# Persist the fitted model to disk for later inference.
# NOTE(review): this always pickles `clf` (the decision-tree regressor);
# confirm that it is the model intended as "best".
filename = 'prediction_model.sav'
# The `with` block guarantees the file is flushed and closed even if
# pickling fails; a bare open() would leave the handle dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

Draw a feature importance chart to find the factors that influence the rental price.

In [None]:
# Horizontal bar chart of the gradient-boosting model's feature importances,
# sorted ascending so the most important feature ends up at the top.
feature_importance = gbr.feature_importances_
order = np.argsort(feature_importance)
bar_positions = np.arange(len(order)) + 0.5

fig, ax = plt.subplots(figsize=(10, 9))
ax.barh(bar_positions, feature_importance[order], align="center")
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(X_train.columns)[order])
ax.set_title("Feature Importance (MDI)")