In [66]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
import pickle 
import sqlite3

In [67]:
def create_connection(db_file, delete_db=False):
    import os
    if delete_db and os.path.exists(db_file):
        os.remove(db_file)

    conn = None
    try:
        conn = sqlite3.connect(db_file)
        conn.execute("PRAGMA foreign_keys = 1")
    except Error as e:
        print(e)

    return conn

conn = create_connection('database/USRent.db')
sql_statement = "select * from HousingList;"
data = pd.read_sql_query(sql_statement, conn)

In [63]:
df = data.copy(deep=True)

In [64]:
df.drop(["url","region_url","image_url","description","id","region"], axis = 1, inplace=True)
display(df)

KeyError: "['description'] not found in axis"

In [32]:
df.isnull().sum()
df["laundry_options"]=df["laundry_options"].fillna(df["laundry_options"].mode()[0])
df["parking_options"]=df["parking_options"].fillna(df["parking_options"].mode()[0])
df.fillna(0, inplace=True)
df.isnull().sum()

price                      0
type                       0
sqfeet                     0
beds                       0
baths                      0
cats_allowed               0
dogs_allowed               0
smoking_allowed            0
wheelchair_access          0
electric_vehicle_charge    0
comes_furnished            0
laundry_options            0
parking_options            0
lat                        0
long                       0
state                      0
dtype: int64

In [33]:
df=df[df["price"]>100]
df=df[df["sqfeet"]>=120]
df=df[df["price"]<5000]

In [34]:
label_encoder = preprocessing.LabelEncoder()
categorical_cols = ["type","laundry_options","parking_options","state"]
for col_name in categorical_cols:
    df[col_name] = label_encoder.fit_transform(df[col_name])

print(df.head())

   price  type  sqfeet  beds  baths  cats_allowed  dogs_allowed  \
0   1148     0    1078     3    2.0             1             1   
1   1200     2    1001     2    2.0             0             0   
2   1813     0    1683     2    2.0             1             1   
3   1095     0     708     1    1.0             1             1   
4    289     0     250     0    1.0             1             1   

   smoking_allowed  wheelchair_access  electric_vehicle_charge  \
0                0                  0                        0   
1                0                  0                        0   
2                1                  0                        0   
3                1                  0                        0   
4                1                  1                        0   

   comes_furnished  laundry_options  parking_options      lat     long  state  
0                0                4                1  39.5483 -119.796      4  
1                0                3     

In [35]:
X = df.drop('price', axis=1)
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [53]:
linear = LinearRegression()
linear.fit(X_train, y_train)
y_pred=linear.predict(X_test)
lr_r2=metrics.r2_score(y_test, y_pred)
lr_MSE=metrics.mean_squared_error(y_test, y_pred)
lr_RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(lr_r2, lr_MSE, lr_RMSE)

0.21168422744907855 236907.4988691645 486.73144429876777


In [54]:
pred_graph_lr=pd.DataFrame({"True Values":y_test, "Predicted Values":y_pred})
pred_graph_lr

Unnamed: 0,True Values,Predicted Values
263535,1100,930.862866
265585,899,1055.103572
198423,1331,1476.831192
144076,1875,1112.798307
366990,710,1343.834526
...,...,...
199412,1400,1205.370019
1354,1845,1676.309956
354870,1000,1200.834629
237088,525,1046.165738


In [55]:
from sklearn.linear_model import Ridge

Ridge = Ridge(alpha=0.5)
Ridge.fit(X_train, y_train)
y_pred=Ridge.predict(X_test)
lr_r2=metrics.r2_score(y_test, y_pred)
lr_MSE=metrics.mean_squared_error(y_test, y_pred)
lr_RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(lr_r2, lr_MSE, lr_RMSE)

0.21168419348016 236907.50907762643 486.73145478551766


In [56]:
pred_graph_lr=pd.DataFrame({"True Values":y_test, "Predicted Values":y_pred})
pred_graph_lr

Unnamed: 0,True Values,Predicted Values
263535,1100,930.863826
265585,899,1055.103040
198423,1331,1476.832241
144076,1875,1112.798999
366990,710,1343.834601
...,...,...
199412,1400,1205.364437
1354,1845,1676.310783
354870,1000,1200.834257
237088,525,1046.166229


In [57]:
from sklearn import tree

clf = tree.DecisionTreeRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
lr_r2 = metrics.r2_score(y_test, y_pred)
lr_MSE = metrics.mean_squared_error(y_test, y_pred)
lr_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(lr_r2, lr_MSE, lr_RMSE)

0.7923590328536186 62401.01226218458 249.80194607365368


In [58]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
lr_r2 = metrics.r2_score(y_test, y_pred)
lr_MSE = metrics.mean_squared_error(y_test, y_pred)
lr_RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(lr_r2, lr_MSE, lr_RMSE)

0.4282692783151518 171818.5783125004 414.5100460935783


In [59]:
pred_graph_lr=pd.DataFrame({"True Values":y_test, "Predicted Values":y_pred})
pred_graph_lr

Unnamed: 0,True Values,Predicted Values
263535,1100,971.869413
265585,899,1070.503425
198423,1331,1506.540847
144076,1875,1532.056863
366990,710,1026.783324
...,...,...
199412,1400,1395.398495
1354,1845,1802.649634
354870,1000,1090.394708
237088,525,963.171210


[ 970.21762075 1076.86829469 1504.77174562 1539.25103364 1024.16705513
 1096.29332405 1051.79027112 1194.66705413 1094.88102879 1409.24937519]
263535    1100
265585     899
198423    1331
144076    1875
366990     710
284085     759
40507      910
51262     1175
62171     1208
218520    1660
Name: price, dtype: int64
