In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

Load Google Drive Content

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored in Drive can be opened by ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the listings CSV from the mounted Drive folder into a DataFrame.
dataset_path = "/content/drive/MyDrive/housing_price_prediction/hpp_dataset.csv"
df = pd.read_csv(dataset_path)

In [4]:
# Preview the first 10 rows of the freshly loaded frame.
df.head(10)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state
0,7039061606,https://bham.craigslist.org/apa/d/birmingham-h...,birmingham,https://bham.craigslist.org,1195,apartment,1908,3,2.0,1,...,0,0,0,laundry on site,street parking,https://images.craigslist.org/00L0L_80pNkyDeG0...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
1,7041970863,https://bham.craigslist.org/apa/d/birmingham-w...,birmingham,https://bham.craigslist.org,1120,apartment,1319,3,2.0,1,...,0,0,0,laundry on site,off-street parking,https://images.craigslist.org/00707_uRrY9CsNMC...,Find Your Way to Haven Apartment Homes Come ho...,33.3755,-86.8045,al
2,7041966914,https://bham.craigslist.org/apa/d/birmingham-g...,birmingham,https://bham.craigslist.org,825,apartment,1133,1,1.5,1,...,0,0,0,laundry on site,street parking,https://images.craigslist.org/00h0h_b7Bdj1NLBi...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
3,7041966936,https://bham.craigslist.org/apa/d/birmingham-f...,birmingham,https://bham.craigslist.org,800,apartment,927,1,1.0,1,...,0,0,0,laundry on site,street parking,https://images.craigslist.org/00808_6ghZ8tSRQs...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
4,7041966888,https://bham.craigslist.org/apa/d/birmingham-2...,birmingham,https://bham.craigslist.org,785,apartment,1047,2,1.0,1,...,0,0,0,laundry on site,street parking,https://images.craigslist.org/00y0y_21c0FOvUXm...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
5,7041966868,https://bham.craigslist.org/apa/d/birmingham-s...,birmingham,https://bham.craigslist.org,900,apartment,1298,2,2.0,1,...,0,0,0,laundry on site,street parking,https://images.craigslist.org/00606_g79izH5xxk...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
6,7041966702,https://bham.craigslist.org/apa/d/birmingham-2...,birmingham,https://bham.craigslist.org,925,apartment,1350,2,2.0,1,...,0,0,0,laundry on site,street parking,https://images.craigslist.org/00505_cAu36QXQV4...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
7,7041964186,https://bham.craigslist.org/apa/d/calera-st-av...,birmingham,https://bham.craigslist.org,1085,house,2308,3,2.0,1,...,0,0,0,w/d hookups,,https://images.craigslist.org/00D0D_g4BE3jPMBr...,"This is a NEW and LARGE 3BR, 2BTH renovation i...",33.0969,-86.7601,al
8,7041960353,https://bham.craigslist.org/apa/d/birmingham-n...,birmingham,https://bham.craigslist.org,1269,apartment,1156,3,2.0,1,...,0,0,0,w/d in unit,,https://images.craigslist.org/00a0a_lTBiptw032...,Perfect Location and a Perfect Price!!! Come ...,33.4237,-86.8015,al
9,7041959413,https://bham.craigslist.org/apa/d/birmingham-a...,birmingham,https://bham.craigslist.org,799,apartment,703,1,1.0,1,...,0,0,0,w/d in unit,,https://images.craigslist.org/00707_9dHoTSYoQn...,HWY 31 and I-65 access for easy commute! Be Do...,33.4237,-86.8015,al


# Craigslist Rental Listings Dataset Description
This dataset contains information on rental listings from Craigslist, covering various regions. The dataset is structured with the following attributes:

id: The unique identifier for each listing.

url: The URL link to the listing.

region: The Craigslist region where the listing is located.

region_url: The URL link to the region.

price: The monthly rent for the listing (Target Column).

type: The type of housing (e.g., apartment, house, etc.).

sqfeet: The total square footage of the property.

beds: The number of bedrooms.

baths: The number of bathrooms.

cats_allowed: Boolean indicating if cats are allowed (1 = yes, 0 = no).

dogs_allowed: Boolean indicating if dogs are allowed.

smoking_allowed: Boolean indicating if smoking is allowed.

wheelchair_access: Boolean indicating if the property has wheelchair access.

electric_vehicle_charge: Boolean indicating if the property has an electric vehicle charger.

comes_furnished: Boolean indicating if the property comes furnished.

laundry_options: Available laundry options.

parking_options: Available parking options.

image_url: The URL link to images of the property.

description: Description provided by the poster.

lat: Latitude coordinates of the listing.

long: Longitude coordinates of the listing.

state: The state where the listing is located.

### Clean dataset




In [5]:
# Remove columns that have little or no predictive power, or that would
# increase the complexity of the feature set unnecessarily (identifiers,
# URLs, free text and coordinates).
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
columns_to_drop = ['id', 'url', 'region', 'region_url', 'image_url', 'description', 'lat', 'long']
df = df.drop(columns=columns_to_drop, errors='ignore')
print('After dropping unwanted columns')
print(df.head(1))

After dropping unwanted columns
   price       type  sqfeet  beds  baths  cats_allowed  dogs_allowed  \
0   1195  apartment    1908     3    2.0             1             1   

   smoking_allowed  wheelchair_access  electric_vehicle_charge  \
0                1                  0                        0   

   comes_furnished  laundry_options parking_options state  
0                0  laundry on site  street parking    al  


In [6]:
# View the dtype of every remaining column; object-typed columns hold text
# rather than numbers.
df.dtypes

price                        int64
type                        object
sqfeet                       int64
beds                         int64
baths                      float64
cats_allowed                 int64
dogs_allowed                 int64
smoking_allowed              int64
wheelchair_access            int64
electric_vehicle_charge      int64
comes_furnished              int64
laundry_options             object
parking_options             object
state                       object
dtype: object

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished
count,265190.0,265190.0,265190.0,265190.0,265190.0,265190.0,265190.0,265190.0,265190.0,265190.0
mean,12272.85,1093.678,1.912414,1.483468,0.716822,0.69655,0.733896,0.078759,0.01436,0.048644
std,5376352.0,23068.88,3.6919,0.630208,0.450543,0.459749,0.44192,0.269362,0.118968,0.215124
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,817.0,752.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1060.0,950.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
75%,1450.0,1156.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0
max,2768307000.0,8388607.0,1100.0,75.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Count the missing (NaN) entries in each column.
missing_values_per_column = df.isna().sum()
print(missing_values_per_column)

price                          0
type                           0
sqfeet                         0
beds                           0
baths                          0
cats_allowed                   0
dogs_allowed                   0
smoking_allowed                0
wheelchair_access              0
electric_vehicle_charge        0
comes_furnished                0
laundry_options            54311
parking_options            95135
state                          1
dtype: int64


The `laundry_options` and `parking_options` columns have a considerable number of missing values (54311 and 95135 respectively). Laundry options are expected to have little predictive power, so we drop the `laundry_options` column entirely. Parking options are expected to have a stronger effect on housing price, so instead of dropping that column we drop the rows in which it is missing. The single row with a missing `state` is removed in the same step.

In [9]:
# Drop the laundry_options column.
# errors='ignore' makes the cell safe to re-run: once the column is gone a
# plain drop would raise KeyError.
df = df.drop(columns=['laundry_options'], errors='ignore')

In [10]:
# Drop every row that still has a missing value in any column and report
# how many rows were removed.
row_count_before_dropna = len(df)
df = df.dropna()
row_count_after_dropna = len(df)
print(row_count_before_dropna - row_count_after_dropna, ' rows dropped!')

95136  rows dropped!


In [11]:
# Min-max scale the size-related numeric features.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed later in the notebook, so the held-out rows influence the
# scaling applied to the training features. Consider fitting on the training
# split only.
columns_to_scale = ['sqfeet', 'beds', 'baths']
scaler = MinMaxScaler()

scaled_columns = scaler.fit(df[columns_to_scale]).transform(df[columns_to_scale])
scaled_df = pd.DataFrame(
    data=scaled_columns,
    index=df.index,
    columns=columns_to_scale,
)
# Replace the original columns with their scaled versions (appended at the end).
df = pd.concat(
    [df.drop(columns=columns_to_scale), scaled_df],
    axis=1,
)

In [12]:
# Re-check the summary statistics now that sqfeet, beds and baths are scaled.
print('After min max scaling')
df.describe()

After min max scaling


Unnamed: 0,price,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,sqfeet,beds,baths
count,170054.0,170054.0,170054.0,170054.0,170054.0,170054.0,170054.0,170054.0,170054.0,170054.0
mean,18330.22,0.763175,0.73803,0.652263,0.100186,0.018859,0.060304,0.000137,0.001758,0.019682
std,6713660.0,0.425135,0.439708,0.476253,0.300248,0.136026,0.238051,0.003434,0.004149,0.008619
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,820.0,1.0,0.0,0.0,0.0,0.0,0.0,8.9e-05,0.000909,0.013333
50%,1079.0,1.0,1.0,1.0,0.0,0.0,0.0,0.000113,0.001818,0.013333
75%,1500.0,1.0,1.0,1.0,0.0,0.0,0.0,0.000138,0.001818,0.026667
max,2768307000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# One-hot encode the categorical columns; each category becomes its own
# indicator column named <column>_<value>.
categorical_columns = ['type', 'parking_options', 'state']
df = pd.get_dummies(data=df, columns=categorical_columns)

In [14]:
# Preview the first 3 rows to confirm the indicator columns were created.
print('After one hot encoding ...')
df.head(3)

After one hot encoding ...


Unnamed: 0,price,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,sqfeet,beds,baths,...,state_nd,state_ne,state_nh,state_nj,state_nm,state_nv,state_ny,state_oh,state_ok,state_or
0,1195,1,1,1,0,0,0,0.000227,0.002727,0.026667,...,False,False,False,False,False,False,False,False,False,False
1,1120,1,1,1,0,0,0,0.000157,0.002727,0.026667,...,False,False,False,False,False,False,False,False,False,False
2,825,1,1,1,0,0,0,0.000135,0.000909,0.02,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Report (rows, columns) of the cleaned, encoded frame.
print('Dimensions of final dataset after cleaning', df.shape)

Dimensions of final dataset after cleaning (170054, 67)


In [16]:
# Separate the target (price) from the feature matrix (all other columns).
Y = df['price']
X = df.drop(columns='price')

In [17]:
# Feature matrix dimensions: (rows, feature columns).
X.shape

(170054, 66)

In [18]:
# Target vector length; should match the number of rows in X.
Y.shape

(170054,)

In [19]:
# Preview the first 10 feature rows.
X.head(10)

Unnamed: 0,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,sqfeet,beds,baths,type_apartment,...,state_nd,state_ne,state_nh,state_nj,state_nm,state_nv,state_ny,state_oh,state_ok,state_or
0,1,1,1,0,0,0,0.000227,0.002727,0.026667,True,...,False,False,False,False,False,False,False,False,False,False
1,1,1,1,0,0,0,0.000157,0.002727,0.026667,True,...,False,False,False,False,False,False,False,False,False,False
2,1,1,1,0,0,0,0.000135,0.000909,0.02,True,...,False,False,False,False,False,False,False,False,False,False
3,1,1,1,0,0,0,0.000111,0.000909,0.013333,True,...,False,False,False,False,False,False,False,False,False,False
4,1,1,1,0,0,0,0.000125,0.001818,0.013333,True,...,False,False,False,False,False,False,False,False,False,False
5,1,1,1,0,0,0,0.000155,0.001818,0.026667,True,...,False,False,False,False,False,False,False,False,False,False
6,1,1,1,0,0,0,0.000161,0.001818,0.026667,True,...,False,False,False,False,False,False,False,False,False,False
11,1,1,1,0,0,0,0.000164,0.002727,0.026667,True,...,False,False,False,False,False,False,False,False,False,False
12,1,1,1,0,0,0,0.000157,0.002727,0.026667,True,...,False,False,False,False,False,False,False,False,False,False
13,1,1,0,0,0,0,8e-05,0.001818,0.013333,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
# Preview the first 10 target values (price).
Y.head(10)

0     1195
1     1120
2      825
3      800
4      785
5      900
6      925
11    1355
12    1120
13     685
Name: price, dtype: int64

In [21]:
# Split the dataset into training and testing sets: 20% of the rows are held
# out for testing, with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [40]:
from sklearn.metrics import mean_squared_error

# Linear regression: fit on the training split, predict the held-out split.
linear = LinearRegression().fit(X_train, Y_train)
pred = linear.predict(X_test)

# R squared and mean squared error on the test split, rounded to 2 decimals.
linear_rsq = round(r2_score(y_true=Y_test, y_pred=pred), 2)
linear_mse = round(mean_squared_error(y_true=Y_test, y_pred=pred), 2)

In [41]:
# Lasso (L1-regularised linear regression) with the default alpha.
# With the default iteration cap the coordinate-descent solver stopped before
# converging on this data (the cell emitted a convergence warning), so the cap
# is raised. If the warning persists, check the scale of the features/target.
lasso = Lasso(max_iter=10000)
lasso.fit(X_train, Y_train)
lasso_pred = lasso.predict(X_test)

# R squared and mean squared error on the test split, rounded to 2 decimals.
lasso_rsq = round(r2_score(Y_test, lasso_pred), 2)
lasso_mse = round(mean_squared_error(Y_test, lasso_pred), 2)

  model = cd_fast.enet_coordinate_descent(


In [42]:
# Decision tree regressor with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split and breaks ties between equally good splits
# randomly; without a seed the reported scores can change between runs.
dec = DecisionTreeRegressor(random_state=42)
dec.fit(X_train, Y_train)
dec_pred = dec.predict(X_test)

# R squared and mean squared error on the test split, rounded to 2 decimals.
dec_rsq = round(r2_score(Y_test, dec_pred), 2)
dec_mse = round(mean_squared_error(Y_test, dec_pred), 2)

In [43]:
# ElasticNet (combined L1/L2 penalty) with default hyper-parameters.
el = ElasticNet().fit(X_train, Y_train)
el_pred = el.predict(X_test)

# R squared and mean squared error on the test split, rounded to 2 decimals.
el_rsq = round(r2_score(y_true=Y_test, y_pred=el_pred), 2)
el_mse = round(mean_squared_error(y_true=Y_test, y_pred=el_pred), 2)

In [44]:
# k-nearest-neighbours regressor with default hyper-parameters.
kn = KNeighborsRegressor().fit(X_train, Y_train)
kn_pred = kn.predict(X_test)

# R squared and mean squared error on the test split, rounded to 2 decimals.
kn_rsq = round(r2_score(y_true=Y_test, y_pred=kn_pred), 2)
kn_mse = round(mean_squared_error(y_true=Y_test, y_pred=kn_pred), 2)


In [39]:
# Gather every model's test-set metrics into one comparison table.
# NOTE(review): this cell's execution count (39) is lower than the model
# cells above it (40-44), so the displayed table may predate the latest
# model fits; re-run the notebook top to bottom to refresh it.
list_reg = ['Linear', 'Lasso', 'ElasticNet', 'Decision Tree', 'KNeighbors']
data = {
    'Regressor': list_reg,
    'R Squared': [linear_rsq, lasso_rsq, el_rsq, dec_rsq, kn_rsq],
    'MSE': [linear_mse, lasso_mse, el_mse, dec_mse, kn_mse],
}
reg_df = pd.DataFrame(data)
reg_df

Unnamed: 0,Regressor,R Squared,MSE
0,Linear,0.85,12.5
1,Lasso,0.85,12.5
2,ElasticNet,0.75,20.4
3,Decision Tree,0.95,8.3
4,KNeighbors,0.6,32.1
