In [1]:
import pandas as pd
import numpy as np
from statistics import mode

In [2]:
df = pd.read_csv('estate.csv')
df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0


In [3]:
df.dtypes

status             object
bed               float64
bath              float64
acre_lot          float64
city               object
state              object
zip_code          float64
house_size        float64
prev_sold_date     object
price             float64
dtype: object

In [4]:
df.isna().sum()

status                  0
bed                351405
bath               365813
acre_lot           461540
city                  304
state                   0
zip_code              518
house_size         660180
prev_sold_date    1061623
price                 271
dtype: int64

In [5]:
columns = df.columns

In [6]:
columns

Index(['status', 'bed', 'bath', 'acre_lot', 'city', 'state', 'zip_code',
       'house_size', 'prev_sold_date', 'price'],
      dtype='object')

In [7]:
obj_col = ''
flo_col = ''

for i in columns:
    if df[i].dtype == 'object':
        obj_col += i + ' '
    else:
        flo_col += i + ' '

obj_col_list = obj_col.strip().split()
flo_col_list = flo_col.strip().split()



In [8]:
obj_col_list

['status', 'city', 'state', 'prev_sold_date']

In [9]:
flo_col_list

['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'price']

In [10]:
for i in obj_col_list:
    if len(df[i].dropna()) > 0:  # Ensure mode exists
        df[i].fillna(mode(df[i]), inplace=True)


In [11]:

for i in flo_col_list:
    df[i].fillna(np.mean(df[i]), inplace=True)

In [12]:
df.isna().sum()

status                  0
bed                     0
bath                    0
acre_lot                0
city                    0
state                   0
zip_code                0
house_size              0
prev_sold_date    1061623
price                   0
dtype: int64

In [13]:
df.drop('prev_sold_date', axis = 1, inplace = True)

In [14]:
df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,2140.842381,65000.0


In [15]:
df.status.value_counts()

status
for_sale          2172823
ready_to_build      28843
Name: count, dtype: int64

In [16]:
obj_col_list

['status', 'city', 'state', 'prev_sold_date']

In [17]:
obj_col_list.pop(3)

'prev_sold_date'

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
encode = LabelEncoder()

In [20]:
for i in obj_col_list:
    df[i] = encode.fit_transform(df[i])

In [21]:
df.dtypes

status          int32
bed           float64
bath          float64
acre_lot      float64
city            int32
state           int32
zip_code      float64
house_size    float64
price         float64
dtype: object

In [22]:
df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,0,3.0,2.0,0.12,27,16,601.0,920.0,105000.0
1,0,4.0,2.0,0.08,27,16,601.0,1527.0,80000.0
2,0,2.0,1.0,0.15,2218,16,795.0,748.0,67000.0
3,0,4.0,2.0,0.1,3592,16,731.0,1800.0,145000.0
4,0,6.0,2.0,0.05,2720,16,680.0,2140.842381,65000.0


In [23]:
x = df.drop('price', axis =1)
y = df.price

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
x_tr, x_te, y_tr, y_te = train_test_split(x,y, test_size= 0.2, random_state= 42)

In [26]:
from sklearn.linear_model import LinearRegression

In [27]:
model = LinearRegression()

In [28]:
model.fit(x_tr, y_tr)

In [29]:
y_pred = model.predict(x_te)

In [30]:
from sklearn.metrics import r2_score

In [31]:
r2_score(y_te, y_pred)

0.10294204833195941

In [32]:
from sklearn.tree import DecisionTreeRegressor

In [33]:
model = DecisionTreeRegressor()

In [34]:
model.fit(x_tr, y_tr)

In [35]:
y_pred = model.predict(x_te)

In [36]:
r2_score(y_te, y_pred)

0.9544516329727004