In [151]:
import pandas as pd
import numpy as np

In [152]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [153]:
data = pd.read_csv("test.csv")

In [154]:
data

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,3.0,2850.0,sqft,4200.00,sqft,98119,1175000.0
1,4,5.0,3040.0,sqft,5002.00,sqft,98106,1057500.0
2,3,1.0,1290.0,sqft,6048.00,sqft,98125,799000.0
3,3,2.0,2360.0,sqft,0.28,acre,98188,565000.0
4,3,3.5,1942.0,sqft,1603.00,sqft,98107,1187000.0
...,...,...,...,...,...,...,...,...
500,5,4.5,5580.0,sqft,0.30,acre,98146,3800000.0
501,3,2.5,1390.0,sqft,1570.00,sqft,98126,575000.0
502,3,2.5,2950.0,sqft,0.47,acre,98118,3105000.0
503,5,5.0,3010.0,sqft,4887.00,sqft,98115,1807000.0


In [155]:
data.shape

(505, 8)

In [156]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            505 non-null    int64  
 1   baths           505 non-null    float64
 2   size            505 non-null    float64
 3   size_units      505 non-null    object 
 4   lot_size        428 non-null    float64
 5   lot_size_units  428 non-null    object 
 6   zip_code        505 non-null    int64  
 7   price           505 non-null    float64
dtypes: float64(4), int64(2), object(2)
memory usage: 31.7+ KB


In [157]:
for columns in data.columns:
    print(data[columns].value_counts())
    print("*" *20)

beds
3    173
2    126
4    108
1     55
5     33
7      6
6      3
9      1
Name: count, dtype: int64
********************
baths
2.0    140
1.0    110
2.5     96
3.0     52
1.5     40
3.5     36
4.0     11
5.0      8
4.5      5
6.0      3
5.5      2
7.0      1
6.5      1
Name: count, dtype: int64
********************
size
2480.0    4
1240.0    4
1540.0    4
1200.0    4
1800.0    4
         ..
921.0     1
1892.0    1
2230.0    1
2867.0    1
1301.0    1
Name: count, Length: 375, dtype: int64
********************
size_units
sqft    505
Name: count, dtype: int64
********************
lot_size
5000.00    14
4000.00    14
6000.00     8
1.00        6
0.25        5
           ..
6656.00     1
8057.00     1
6755.00     1
9130.00     1
4887.00     1
Name: count, Length: 314, dtype: int64
********************
lot_size_units
sqft    369
acre     59
Name: count, dtype: int64
********************
zip_code
98103    48
98115    39
98117    38
98125    29
98122    29
98199    25
98126    22
98144    21

In [158]:
data.isna().sum()


beds               0
baths              0
size               0
size_units         0
lot_size          77
lot_size_units    77
zip_code           0
price              0
dtype: int64

In [159]:
data.drop(columns=["lot_size", "lot_size_units"], inplace = True)

In [160]:
data.describe()

Unnamed: 0,beds,baths,size,zip_code,price
count,505.0,505.0,505.0,505.0,505.0
mean,2.954455,2.219802,1851.843564,98125.366337,979582.2
std,1.214947,1.013404,922.55609,24.875054,608475.9
min,1.0,1.0,376.0,98101.0,170000.0
25%,2.0,1.5,1171.0,98108.0,619990.0
50%,3.0,2.0,1690.0,98118.0,840000.0
75%,4.0,2.5,2400.0,98126.0,1155000.0
max,9.0,7.0,6139.0,98199.0,6250000.0


In [161]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   beds        505 non-null    int64  
 1   baths       505 non-null    float64
 2   size        505 non-null    float64
 3   size_units  505 non-null    object 
 4   zip_code    505 non-null    int64  
 5   price       505 non-null    float64
dtypes: float64(3), int64(2), object(1)
memory usage: 23.8+ KB


In [162]:
data.isna().sum()

beds          0
baths         0
size          0
size_units    0
zip_code      0
price         0
dtype: int64

In [163]:
data.head()

Unnamed: 0,beds,baths,size,size_units,zip_code,price
0,3,3.0,2850.0,sqft,98119,1175000.0
1,4,5.0,3040.0,sqft,98106,1057500.0
2,3,1.0,1290.0,sqft,98125,799000.0
3,3,2.0,2360.0,sqft,98188,565000.0
4,3,3.5,1942.0,sqft,98107,1187000.0


In [164]:
data.shape

(505, 6)

In [165]:
data['price_per_sqft'] = data['price'] * 100000 / data['size']

In [166]:
data['price_per_sqft']

0      4.122807e+07
1      3.478618e+07
2      6.193798e+07
3      2.394068e+07
4      6.112255e+07
           ...     
500    6.810036e+07
501    4.136691e+07
502    1.052542e+08
503    6.003322e+07
504    6.879324e+07
Name: price_per_sqft, Length: 505, dtype: float64

In [167]:
data.describe()

Unnamed: 0,beds,baths,size,zip_code,price,price_per_sqft
count,505.0,505.0,505.0,505.0,505.0,505.0
mean,2.954455,2.219802,1851.843564,98125.366337,979582.2,55566820.0
std,1.214947,1.013404,922.55609,24.875054,608475.9,23630020.0
min,1.0,1.0,376.0,98101.0,170000.0,16555380.0
25%,2.0,1.5,1171.0,98108.0,619990.0,42124540.0
50%,3.0,2.0,1690.0,98118.0,840000.0,53007520.0
75%,4.0,2.5,2400.0,98126.0,1155000.0,65708810.0
max,9.0,7.0,6139.0,98199.0,6250000.0,265957400.0


In [168]:
data.shape

(505, 7)

In [169]:
data.drop(columns=['price_per_sqft'],inplace = True)

In [170]:
data.drop(columns=['size_units'],inplace = True)

In [171]:
data.head()

Unnamed: 0,beds,baths,size,zip_code,price
0,3,3.0,2850.0,98119,1175000.0
1,4,5.0,3040.0,98106,1057500.0
2,3,1.0,1290.0,98125,799000.0
3,3,2.0,2360.0,98188,565000.0
4,3,3.5,1942.0,98107,1187000.0


In [172]:
x= data.drop(columns=["price"])
y= data['price']

In [193]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score
from sklearn.compose import make_column_transformer


In [194]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2, random_state=40)

In [195]:
print(x_train.shape)
print(y_train.shape)

(404, 4)
(404,)


In [196]:
column_trans = make_column_transformer((OneHotEncoder(sparse=False ), ['beds']), remainder= "passthrough")

In [197]:
scaler = StandardScaler()

In [198]:
lr = LinearRegression()

In [199]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [200]:
lr= LinearRegression()
lr.fit(x_scaled, y)

In [201]:
pipe = make_pipeline(column_trans, scaler, lr)

In [202]:
pipe.fit(x_train, y_train) 
import warnings
warnings.filterwarnings('ignore')

In [203]:
pipe.fit(x_train, y_train)
import warnings
warnings.filterwarnings('ignore')

In [204]:
y_pred_lr = pipe.predict(x_test)

In [205]:
r2_score(y_test,y_pred_lr)

0.33359379538863143

In [206]:
lasso = Lasso()

In [207]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [208]:
pipe.fit(x_train, y_train)

In [211]:
y_pred_lasso= pipe.predict(x_test)
r2_score(y_test,y_pred_lasso)

0.33222041744316466

In [212]:
ridge = Ridge()

In [214]:
pipe = make_pipeline(column_trans, scaler,ridge)

In [215]:
pipe.fit(x_train, y_train)

In [216]:
y_pred_ridge = pipe.predict(x_test)
r2_score(y_test, y_pred_ridge)

0.33176576649518186

In [219]:
print("no regularization : " , r2_score(y_test, y_pred_lr))
print("Lasso : " , r2_score(y_test, y_pred_lasso))
print("Ridge : " , r2_score(y_test, y_pred_ridge))
      

no regularization :  0.33359379538863143
Lasso :  0.33222041744316466
Ridge :  0.33176576649518186


In [220]:
import pickle 