In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder , MinMaxScaler , StandardScaler , PolynomialFeatures 
from sklearn.linear_model import LinearRegression , Lasso , Ridge
from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score 
import seaborn as sns
import numpy as np



In [2]:
# Load the training split. "price" is the regression target; the
# "neighbour_hood" column is left out of the feature matrix.
train = pd.read_csv("train_v2.csv")
y_train = train["price"]
X_train = train.drop(columns=["price", "neighbour_hood"])

In [3]:
# Load the held-out split; it is cleaned in the cells below before use.
test = pd.read_csv(filepath_or_buffer="test_v2.csv")


In [4]:
# Missing values per training feature.
X_train.isnull().sum()

build_year                 0
area                       0
floor                      0
rooms                      0
Parking                    0
Elevator                   0
Warehouse                  0
Luxury_features            0
mean                       0
std                       50
median                     0
amin                       0
amax                       0
neighbour_hood_encoded     0
dtype: int64

In [5]:
# Replace missing feature values with 0. Reassigning (instead of
# inplace=True) keeps the cell free of hidden in-place mutation.
X_train = X_train.fillna(0)


# Drop outliers from test

In [6]:
# Summary statistics of the numeric test columns, used to spot outliers.
test.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,build_year,area,floor,rooms,Parking,Elevator,Warehouse,Luxury_features,price,mean,std,median,amin,amax,neighbour_hood_encoded
count,5853.0,5853.0,5853.0,5853.0,5853.0,5853.0,5853.0,5853.0,5853.0,5840.0,5821.0,5840.0,5840.0,5840.0,5853.0
mean,1393.184521,139.908252,4.371604,2.389031,0.65932,0.626175,0.588758,0.202973,86448550.0,78000580.0,20292210.0,76202340.0,24823500.0,140228700.0,14.283957
std,7.333277,162.836499,5.887542,0.801909,0.473978,0.483859,0.492101,0.402247,126327900.0,29842140.0,9445204.0,28813490.0,19676490.0,51984620.0,5.322865
min,1362.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,100000.0,10000000.0,0.0,10000000.0,843220.0,10000000.0,-1.0
25%,1387.0,85.0,2.0,2.0,0.0,0.0,0.0,0.0,54364000.0,59341150.0,12174630.0,59545450.0,8783783.0,96700000.0,12.0
50%,1395.0,115.0,3.0,2.0,1.0,1.0,1.0,0.0,73033710.0,80210100.0,16991740.0,77419000.0,16500000.0,131313100.0,16.0
75%,1400.0,158.0,5.0,3.0,1.0,1.0,1.0,0.0,100000000.0,91821520.0,29784170.0,89285710.0,42100000.0,200000000.0,18.0
max,1402.0,10000.0,146.0,10.0,1.0,1.0,1.0,1.0,8800000000.0,151600000.0,54517520.0,162400000.0,135000000.0,204700000.0,21.0


In [7]:
# Areas in ascending order: shows the extreme values at both ends.
test["area"].sort_values(ascending=True)

1253        1
2801       15
5040       20
1402       20
5780       35
        ...  
1491     1200
2055     1500
275      1533
5622     2900
5582    10000
Name: area, Length: 5853, dtype: int64

In [8]:
# 95th percentile of area (candidate upper cut-off).
np.percentile(test["area"].to_numpy(), q=95)

300.0

In [9]:
# 0.1th percentile of area (candidate lower cut-off).
np.percentile(test["area"].to_numpy(), q=0.1)

35.852000000000004

In [10]:
# Keep rows whose area lies strictly between 35 and 300 (both bounds excluded).
area_in_range = test["area"].gt(35) & test["area"].lt(300)
test = test.loc[area_in_range].copy()

In [11]:
# Prices in ascending order: shows the extreme values at both ends.
test["price"].sort_values(ascending=True)

3029        100000
2926       2350000
3285       3000000
1473       3118000
5781       3118000
           ...    
2821     375000000
3722     472727272
821      679577464
5168    1000000000
751     1000000000
Name: price, Length: 5525, dtype: int64

In [12]:
# 99th percentile of price (candidate upper cut-off).
np.percentile(test["price"].to_numpy(), q=99)

230200000.0

In [13]:
# 1st percentile of price (candidate lower cut-off).
np.percentile(test["price"].to_numpy(), q=1)

12500000.0

In [14]:
# Keep rows whose price lies strictly between 12,500,000 and 230,200,000
# (both bounds excluded).
price_in_range = test["price"].gt(12500000.0) & test["price"].lt(230200000.0)
test = test.loc[price_in_range].copy()

In [15]:
# Missing values per test column after the outlier filters.
test.isnull().sum()

build_year                 0
neighbour_hood             3
area                       0
floor                      0
rooms                      0
Parking                    0
Elevator                   0
Warehouse                  0
Luxury_features            0
price                      0
mean                      10
std                       28
median                    10
amin                      10
amax                      10
neighbour_hood_encoded     0
dtype: int64

In [16]:
# Remove rows whose neighbour_hood_encoded is -1
# (presumably the sentinel for a neighbourhood that could not be encoded; TODO confirm).
# A boolean mask plus .copy() yields a fresh, independent frame instead of
# mutating `test` in place through an intermediate index.
test = test.loc[test["neighbour_hood_encoded"] != -1].copy()

In [17]:
# Re-check missing values after removing the -1 encoded rows.
test.isnull().sum()

build_year                 0
neighbour_hood             0
area                       0
floor                      0
rooms                      0
Parking                    0
Elevator                   0
Warehouse                  0
Luxury_features            0
price                      0
mean                       0
std                       18
median                     0
amin                       0
amax                       0
neighbour_hood_encoded     0
dtype: int64

In [18]:
# Replace the remaining NaNs with 0 (the same fill value used for X_train),
# then split into features and target. Reassignment replaces inplace=True so
# the cell does not mutate a frame that earlier cells displayed.
test = test.fillna(0)
y_test = test["price"]
X_test = test.drop(columns=["price", "neighbour_hood"])


# Model 1: simple Linear Regression

In [19]:
# Baseline: ordinary least squares on the unscaled features.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [20]:
# Test-set MAE (same units as price).
mean_absolute_error(y_test, y_pred)

14017696.788173012

In [21]:
# Training-set MAE, for comparison against the test MAE.
mean_absolute_error(y_train, y_pred_train)

13236664.427022291

In [22]:
# Mean training price: the scale used to normalise the MAE below.
np.mean(y_train)

73037784.84105167

In [23]:
# Test MAE expressed as a fraction of the mean *training* price.
test_mae = mean_absolute_error(y_test, y_pred)
test_mae / y_train.mean()

0.1919239037530916

# Model 2: simple Linear Regression + Scaling

In [24]:
# Standardise the features (zero mean, unit variance) using statistics
# learned from the training data only, then fit ordinary least squares.
# The arrays are named *_scaled because the scaler is a StandardScaler,
# not a MinMaxScaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred_train = model.predict(X_train_scaled)
y_pred = model.predict(X_test_scaled)

In [25]:
# Test-set MAE for the scaled linear model.
mean_absolute_error(y_test, y_pred)

14017696.787927654

In [26]:
# Training-set MAE for the scaled linear model.
mean_absolute_error(y_train, y_pred_train)

13236664.426895365

In [27]:
# Mean training price (normalisation scale for the relative error).
np.mean(y_train)

73037784.84105167

In [28]:
# Test MAE as a fraction of the mean training price.
test_mae = mean_absolute_error(y_test, y_pred)
test_mae / y_train.mean()

0.19192390374973226

# Model 3: simple Linear Regression+Polynomial + Scaling

In [29]:
# Degree-2 expansion restricted to interaction terms (no squared terms),
# without the constant bias column.
pf = PolynomialFeatures(
    degree=2,
    include_bias=False,
    interaction_only=True,
)


In [30]:

# Expand the features with interaction terms, standardise them, and fit
# ordinary least squares. Both transformers are fitted on the training
# data only; the test data is only transformed (never re-fitted), so the
# preprocessing learned from train is what gets applied to test.
scaler = StandardScaler()
X_train_poly = pf.fit_transform(X_train)
X_test_poly = pf.transform(X_test)
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)


model = LinearRegression()
model.fit(X_train_poly_scaled, y_train)
y_pred_train = model.predict(X_train_poly_scaled)
y_pred = model.predict(X_test_poly_scaled)

In [31]:
# Test-set MAE for the polynomial linear model.
mean_absolute_error(y_test, y_pred)

12545391.68120321

In [32]:
# Training-set MAE for the polynomial linear model.
mean_absolute_error(y_train, y_pred_train)

11803082.682680445

In [33]:
# Mean training price (normalisation scale for the score below).
np.mean(y_train)

73037784.84105167

In [34]:
# 1 minus (test MAE / mean training price): higher is better.
relative_mae = mean_absolute_error(y_test, y_pred) / y_train.mean()
1 - relative_mae

0.828234225497048

# Model 4: Lasso + Polynomial + Scaling

In [41]:
# Degree-2 expansion restricted to interaction terms (no squared terms),
# without the constant bias column.
pf = PolynomialFeatures(
    degree=2,
    include_bias=False,
    interaction_only=True,
)


In [42]:

# Expand the features with interaction terms, standardise them, and fit an
# L1-regularised linear model (Lasso). Both transformers are fitted on the
# training data only; the test data is only transformed (never re-fitted).
scaler = StandardScaler()
X_train_poly = pf.fit_transform(X_train)
X_test_poly = pf.transform(X_test)
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)


model = Lasso(alpha=10000, max_iter=10000000, tol=0.01)
model.fit(X_train_poly_scaled, y_train)
y_pred_train = model.predict(X_train_poly_scaled)
y_pred = model.predict(X_test_poly_scaled)

In [37]:
# Fitted coefficients of the current `model`.
# NOTE(review): presumably the Lasso fitted in the cell above; the execution
# counts are out of order, so confirm on a fresh Restart & Run All.
model.coef_

array([ 7.52471764e+06, -1.54919055e+07, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -4.21439616e+06,  0.00000000e+00, -2.44224305e+06,
       -6.24865463e+05, -8.81955713e+06, -0.00000000e+00,  0.00000000e+00,
       -2.10885200e+06, -0.00000000e+00,  8.55215352e+06, -3.36328862e+06,
        5.17216787e+05,  0.00000000e+00, -0.00000000e+00,  5.40507104e+04,
       -0.00000000e+00,  2.48489631e+07,  0.00000000e+00,  1.71306480e+06,
        2.09384126e+05,  0.00000000e+00,  1.35903347e+06,  2.19290965e+06,
        3.39580289e+06, -1.37280336e+06, -1.39608658e+06, -1.04429098e+06,
        5.99505312e+05, -7.65459792e+06,  2.30930501e+07, -0.00000000e+00,
       -1.15375376e+06,  5.07252722e+06,  1.19499899e+06, -3.64143946e+06,
        4.79440865e+06, -0.00000000e+00, -2.45979733e+06, -1.47471723e+06,
        0.00000000e+00, -1.17149447e+06,  0.00000000e+00,  1.86695685e+06,
        1.07916512e+06,  1.16008612e+06,  5.94545759e+04,  8.63382173e+05,
        6.92042033e+04,  

In [215]:
# Test-set MAE for the current model.
mean_absolute_error(y_test, y_pred)

13114677.032274464

In [39]:
# Training-set MAE for the current model.
mean_absolute_error(y_train, y_pred_train)

12271475.980353104

In [217]:
# Mean training price (normalisation scale for the score below).
np.mean(y_train)

73037784.84105167

In [40]:
# 1 minus (test MAE / mean training price): higher is better.
relative_mae = mean_absolute_error(y_test, y_pred) / y_train.mean()
1 - relative_mae

0.8209214867378558

In [194]:
# Coefficient of determination on the test set.
r2_score(y_test, y_pred)

0.7396207432908164

In [None]:
# 0-based *position* of the smallest absolute test error.
# y_test is converted to a plain NumPy array first so the result is
# unambiguously positional (usable with y_pred[...] and .iloc[...]),
# independent of how the pandas version in use implements Series.argmin.
np.argmin(np.abs(y_pred - y_test.to_numpy()))

2310

In [None]:
# Smallest absolute prediction error on the test set.
np.abs(y_pred - y_test).min()

3974.8389780297875

In [None]:
# 0-based *position* of the largest absolute test error.
# y_test is converted to a plain NumPy array first so the result is
# unambiguously positional (usable with y_pred[...] and .iloc[...]),
# independent of how the pandas version in use implements Series.argmax.
np.argmax(np.abs(y_pred - y_test.to_numpy()))

1909

In [None]:
# Largest absolute prediction error on the test set.
np.abs(y_pred - y_test).max()

127345541.43240748

In [None]:
# Inspect the test row at position 1909 (positional lookup, all columns).
# NOTE(review): 1909 is hard-coded from an earlier argmax output and will
# go stale if the data or model changes.
test.iloc[1909, :]

build_year                           1396
neighbour_hood                      مهران
area                                  126
floor                                 1.0
rooms                                   2
Parking                                 1
Elevator                                1
Warehouse                               1
Luxury_features                         0
price                            60317000
mean                      64591103.894737
std                        10707892.44523
median                         63942307.0
amin                           44705882.0
amax                          109914000.0
neighbour_hood_encoded                 13
Name: 1909, dtype: object

In [None]:
# Actual price of the row at *position* 1909.
# y_pred[1909] and test.iloc[1909] are positional, so this lookup must be
# positional too: `test` was filtered without resetting its index, so index
# labels and positions are not guaranteed to coincide and .loc could return
# a different row.
y_test.iloc[1909]

60317000

In [None]:
# Predicted price at position 1909 of the prediction array (positional
# indexing; compare with the actual price of the same test row).
y_pred[1909]

100121172.1567611