In [1]:
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv('Divar_preproccessed.csv',low_memory=False)

In [3]:
df.shape

(601996, 32)

In [4]:
df = df[df.fixed_price]

In [5]:
df.shape

(559563, 32)

In [6]:
df = df.drop(['Unnamed: 0','fixed_price'] , axis=1)

In [7]:
# df.info()

In [8]:
X_sell = df[df.is_for_sell].drop(['is_for_sell','price'],axis=1)
y_sell = df[df.is_for_sell]['price']

X_rent = df[~df.is_for_sell].drop(['is_for_sell','price'],axis=1)
y_rent = df[~df.is_for_sell]['price']

In [9]:
from sklearn.preprocessing import StandardScaler

X_sell = pd.DataFrame(StandardScaler().fit_transform(X_sell),columns=X_sell.columns)
X_rent = pd.DataFrame(StandardScaler().fit_transform(X_rent),columns=X_rent.columns)

In [10]:
# X_sell.describe()

### sell

In [11]:
from sklearn.model_selection import train_test_split

x_train,x_val,y_train,y_val = train_test_split(X_sell,y_sell,test_size=.2,random_state=42)

In [12]:

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

poly = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly.fit_transform(x_train)
x_val_poly = poly.transform(x_val)

model = Ridge(alpha=1.0, random_state=42)
model.fit(x_train_poly, y_train)


y_train_pred_log = model.predict(x_train_poly)
y_val_pred_log = model.predict(x_val_poly)

y_train_pred = np.expm1(y_train_pred_log)
y_val_pred = np.expm1(y_val_pred_log)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: -5.908263128133433e+26
Validation R2: -4.083980616869665e+32
Validation MAE: 155232420773389.25
Validation RMSE: 4.2107351295325224e+16


In [13]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=8,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: 0.18796100600305732
Validation R2: 0.1521250231978446
Validation MAE: 0.8227976749355472
Validation RMSE: 1.9185892050535174


In [14]:
df.describe()['price']

count    5.595630e+05
mean     2.464879e+10
std      1.816022e+12
min      1.000000e+00
25%      2.300000e+07
50%      1.450000e+09
75%      4.000000e+09
max      1.030000e+15
Name: price, dtype: float64

In [15]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

lgb_train = lgb.Dataset(x_train, y_train_log)
lgb_val = lgb.Dataset(x_val, y_val_log, reference=lgb_train)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'verbose': -1
}

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(100)]
)

y_train_pred_log = gbm.predict(x_train, num_iteration=gbm.best_iteration)
y_val_pred_log = gbm.predict(x_val, num_iteration=gbm.best_iteration)

y_train_pred = np.expm1(y_train_pred_log)
y_val_pred = np.expm1(y_val_pred_log)

print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.137066	valid_1's rmse: 0.142204
[200]	training's rmse: 0.135517	valid_1's rmse: 0.142056
[300]	training's rmse: 0.134368	valid_1's rmse: 0.142011
Early stopping, best iteration is:
[298]	training's rmse: 0.134395	valid_1's rmse: 0.142004
Train R2: 0.225764009212288
Validation R2: 0.16437032991379663
Validation MAE: 0.8655712187111032
Validation RMSE: 1.904684349882061


In [16]:
q1 = y_sell.quantile(0.01)  
q99 = y_sell.quantile(0.99) 
X_sell = X_sell.reset_index(drop=True)
y_sell = y_sell.reset_index(drop=True)

mask = (y_sell >= q1) & (y_sell <= q99)
X_sell = X_sell[mask]
y_sell = y_sell[mask]

In [17]:
x_train,x_val,y_train,y_val = train_test_split(X_sell,y_sell,test_size=.2,random_state=42)

In [18]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

lgb_train = lgb.Dataset(x_train, y_train_log)
lgb_val = lgb.Dataset(x_val, y_val_log, reference=lgb_train)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'verbose': -1
}

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(100)]
)

y_train_pred_log = gbm.predict(x_train, num_iteration=gbm.best_iteration)
y_val_pred_log = gbm.predict(x_val, num_iteration=gbm.best_iteration)

y_train_pred = np.expm1(y_train_pred_log)
y_val_pred = np.expm1(y_val_pred_log)

print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 1.13478	valid_1's rmse: 1.14457
[200]	training's rmse: 1.11589	valid_1's rmse: 1.13212
[300]	training's rmse: 1.10506	valid_1's rmse: 1.12758
[400]	training's rmse: 1.09642	valid_1's rmse: 1.1243
[500]	training's rmse: 1.0882	valid_1's rmse: 1.12143
[600]	training's rmse: 1.08064	valid_1's rmse: 1.119
[700]	training's rmse: 1.07367	valid_1's rmse: 1.11704
[800]	training's rmse: 1.06743	valid_1's rmse: 1.11556
[900]	training's rmse: 1.0617	valid_1's rmse: 1.11474
[1000]	training's rmse: 1.05641	valid_1's rmse: 1.11405
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 1.05641	valid_1's rmse: 1.11405
Train R2: 0.5443975959146568
Validation R2: 0.5143486560162636
Validation MAE: 2133610097.6698034
Validation RMSE: 5677126833.8265705


In [19]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=8,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: 0.5609293316613361
Validation R2: 0.5347281819916417
Validation MAE: 2585857005.870631
Validation RMSE: 5556734840.465718


In [20]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=10,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

sell_model = model

Train R2: 0.6449574466531169
Validation R2: 0.5790048286603025
Validation MAE: 2380350348.14675
Validation RMSE: 5285728559.744901


In [21]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=18,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: 0.897657777856536
Validation R2: 0.6307222919464388
Validation MAE: 2028262821.8953586
Validation RMSE: 4950429104.923231


In [22]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=500,      
    max_depth=18,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: 0.8978177396605045
Validation R2: 0.6303510469011726
Validation MAE: 2028586974.0882704
Validation RMSE: 4952916881.149332


In [23]:
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.ensemble import RandomForestRegressor

# rf = RandomForestRegressor(
#     random_state=42,
#     n_jobs=-1,
#     n_estimators=300,
# )

# param_dist = {
#     'max_depth': [10, 15, 20, 25],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 0.3, 0.5]
# }

# search = RandomizedSearchCV(
#     estimator=rf,
#     param_distributions=param_dist,
#     n_iter=20,      
#     cv=3,
#     scoring='r2',
#     verbose=1,
#     random_state=42,
#     n_jobs=-1
# )

# search.fit(x_train, y_train)

# best_rf = search.best_estimator_
# print('Best params:', search.best_params_)

# y_train_pred = best_rf.predict(x_train)
# y_val_pred = best_rf.predict(x_val)

# print('Train R2:', r2_score(y_train, y_train_pred))
# print('Validation R2:', r2_score(y_val, y_val_pred))
# print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
# print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))


### rent

In [24]:
y_rent.describe()

count    1.916030e+05
mean     3.693189e+10
std      2.981470e+12
min      1.030000e+00
25%      8.450000e+06
50%      1.450000e+07
75%      2.800000e+07
max      1.030000e+15
Name: price, dtype: float64

In [25]:
q1 = y_rent.quantile(0.01)  
q99 = y_rent.quantile(0.99) 
X_rent = X_rent.reset_index(drop=True)
y_rent = y_rent.reset_index(drop=True)

mask = (y_rent >= q1) & (y_rent <= q99)
X_rent = X_rent[mask]
y_rent = y_rent[mask]

x_train,x_val,y_train,y_val = train_test_split(X_rent,y_rent,test_size=.2,random_state=42)

In [26]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=8,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: 0.6396442604098369
Validation R2: 0.6205708430271696
Validation MAE: 10135555.49371257
Validation RMSE: 21527859.67592459


In [27]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=10,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

rent_model = model

Train R2: 0.7167174450209719
Validation R2: 0.6611113721976827
Validation MAE: 9251194.803010456
Validation RMSE: 20345295.45786954


In [28]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=15,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: 0.8754618377023871
Validation R2: 0.6916256862355052
Validation MAE: 8390106.973998759
Validation RMSE: 19407723.736183103


In [29]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=18,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: 0.9260103729662579
Validation R2: 0.695484516635311
Validation MAE: 8235594.119723277
Validation RMSE: 19285912.55539861


In [30]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=500,      
    max_depth=15,            
    n_jobs=-1,            
    random_state=42
)

model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)


print('Train R2:', r2_score(y_train, y_train_pred))
print('Validation R2:', r2_score(y_val, y_val_pred))
print('Validation MAE:', mean_absolute_error(y_val, y_val_pred))
print('Validation RMSE:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Train R2: 0.87566395269647
Validation R2: 0.6919445381655918
Validation MAE: 8386418.741908082
Validation RMSE: 19397687.57179686


### final models

In [31]:
sell_model

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
rent_model

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True
