In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
from requests.exceptions import ConnectionError
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

First of all we should find the number of pages, in oder to get loop through them and get the data

In [None]:
url = 'https://krisha.kz/prodazha/kvartiry/almaty-medeuskij/?page=1'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
pattern = r'\d+(?=\D*Дальше)'
filtr = r'[^0-9./]'
pages = soup.find_all('nav', class_="paginator")
for page in pages:
    val = page.get_text(strip = True)
    match = re.search(pattern, val)
numPages = int(match.group())
numPages

224

Here we loop through pages to get the all price data

In [None]:
all_prices = []
vals = []
val = []
session = requests.Session()
base_url = 'https://krisha.kz/prodazha/kvartiry/almaty-medeuskij/'
for pagenum in range(1,numPages//2):
    try:
        new_url = f'{base_url}?page={pagenum}'
        new_page = session.get(new_url)
        if new_page.status_code == 200:
          new_soup = BeautifulSoup(new_page.text, 'html.parser')
          # Extract prices
          prices = new_soup.find_all('div', class_='a-card__price')
          arr = [int(re.sub(r'\D', '', price.get_text(strip=True))) for price in prices]
          all_prices.extend(arr)

          # Extract info
          info = new_soup.find_all('div', class_="a-card__header-left")
          val = [re.sub(filtr, ' ',inf.get_text(strip=True, separator=',')) for inf in info]
          val = [va.split() for va in val]
          vals.extend(val)

        else:
          print(f"Error: Unable to fetch page {pagenum}. Status code: {new_page.status_code}")
          continue  # Move to the next page
    except ConnectionError as e:
      print(f"ConnectionError: {e}")
      time.sleep(5)  # Add a delay of 5 seconds before retrying
      continue  # Move to the next page
df = pd.DataFrame(vals,columns = ['Number of rooms', 'Area', 'Floor'])
df.sample(20)

Unnamed: 0,Number of rooms,Area,Floor
54,1,69.54,
528,4,148.0,20/21
72,3,115.6,
1257,2,62.0,6/7
1441,4,139.7,4/4
2200,1,40.0,3/12
762,2,70.0,3
1485,4,181.0,10/16
59,2,60.0,
94,1,58.2,


In [None]:
y = np.array(all_prices).astype(float)
y.dtype

dtype('float64')

Lets transform the features

In [None]:
def is_last_floor(arr):
  if len(arr) < 2:
    return False
  return True if int(arr[0])/int(arr[1]) == 1 else False
def is_first_floor(arr):
  if len(arr) < 2:
    return False
  return True if int(arr[0])/int(arr[1]) != 1 else False

In [None]:
df['Number of the floor'] = df['Floor'].map(lambda x: int(x.split('/')[0]), na_action='ignore')
df['Is_Last_floor'] = df['Floor'].map(lambda x: is_last_floor(x.split('/')), na_action='ignore')
df['Is_First_Floor'] = df['Floor'].map(lambda x: is_first_floor(x.split('/')), na_action='ignore')
df.drop('Floor', axis = 1, inplace = True)
df.sample(20)

Unnamed: 0,Number of rooms,Area,Number of the floor,Is_Last_floor,Is_First_Floor
2121,3,75.0,4.0,False,True
1269,3,80.0,5.0,True,False
584,3,76.0,3.0,False,True
533,3,59.7,5.0,True,False
1084,4,200.0,5.0,False,True
2119,3,147.0,4.0,False,True
1660,3,75.0,9.0,False,True
134,3,94.0,10.0,False,True
201,1,44.5,2.0,False,True
454,3,81.6,4.0,False,True


In [None]:
df = df.astype({'Number of rooms': 'category', 'Area':'float64'})

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2220 entries, 0 to 2219
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Number of rooms      2220 non-null   category
 1   Area                 2220 non-null   float64 
 2   Number of the floor  2050 non-null   float64 
 3   Is_Last_floor        2050 non-null   object  
 4   Is_First_Floor       2050 non-null   object  
dtypes: category(1), float64(2), object(2)
memory usage: 72.0+ KB


Lets preprocess the data

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ("encoder", OneHotEncoder())
])
cat_attribs = [x for x in df.columns if df[x].dtype !='float64']
num_attribs = [x for x in df.columns if df[x].dtype =='float64']

pipe = ColumnTransformer([
    ('num', num_pipe, num_attribs),
    ('cat', cat_pipe, cat_attribs)
])

In [83]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df,y,train_size = 0.8)
X_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [None]:
X_train_trans = pipe.fit_transform(X_train)
x_test_trans, x_val_trans = pipe.transform(x_test), pipe.transform(X_val)

Now lets train the models (The most juicy part)

In [85]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05)
model.fit(X_train_trans, y_train,
          early_stopping_rounds=50,
          eval_set=[(X_val, y_val)],
          eval_metric='rmse',
          verbose=True)

[0]	validation_0-rmse:78618074.50960
[1]	validation_0-rmse:75700762.76437
[2]	validation_0-rmse:72908890.08152
[3]	validation_0-rmse:70430362.09182
[4]	validation_0-rmse:68121166.27858
[5]	validation_0-rmse:65951522.23967
[6]	validation_0-rmse:63900807.45087
[7]	validation_0-rmse:62106187.85214
[8]	validation_0-rmse:60347566.94648
[9]	validation_0-rmse:58682639.90509
[10]	validation_0-rmse:57243096.71825
[11]	validation_0-rmse:55876374.45180
[12]	validation_0-rmse:54491884.99693
[13]	validation_0-rmse:53271238.61674
[14]	validation_0-rmse:52079136.88406
[15]	validation_0-rmse:51088500.47884
[16]	validation_0-rmse:50076743.88349
[17]	validation_0-rmse:49139039.81122
[18]	validation_0-rmse:48264042.97962
[19]	validation_0-rmse:47442098.39631
[20]	validation_0-rmse:46668049.83957
[21]	validation_0-rmse:45986239.96208




[22]	validation_0-rmse:45404087.30174
[23]	validation_0-rmse:44854894.04160
[24]	validation_0-rmse:44302709.83995
[25]	validation_0-rmse:43834303.22528
[26]	validation_0-rmse:43413606.87394
[27]	validation_0-rmse:43019168.37964
[28]	validation_0-rmse:42621771.77652
[29]	validation_0-rmse:42249469.16997
[30]	validation_0-rmse:41991809.06068
[31]	validation_0-rmse:41699561.26945
[32]	validation_0-rmse:41264001.22594
[33]	validation_0-rmse:41065026.71942
[34]	validation_0-rmse:40860375.14801
[35]	validation_0-rmse:40688106.21828
[36]	validation_0-rmse:40545750.20483
[37]	validation_0-rmse:40429863.32636
[38]	validation_0-rmse:40323633.46646
[39]	validation_0-rmse:40326028.35037
[40]	validation_0-rmse:40179798.61325
[41]	validation_0-rmse:40044960.45381
[42]	validation_0-rmse:39938960.02249
[43]	validation_0-rmse:39851975.92501
[44]	validation_0-rmse:39846192.82562
[45]	validation_0-rmse:39802849.69750
[46]	validation_0-rmse:39713326.92960
[47]	validation_0-rmse:39662144.10536
[48]	validat

In [86]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, x_test_trans, y_test, cv=6, scoring='r2')
scores.mean()

array([-0.49380888,  0.68851542,  0.66026201,  0.65173834,  0.73646877,
        0.70059225])

In [63]:
from sklearn.model_selection import GridSearchCV
parameters = {'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3],'max_depth':[3, 6, 9, 12], 'n_estimators':[50, 100, 250, 500], 'reg_alpha':[0, 0.1, 1, 100], 'reg_lambda':[0, 0.1, 1, 10]}
clf = GridSearchCV(model, parameters)
clf.fit(x_val_trans,y_val)

In [87]:
best_params = clf.best_params_
best_params

{'learning_rate': 0.2,
 'max_depth': 3,
 'n_estimators': 50,
 'reg_alpha': 0,
 'reg_lambda': 10}

In [93]:
model = xgb.XGBRegressor(learning_rate = 0.2, max_depth = 3, n_estimators = 50, reg_alpha = 0, reg_lambda = 10)
scores = cross_val_score(model, x_test_trans, y_test, cv=3, scoring='r2')
scores.mean()

0.7384988764943876

In [94]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train,y_train)
lin = cross_val_score(reg, x_test_trans, y_test, cv=6, scoring='r2')
lin.mean()

0.7150027387415344

Though we did not achieve the significant increase in accuracy, the main purpose of this project was to gain the practical experience in XGboost library. However, there is still not too much data too confirm the true accuracy difference.