In [61]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings('ignore')

In [62]:
df = pd.read_csv('real_estate_data.csv')
print(df.shape)
df.head(10)

(403487, 17)


Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,address,furnished,heating_type,price,price_currency
0,1,Konut,Rezidans,12/10/18,1/9/19,2,30,0.0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3500.0,TRY
1,2,Konut,Daire,2/13/19,,1,14,0.0,20 ve üzeri,20 ve üzeri,1+0,43.0,İstanbul/Kartal/Kordonboyu,,Fancoil,490000.0,TRY
2,3,Konut,Daire,10/9/18,11/8/18,1,30,0.0,1,Yüksek Giriş,2+1,,Tekirdağ/Çorlu/Reşadiye,,Fancoil,155000.0,TRY
3,4,Konut,Rezidans,9/10/18,10/10/18,1,30,3.0,20 ve üzeri,20 ve üzeri,6+1,450.0,İstanbul/Beşiktaş/Levent,,Fancoil,32500000.0,TRY
4,5,Konut,Rezidans,12/10/18,1/9/19,1,30,0.0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,1450000.0,TRY
5,6,Konut,Rezidans,11/9/18,12/9/18,1,30,2.0,10-20 arası,10,1+1,45.0,İstanbul/Maltepe/Altayçeşme,,Fancoil,780000.0,TRY
6,7,Konut,Daire,1/4/19,,2,54,0.0,20 ve üzeri,14,3+1,160.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3750.0,TRY
7,8,Konut,Villa,10/3/18,1/3/19,1,92,0.0,4,,4+1,,İzmir/Urla/M. Fevzi Çakmak,,Fancoil,1500000.0,TRY
8,9,Konut,Daire,2/16/19,,1,11,,2,Kot 2,3+1,140.0,Çanakkale/Ayvacık/Küçükkuyu Bld. (Mıhlı),,Fancoil,1500000.0,TRY
9,10,Konut,Daire,12/26/18,12/26/18,1,0,1.0,1,Asma Kat,2+2,550.0,İstanbul/Fatih/Sarıdemir,,Fancoil,84256.0,GBP


## Описание данных
* id - уникальный идентификатор
* type - тип
* sub_type - подтип
* start_data - дата начала
* end_date - дата конца
* listing_type - тип
* tom - том
* building_age - возраст
* total_floor_count - количество этажей
* floor_no - номер этажа
* room_count - количество комнат
* size - размер
* address - адрес
* furnished - 
* heating_type - тип 
* price - цена
* price_currency - валюта

In [63]:
data = pd.DataFrame()

data.index =  df.columns
data['zeros'] = df.isna().sum()
data['dtypes'] = df.dtypes
data['unique'] = df.nunique()
data['shape'] = df.shape[0]
data

Unnamed: 0,zeros,dtypes,unique,shape
id,0,int64,403487,403487
type,0,object,1,403487
sub_type,0,object,12,403487
start_date,0,object,181,403487
end_date,137189,object,181,403487
listing_type,0,int64,3,403487
tom,0,int64,181,403487
building_age,27390,object,14,403487
total_floor_count,28021,object,12,403487
floor_no,35296,object,44,403487


In [64]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403487 entries, 0 to 403486
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 403487 non-null  int64  
 1   type               403487 non-null  object 
 2   sub_type           403487 non-null  object 
 3   start_date         403487 non-null  object 
 4   end_date           266298 non-null  object 
 5   listing_type       403487 non-null  int64  
 6   tom                403487 non-null  int64  
 7   building_age       376097 non-null  object 
 8   total_floor_count  375466 non-null  object 
 9   floor_no           368191 non-null  object 
 10  room_count         403487 non-null  object 
 11  size               257481 non-null  float64
 12  address            403487 non-null  object 
 13  furnished          0 non-null       float64
 14  heating_type       375517 non-null  object 
 15  price              402772 non-null  float64
 16  pr

In [65]:
df.duplicated().sum()

np.int64(0)

In [66]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,403487.0,201744.0,116476.8,1.0,100872.5,201744.0,302615.5,403487.0
listing_type,403487.0,1.294235,0.4677333,1.0,1.0,1.0,2.0,3.0
tom,403487.0,57.022739,44.35893,0.0,29.0,40.0,90.0,180.0
size,257481.0,279.349094,9429.195,1.0,85.0,110.0,140.0,948235.0
furnished,0.0,,,,,,,
price,402772.0,354641.661933,4809503.0,-250.0,2500.0,199000.0,342000.0,2000000000.0


In [67]:
print(df.isnull().sum())
df = df.dropna(subset=['price'])

id                        0
type                      0
sub_type                  0
start_date                0
end_date             137189
listing_type              0
tom                       0
building_age          27390
total_floor_count     28021
floor_no              35296
room_count                0
size                 146006
address                   0
furnished            403487
heating_type          27970
price                   715
price_currency          715
dtype: int64


In [68]:
df['size'].fillna(df['size'].mean(), inplace=True)
df['heating_type'].fillna(df['heating_type'].mode()[0], inplace=True)

In [69]:
categorical_cols = ['type', 'sub_type', 'room_count', 'heating_type', 'furnished', 'price_currency']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print(df_encoded.head())

   id start_date  end_date  listing_type  tom building_age total_floor_count  \
0   1   12/10/18    1/9/19             2   30            0       20 ve üzeri   
1   2    2/13/19       NaN             1   14            0       20 ve üzeri   
2   3    10/9/18   11/8/18             1   30            0                 1   
3   4    9/10/18  10/10/18             1   30            3       20 ve üzeri   
4   5   12/10/18    1/9/19             1   30            0       20 ve üzeri   

       floor_no        size                     address  ...  \
0             2   90.000000  İstanbul/Kartal/Kordonboyu  ...   
1   20 ve üzeri   43.000000  İstanbul/Kartal/Kordonboyu  ...   
2  Yüksek Giriş  279.415351     Tekirdağ/Çorlu/Reşadiye  ...   
3   20 ve üzeri  450.000000    İstanbul/Beşiktaş/Levent  ...   
4             2   90.000000  İstanbul/Kartal/Kordonboyu  ...   

   heating_type_Kombi (Elektrikli)  heating_type_Merkezi Sistem  \
0                            False                        False   


In [70]:
df_encoded = df_encoded.drop(columns=['start_date', 'end_date'])

In [71]:
X = df_encoded.drop('price', axis=1)
y = df_encoded['price']
numeric_cols = X.select_dtypes(include=['int64','float64']).columns

In [72]:
scaler = StandardScaler()

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [75]:
base_estimator = DecisionTreeRegressor(max_depth=5, random_state=42)

In [76]:
bagging_model = BaggingRegressor(estimator=base_estimator, n_estimators=10, bootstrap=True, random_state=42)
bagging_model.fit(X_train, y_train)

ValueError: could not convert string to float: '6-10 arası'

In [None]:
bagging_score = bagging_model.score(X_test, y_test)
print(f"Bagging model R^2 on test set: {bagging_score:.3f}")