### Подключение модулей

In [None]:
!pip install polars numpy matplotlib seaborn scikit-learn



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as ps
import seaborn as sns

from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor, StackingRegressor, RandomForestRegressor, BaggingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Загрузка данных

In [8]:
# Colab-only: mount Google Drive so the Kaggle API token stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
# Copy the token from Drive into ~/.kaggle and restrict it to owner read/write.
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/Kaggle_API/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Download the zingatbi/zingat-real-estate dataset into /content/data and unzip it there.
# NOTE(review): --force presumably re-downloads even when a local copy exists; confirm against the kaggle CLI help.
!mkdir -p /content/data
!kaggle datasets download -d zingatbi/zingat-real-estate -p /content/data --unzip --force

Dataset URL: https://www.kaggle.com/datasets/zingatbi/zingat-real-estate
License(s): unknown
Downloading zingat-real-estate.zip to /content/data
  0% 0.00/8.24M [00:00<?, ?B/s]
100% 8.24M/8.24M [00:00<00:00, 2.06GB/s]


In [None]:
import glob

# Locate the CSV extracted by the download cell. Sorting makes the choice
# deterministic if the archive ever contains more than one CSV file.
csv_candidates = sorted(glob.glob('/content/data/*.csv'))
if not csv_candidates:
    raise FileNotFoundError(
        "No CSV file found in /content/data; run the Kaggle download cell first."
    )
csv_path = csv_candidates[0]

# Every later cell uses the pandas DataFrame API (isna, info, drop, apply, ...),
# so the file is read with pandas rather than polars.
df = pd.read_csv(csv_path)

### Описание данных
* id - уникальный идентификатор
* type - тип недвижимости
* sub_type - подтип
* start_date - дата начала объявления
* end_date - дата конца объявления
* listing_type - тип объявления
* tom - время на сайте
* building_age - возраст здания
* total_floor_count - количество этажей в здании
* floor_no - номер этажа
* room_count - количество комнат
* size - размер недвижимости
* address - адрес недвижимости
* furnished - наличие мебели
* heating_type - тип обогрева
* price - цена
* price_currency - валюта

In [None]:
# Per-column overview of the raw frame, one row per column of `df`.
# Built with a single pandas constructor call; the scalar row count is broadcast to every row.
data = pd.DataFrame(
    {
        'zeros': df.isna().sum(),    # count of missing (NaN) values, despite the column name
        'dtypes': df.dtypes,         # storage dtype of each column
        'unique': df.nunique(),      # number of distinct non-null values
        'shape': df.shape[0],        # total number of rows in df
    },
    index=df.columns,
)
data

Unnamed: 0,zeros,dtypes,unique,shape
id,0,int64,403487,403487
type,0,object,1,403487
sub_type,0,object,12,403487
start_date,0,object,181,403487
end_date,137189,object,181,403487
listing_type,0,int64,3,403487
tom,0,int64,181,403487
building_age,27390,object,14,403487
total_floor_count,28021,object,12,403487
floor_no,35296,object,44,403487


In [12]:
# Show the dtype and non-null count of every column.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403487 entries, 0 to 403486
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 403487 non-null  int64  
 1   type               403487 non-null  object 
 2   sub_type           403487 non-null  object 
 3   start_date         403487 non-null  object 
 4   end_date           266298 non-null  object 
 5   listing_type       403487 non-null  int64  
 6   tom                403487 non-null  int64  
 7   building_age       376097 non-null  object 
 8   total_floor_count  375466 non-null  object 
 9   floor_no           368191 non-null  object 
 10  room_count         403487 non-null  object 
 11  size               257481 non-null  float64
 12  address            403487 non-null  object 
 13  furnished          0 non-null       float64
 14  heating_type       375517 non-null  object 
 15  price              402772 non-null  float64
 16  pr

In [13]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [14]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,403487.0,201744.0,116476.8,1.0,100872.5,201744.0,302615.5,403487.0
listing_type,403487.0,1.294235,0.4677333,1.0,1.0,1.0,2.0,3.0
tom,403487.0,57.022739,44.35893,0.0,29.0,40.0,90.0,180.0
size,257481.0,279.349094,9429.195,1.0,85.0,110.0,140.0,948235.0
furnished,0.0,,,,,,,
price,402772.0,354641.661933,4809503.0,-250.0,2500.0,199000.0,342000.0,2000000000.0


### Трансформация

In [15]:
# Remove columns that are not used as model features. Reassigning instead of
# mutating in place keeps the step explicit and avoids the `inplace=True` idiom.
df = df.drop(columns=['id', 'type', 'furnished', 'start_date', 'end_date', 'address'])

In [16]:
# Keep only listings priced in TRY. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not write to a slice.
df = df.loc[df['price_currency'] == 'TRY'].copy()

In [17]:
# Integer codes for the `sub_type` labels. Code 9 is unused: 'Komple Bina'
# maps to 7 only.
_SUBTYPE_CODES = {
    'Rezidans': 1,
    'Daire': 2,
    'Villa': 3,
    'Müstakil Ev': 4,
    'Kooperatif': 5,
    'Yazlık': 6,
    'Komple Bina': 7,
    'Prefabrik Ev': 8,
    'Köşk / Konak / Yalı': 10,
    'Çiftlik Evi': 11,
    'Yalı Dairesi': 12,
    'Loft': 13,
}


def transform_subtype(val):
    """Map a property sub-type label to its integer code.

    Args:
        val: Sub-type label taken from the ``sub_type`` column.

    Returns:
        The integer code of a known label. Any other value (including
        non-string values such as NaN) is printed for inspection and
        ``None`` is returned.
    """
    # Non-string values can never match a label; skip the lookup for them.
    code = _SUBTYPE_CODES.get(val) if isinstance(val, str) else None
    if code is None:
        print(val)
        return None
    return code

In [18]:
# Replace each sub-type label with its integer code, element by element.
df['sub_type'] = df['sub_type'].map(transform_subtype)

In [19]:
# Integer codes for the known `heating_type` labels.
_HEATING_CODES = {
    'Fancoil': 1,
    'Yok': 2,
    'Kalorifer (Doğalgaz)': 3,
    'Kalorifer (Kömür)': 4,
    'Kombi (Elektrikli)': 5,
    'Klima': 6,
    'Kombi (Doğalgaz)': 7,
    'Merkezi Sistem (Isı Payı Ölçer)': 8,
    'Merkezi Sistem': 9,
    'Soba (Kömür)': 10,
    'Yerden Isıtma': 11,
    'Soba (Doğalgaz)': 12,
    'Güneş Enerjisi': 13,
    'Kalorifer (Akaryakıt)': 14,
    'Jeotermal': 15,
    'Kat Kaloriferi': 16,
}


def transform_heating(val):
    """Map a heating-type label to its integer code.

    Args:
        val: Heating label taken from the ``heating_type`` column.

    Returns:
        The code (1-16) of a known label, or 17 for anything else,
        including missing values.
    """
    if not isinstance(val, str):
        return 17
    return _HEATING_CODES.get(val, 17)

In [20]:
# Replace each heating label with its integer code; unknown or missing labels become 17.
df['heating_type'] = df['heating_type'].map(transform_heating)

In [None]:
# Age-range labels -> a single representative age in years.
age_mapping = {
    '6-10 arası': 8,
    '11-15 arası': 13,
    '16-20 arası': 18,
    '21-25 arası': 23,
    '26-30 arası': 28,
    '31-35 arası': 33,
    '36-40 arası': 38,
    '40 ve üzeri': 45,
}
df['building_age'] = df['building_age'].replace(age_mapping)

# to_numeric is a pandas function (polars has no equivalent under this name).
# Values that are still not numeric after the replacement become NaN.
df['building_age'] = pd.to_numeric(df['building_age'], errors='coerce')
# Fill missing ages with the column median.
building_age_median = df['building_age'].median()
df['building_age'] = df['building_age'].fillna(building_age_median)

In [None]:
# Range labels for the building height -> a single representative floor count.
floor_mapping = {
    '10-20 arası': 15,
    '20 ve üzeri': 25,
}

df['total_floor_count'] = df['total_floor_count'].replace(floor_mapping)
df['total_floor_count'] = pd.to_numeric(df['total_floor_count'])
# Assign the filled column back: calling fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to update df.
df['total_floor_count'] = df['total_floor_count'].fillna(round(df['total_floor_count'].mean()))
df['total_floor_count'] = df['total_floor_count'].astype(int)

In [None]:
# Textual floor labels -> numeric floor number.
floor_mapping = {
    '20 ve üzeri': 25,
    'Yüksek Giriş': 1,
    'Kot 2': 2,
    'Bahçe katı': 1,
    'Müstakil': 1,
    'Zemin Kat': 1,
    'Giriş Katı': 1,
    'Kot 4': 4,
    'Kot 1': 1,
    'Kot 3': 3,
    'Bodrum Kat': 0
}

df['floor_no'] = df['floor_no'].replace(floor_mapping)
# Labels that are neither numeric nor listed in the mapping become NaN here.
df['floor_no'] = pd.to_numeric(df['floor_no'], errors='coerce')
# Assign the filled column back: calling fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to update df.
df['floor_no'] = df['floor_no'].fillna(round(df['floor_no'].mean()))

In [24]:
# Hand-written literal listing room_count layout strings; it is not computed from df.
# The trailing comma makes the cell's value a one-element tuple that wraps the list.
['2+1', '1+0', '6+1', '1+1', '3+1', '4+1', '9+2', '5+2', '6+3',
       '6+2', '4+3', '9+5', '5+1', '3+2', '2+2', '4+2', '7+2', '9+4',
       '8+3', '+', '8+1', '10+1', '10+0', '9+3', '8+4', '7+3', '10+2',
       '8+2', '7+1', '5+3', '9+1', '0+0', '10+3', '10+4', '11+3', '15+5'],

(['2+1',
  '1+0',
  '6+1',
  '1+1',
  '3+1',
  '4+1',
  '9+2',
  '5+2',
  '6+3',
  '6+2',
  '4+3',
  '9+5',
  '5+1',
  '3+2',
  '2+2',
  '4+2',
  '7+2',
  '9+4',
  '8+3',
  '+',
  '8+1',
  '10+1',
  '10+0',
  '9+3',
  '8+4',
  '7+3',
  '10+2',
  '8+2',
  '7+1',
  '5+3',
  '9+1',
  '0+0',
  '10+3',
  '10+4',
  '11+3',
  '15+5'],)

In [None]:
# Room-layout strings -> integer category codes.
# NOTE(review): the codes are arbitrary labels, not actual room counts, so their
# numeric order (and the mean used for filling below) carries no real meaning.
room_mapping = {
    '2+1': 1,
    '1+0': 2,
    '6+1': 3,
    '1+1': 4,
    '3+1': 5,
    '4+1': 6,
    '9+2': 7,
    '5+2': 8,
    '6+3': 9,
    '6+2': 10,
    '4+3': 11,
    '9+5': 12,
    '5+1': 13,
    '3+2': 14,
    '2+2': 15,
    '4+2': 16,
    '7+2': 17,
    '9+4': 18,
    '8+3': 19,
    '8+1': 20,
    '10+1': 21,
    '10+0': 22,
    '9+3': 23,
    '8+4': 24,
    '7+3': 25,
    '10+2': 26,
    '8+2': 27,
    '7+1': 28,
    '5+3': 29,
    '9+1': 30,
    '0+0': 35,
    '10+3': 31,
    '10+4': 32,
    '11+3': 33,
    '15+5': 34,
}

df['room_count'] = df['room_count'].replace(room_mapping)
# Layout strings missing from the mapping (for example a bare '+') become NaN here.
df['room_count'] = pd.to_numeric(df['room_count'], errors='coerce')
# Assign the filled column back: calling fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to update df.
df['room_count'] = df['room_count'].fillna(round(df['room_count'].mean()))
df['room_count'] = df['room_count'].astype(int)

In [26]:
# Discard listings without a size; reassign rather than mutate in place.
df = df.dropna(subset=['size'])

In [27]:
# Remove the currency column from the feature set; reassign rather than mutate in place.
df = df.drop(columns=['price_currency'])

In [28]:
# Write the cleaned frame to disk without the index column.
df.to_csv('real_estate_data_clean.csv', index=False)

In [29]:
# Preview the first 25 rows of the cleaned frame.
df.head(25)

Unnamed: 0,sub_type,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,heating_type,price
0,1,2,30,0.0,25,2.0,1,90.0,1,3500.0
1,2,1,14,0.0,25,25.0,2,43.0,1,490000.0
3,1,1,30,3.0,25,25.0,3,450.0,1,32500000.0
4,1,1,30,0.0,25,2.0,1,90.0,1,1450000.0
5,1,1,30,2.0,15,10.0,4,45.0,1,780000.0
6,2,2,54,0.0,25,14.0,5,160.0,1,3750.0
8,2,1,11,3.0,2,2.0,5,140.0,1,1500000.0
10,4,1,13,0.0,1,1.0,5,125.0,1,2450000.0
11,1,2,13,2.0,25,25.0,5,165.0,1,7500.0
15,3,2,94,8.0,2,3.0,6,200.0,1,3600.0


### Обучение

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.ensemble import (BaggingRegressor, BaggingClassifier,
                              GradientBoostingRegressor, GradientBoostingClassifier,
                              StackingRegressor, StackingClassifier,
                              RandomForestRegressor, RandomForestClassifier)
from sklearn.linear_model import LinearRegression, LogisticRegression
import numpy as np

In [31]:
# Regression task: predict `price` from all remaining columns.
y_reg = df['price']
X_reg = df.drop(columns=['price'])

# 80/20 hold-out split with a fixed seed.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Classification task: predict the `sub_type` code from all remaining columns.
y_clf = df['sub_type']
X_clf = df.drop(columns=['sub_type'])

# 80/20 hold-out split with a fixed seed, stratified on the class label.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf,
    y_clf,
    test_size=0.2,
    random_state=42,
    stratify=y_clf,
)

### Bagging

In [33]:
# Bagging over random-forest base learners for the price regression task.
# random_state is set on the bagging wrapper as well: without it the bootstrap
# resampling is unseeded and the reported metrics change from run to run.
bagging_reg = BaggingRegressor(
    RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    random_state=42,
)
bagging_reg.fit(X_train_reg, y_train_reg)
y_pred_bag_reg = bagging_reg.predict(X_test_reg)
print("Bagging Regressor RMSE:", np.sqrt(mean_squared_error(y_test_reg, y_pred_bag_reg)))
print("Bagging Regressor R²:", r2_score(y_test_reg, y_pred_bag_reg))

Bagging Regressor RMSE: 4646757.185393809
Bagging Regressor R²: -0.039533925394650504


In [34]:
# Bagging over random-forest base learners for the sub-type classification task.
# random_state is set on the bagging wrapper as well: without it the bootstrap
# resampling is unseeded and the reported metrics change from run to run.
bagging_clf = BaggingClassifier(
    RandomForestClassifier(n_estimators=40, random_state=42, n_jobs=-1),
    random_state=42,
)
bagging_clf.fit(X_train_clf, y_train_clf)
y_pred_bag_clf = bagging_clf.predict(X_test_clf)
print("\nBagging Classifier accuracy:", accuracy_score(y_test_clf, y_pred_bag_clf))
print(classification_report(y_test_clf, y_pred_bag_clf))


Bagging Classifier accuracy: 0.9431946664587313
              precision    recall  f1-score   support

           1       0.86      0.35      0.49       996
           2       0.96      0.99      0.98     45224
           3       0.74      0.81      0.77      2635
           4       0.75      0.45      0.56      1155
           5       1.00      0.36      0.53        11
           6       0.70      0.29      0.41       735
           7       0.96      0.71      0.81       314
           8       0.96      0.91      0.93       109
          10       0.94      0.47      0.63        34
          11       1.00      0.35      0.52        57
          12       1.00      0.17      0.30        23
          13       1.00      0.20      0.33         5

    accuracy                           0.94     51298
   macro avg       0.91      0.50      0.61     51298
weighted avg       0.94      0.94      0.94     51298



### Boosting

In [35]:
# Gradient-boosted trees on the price regression split (fit returns the estimator itself).
gbr = GradientBoostingRegressor(random_state=42).fit(X_train_reg, y_train_reg)
y_pred_gbr = gbr.predict(X_test_reg)

# Hold-out metrics: RMSE is the square root of the mean squared error.
gbr_rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_gbr))
gbr_r2 = r2_score(y_test_reg, y_pred_gbr)
print("\nGradient Boosting Regressor RMSE:", gbr_rmse)
print("Gradient Boosting Regressor R²:", gbr_r2)


Gradient Boosting Regressor RMSE: 4562384.978385105
Gradient Boosting Regressor R²: -0.002126545026072124


In [36]:
# Gradient-boosted trees on the sub-type classification split (fit returns the estimator itself).
gbc = GradientBoostingClassifier(random_state=42).fit(X_train_clf, y_train_clf)
y_pred_gbc = gbc.predict(X_test_clf)

# Hold-out accuracy followed by the per-class precision/recall report.
gbc_accuracy = accuracy_score(y_test_clf, y_pred_gbc)
gbc_report = classification_report(y_test_clf, y_pred_gbc)
print("Gradient Boosting Classifier accuracy:", gbc_accuracy)
print(gbc_report)


Gradient Boosting Classifier accuracy: 0.9238956684471129
              precision    recall  f1-score   support

           1       0.68      0.19      0.30       996
           2       0.95      0.99      0.97     45224
           3       0.66      0.72      0.69      2635
           4       0.60      0.29      0.39      1155
           5       0.60      0.27      0.38        11
           6       0.48      0.06      0.11       735
           7       0.87      0.54      0.67       314
           8       0.78      0.78      0.78       109
          10       0.23      0.15      0.18        34
          11       0.25      0.12      0.16        57
          12       0.20      0.09      0.12        23
          13       0.14      0.20      0.17         5

    accuracy                           0.92     51298
   macro avg       0.54      0.37      0.41     51298
weighted avg       0.91      0.92      0.91     51298



### Stacking

In [38]:
# Base learners whose predictions feed the linear meta-model.
rf_base_reg = RandomForestRegressor(n_estimators=40, random_state=42, n_jobs=-1)
gbr_base_reg = GradientBoostingRegressor(random_state=42)
estimators_reg = [('rf', rf_base_reg), ('gbr', gbr_base_reg)]

stack_reg = StackingRegressor(
    estimators=estimators_reg,
    final_estimator=LinearRegression(),
)
stack_reg.fit(X_train_reg, y_train_reg)
y_pred_stack_reg = stack_reg.predict(X_test_reg)

# Hold-out metrics for the stacked model.
stack_reg_rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_stack_reg))
stack_reg_r2 = r2_score(y_test_reg, y_pred_stack_reg)
print("Stacking Regressor RMSE:", stack_reg_rmse)
print("Stacking Regressor R²:", stack_reg_r2)


Stacking Regressor RMSE: 4550294.032328327
Stacking Regressor R²: 0.0031779624420082975


In [41]:
# Base learners whose predictions feed the logistic-regression meta-model.
rf_base_clf = RandomForestClassifier(n_estimators=60, random_state=42, n_jobs=-1)
gbc_base_clf = GradientBoostingClassifier(random_state=42)
estimators_clf = [('rf', rf_base_clf), ('gbc', gbc_base_clf)]

stack_clf = StackingClassifier(
    estimators=estimators_clf,
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=-1,
)
stack_clf.fit(X_train_clf, y_train_clf)
y_pred_stack_clf = stack_clf.predict(X_test_clf)

In [42]:
# Hold-out accuracy followed by the per-class precision/recall report.
stack_clf_accuracy = accuracy_score(y_test_clf, y_pred_stack_clf)
stack_clf_report = classification_report(y_test_clf, y_pred_stack_clf)
print("Stacking Classifier accuracy:", stack_clf_accuracy)
print(stack_clf_report)



Stacking Classifier accuracy: 0.947093453935826
              precision    recall  f1-score   support

           1       0.80      0.48      0.60       996
           2       0.97      0.99      0.98     45224
           3       0.77      0.80      0.78      2635
           4       0.72      0.51      0.60      1155
           5       1.00      0.36      0.53        11
           6       0.69      0.39      0.50       735
           7       0.94      0.76      0.85       314
           8       0.97      0.90      0.93       109
          10       0.88      0.68      0.77        34
          11       0.76      0.51      0.61        57
          12       1.00      0.30      0.47        23
          13       0.00      0.00      0.00         5

    accuracy                           0.95     51298
   macro avg       0.79      0.56      0.63     51298
weighted avg       0.94      0.95      0.94     51298

