# Imports

In [None]:
! pip install featuretools

Collecting featuretools
  Downloading featuretools-1.28.0-py3-none-any.whl (619 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/619.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/619.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m358.4/619.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m614.4/619.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.2/619.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting holidays<0.33,>=0.13 (from featuretools)
  Downloading holidays-0.32-py3-none-any.whl (754 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m754.4/754.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting woodwork>=0.23.0 (from featu

In [None]:
! pip install optuna



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import featuretools as ft
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import optuna

# Data loading

In [None]:
# Download the dataset archive from Google Drive by its file id.
! gdown 1I4UJIytuZtFO_0o-YKz7kqwAro97KLM_

Downloading...
From: https://drive.google.com/uc?id=1I4UJIytuZtFO_0o-YKz7kqwAro97KLM_
To: /content/estate_data.zip
  0% 0.00/1.74M [00:00<?, ?B/s] 60% 1.05M/1.74M [00:00<00:00, 8.66MB/s]100% 1.74M/1.74M [00:00<00:00, 12.8MB/s]


In [None]:
! unzip estate_data.zip

Archive:  estate_data.zip
  inflating: Home Sale Data.csv      


# Data preprocessing

In [None]:
# Fields in the extracted CSV are separated by semicolons.
data = pd.read_csv('Home Sale Data.csv', delimiter=';')

## Basic data preprocessing

In [None]:
# --- Number of rooms -------------------------------------------------------
# Flag studio listings before the column is turned into a number.
data['Studio'] = data['Number of rooms'].str.contains('Studio').astype(int)
# Collapse the open-ended bucket to its first two characters ('10').
many_rooms = data['Number of rooms'] == '10 and more than'
data.loc[many_rooms, 'Number of rooms'] = data.loc[many_rooms, 'Number of rooms'].str[:2]
# Split on '+' and add up the parts that are purely digits; any other part contributes nothing.
data['Number of rooms'] = data['Number of rooms'].apply(
    lambda rooms: sum(float(part) for part in rooms.split('+') if part.isdigit())
)

# --- Floor location --------------------------------------------------------
# Every non-numeric floor label becomes its own 0/1 indicator column and the
# original value is replaced by 0 so the column can be cast to int.
for floor in data['Floor location'].unique():
    if floor.isdigit():
        continue
    is_this_floor = data['Floor location'] == floor
    data[floor] = is_this_floor.astype(np.int64)
    data.loc[is_this_floor, 'Floor location'] = 0
data['Floor location'] = data['Floor location'].astype(int)

# --- Number of floors ------------------------------------------------------
tall_building = data['Number of floors'] == '30  and more than'
data.loc[tall_building, 'Number of floors'] = data.loc[tall_building, 'Number of floors'].str[:2]
data['Number of floors'] = data['Number of floors'].astype(int)

# --- Number of bathrooms ---------------------------------------------------
many_bathrooms = data['Number of bathrooms'] == '6 and more than'
data.loc[many_bathrooms, 'Number of bathrooms'] = data.loc[many_bathrooms, 'Number of bathrooms'].str[:2]
data.loc[data['Number of bathrooms'] == 'Absent', 'Number of bathrooms'] = 0
data['Number of bathrooms'] = data['Number of bathrooms'].astype(int)

# --- Price -----------------------------------------------------------------
# Drop the three-character suffix, strip the '.' separators and model log(price).
# regex=False is required: as a regular expression '.' matches every character,
# which would wipe the whole string on pandas versions where regex defaults to True.
data['Price'] = np.log(
    data['Price'].str[:-3].str.replace('.', '', regex=False).astype(int)
)

  data['Price'] = data['Price'].str.replace('.', '')


## Datetime conversion

In [None]:
# Cast both timestamp columns to nanosecond datetimes. The unit must be spelled
# out: the unit-less 'datetime64' alias is deprecated and rejected by newer pandas.
for date_column in ('Adrtisement Date', 'Pick Up Data Time'):
    data[date_column] = data[date_column].astype('datetime64[ns]')

  data['Adrtisement Date'] = data['Adrtisement Date'].astype('datetime64')


## Data encoding

In [None]:
# Every column still stored with the generic object dtype is treated as categorical.
categorical_columns = [
    name for name, dtype in data.dtypes.items() if dtype == 'object'
]

In [None]:
# Dense one-hot encoding; two-level categories collapse to a single indicator column.
encoder = OneHotEncoder(drop='if_binary', sparse_output=False)
encoded_data = encoder.fit_transform(data[categorical_columns])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
)

In [None]:
# Swap the raw categorical columns for their encoded counterparts.
# The index is reset so both frames line up row by row when concatenated.
non_categorical = data.drop(columns=categorical_columns).reset_index(drop=True)
data = pd.concat([non_categorical, encoded_df], axis=1)

## Feature engineering

In [None]:
# Columns handed to feature synthesis; everything else is passed through untouched.
columns_to_process = [
    'm² (Gross)',
    'm² (Net)',
    'Number of rooms',
    'Floor location',
    'Number of floors',
    'Number of bathrooms',
    'Adrtisement Date',
    'Pick Up Data Time',
]
columns_to_ignore = data.columns.drop(columns_to_process).to_list()

In [None]:
# Names of the transform primitives passed to featuretools.
primitives = [
    # arithmetic on numeric columns
    'square_root',
    'add_numeric',
    'divide_numeric',
    'multiply_numeric',
    'natural_logarithm',
    'subtract_numeric',
    # calendar features from datetime columns
    'is_weekend',
    'is_month_end',
    'day_of_year',
    'month',
    'is_federal_holiday',
    'is_month_start',
    'week',
    'weekday',
    'is_leap_year',
    'nth_week_of_month',
    'day',
    'year',
    'season',
]

In [None]:
# Register the frame as the only dataframe of the entity set, keyed by "index".
es = ft.EntitySet(id='data').add_dataframe(
    dataframe_name="data", dataframe=data, index="index"
)

# Generate features with the chosen transform primitives; the ignored columns
# are excluded from synthesis.
dfs_options = dict(
    entityset=es,
    target_dataframe_name='data',
    trans_primitives=primitives,
    ignore_columns={"data": columns_to_ignore},
)
feature_matrix, feature_defs = ft.dfs(**dfs_options)

feature_matrix.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,m² (Gross),m² (Net),Number of rooms,Floor location,Number of floors,Number of bathrooms,Floor location + Number of bathrooms,Floor location + Number of floors,Floor location + Number of rooms,Floor location + m² (Gross),...,m² (Gross) - m² (Net),m² (Net) - Number of bathrooms,m² (Net) - Number of floors,m² (Net) - Number of rooms,WEEK(Adrtisement Date),WEEK(Pick Up Data Time),WEEKDAY(Adrtisement Date),WEEKDAY(Pick Up Data Time),YEAR(Adrtisement Date),YEAR(Pick Up Data Time)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,50,40,2.0,5,13,1,6.0,18.0,7.0,55.0,...,10.0,39.0,27.0,38.0,14,2,2,2,2020,2020
1,42,42,2.0,0,3,1,1.0,3.0,2.0,42.0,...,0.0,41.0,39.0,40.0,23,2,0,4,2020,2020
2,55,45,2.0,0,7,1,1.0,7.0,2.0,55.0,...,10.0,44.0,38.0,43.0,1,2,1,4,2019,2020
3,62,46,2.0,0,5,1,1.0,5.0,2.0,62.0,...,16.0,45.0,41.0,44.0,52,2,6,4,2019,2020
4,70,55,2.0,0,4,1,1.0,4.0,2.0,70.0,...,15.0,54.0,51.0,53.0,1,2,2,1,2020,2020


In [None]:
# Encode the categorical features produced by the synthesis step.
feature_matrix_train_enc, features_enc = ft.encode_features(
    feature_matrix=feature_matrix,
    features=feature_defs,
)
feature_matrix_train_enc.head()

Unnamed: 0_level_0,m² (Gross),m² (Net),Number of rooms,Floor location,Number of floors,Number of bathrooms,Floor location + Number of bathrooms,Floor location + Number of floors,Floor location + Number of rooms,Floor location + m² (Gross),...,WEEKDAY(Pick Up Data Time) = 2,WEEKDAY(Pick Up Data Time) = 1,WEEKDAY(Pick Up Data Time) = 3,WEEKDAY(Pick Up Data Time) is unknown,YEAR(Adrtisement Date) = 2020,YEAR(Adrtisement Date) = 2019,YEAR(Adrtisement Date) = 2018,YEAR(Adrtisement Date) is unknown,YEAR(Pick Up Data Time) = 2020,YEAR(Pick Up Data Time) is unknown
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,50,40,2.0,5,13,1,6.0,18.0,7.0,55.0,...,True,False,False,False,True,False,False,False,True,False
1,42,42,2.0,0,3,1,1.0,3.0,2.0,42.0,...,False,False,False,False,True,False,False,False,True,False
2,55,45,2.0,0,7,1,1.0,7.0,2.0,55.0,...,False,False,False,False,False,True,False,False,True,False
3,62,46,2.0,0,5,1,1.0,5.0,2.0,62.0,...,False,False,False,False,False,True,False,False,True,False
4,70,55,2.0,0,4,1,1.0,4.0,2.0,70.0,...,False,True,False,False,True,False,False,False,True,False


In [None]:
# Re-attach the columns that were kept out of feature synthesis.
passthrough_columns = data[columns_to_ignore]
processed_data = pd.concat(
    [feature_matrix_train_enc, passthrough_columns],
    axis=1,
)

## Train/test split

In [None]:
# 70/30 split with a fixed seed; 'Price' is the target, everything else is a feature.
all_features = processed_data.drop(columns='Price')
all_labels = processed_data['Price']
train_features, test_features, train_labels, test_labels = train_test_split(
    all_features, all_labels, test_size=.3, random_state=17
)

## Price scaling

In [None]:
# Standardise the target; the scaler is fitted on the training labels only and
# then applied to the test labels. Both end up as (n, 1) column arrays.
scaler = StandardScaler()
train_labels_column = train_labels.to_numpy().reshape(-1, 1)
test_labels_column = test_labels.to_numpy().reshape(-1, 1)
train_labels = scaler.fit_transform(train_labels_column)
test_labels = scaler.transform(test_labels_column)

## Feature selection

In [None]:
# Fit a default LightGBM model whose feature importances drive the selection below.
feature_selector = lgb.LGBMRegressor(random_state=17, verbosity=-1)
# The scaled labels are an (n, 1) column; flatten them to the 1-D target the estimator expects.
feature_selector.fit(train_features, np.ravel(train_labels))



In [None]:
# Pair every training column with the importance the selector assigned to it.
importances_df = pd.DataFrame(
    {
        'name': train_features.columns,
        'coef': feature_selector.feature_importances_,
    }
)

In [None]:
# Show the 50 most important features as a markdown table.
top_importances = importances_df.sort_values(by='coef', ascending=False).head(50)
print(top_importances.to_markdown())

|      | name                                   |   coef |
|-----:|:---------------------------------------|-------:|
|  372 | District_Beşiktaş                      |     74 |
|  387 | District_Sarıyer                       |     60 |
|  381 | District_Kartal                        |     59 |
|   15 | m² (Gross) + Number of floors          |     58 |
|  365 | District_Bakırköy                      |     57 |
|  371 | District_Beyoğlu                       |     55 |
|  398 | District_Şişli                         |     53 |
|   39 | Number of rooms / m² (Gross)           |     47 |
|  328 | Throat                                 |     36 |
|  377 | District_Fatih                         |     35 |
|  396 | District_Üsküdar                       |     33 |
| 1008 | Building Age_0                         |     33 |
|  370 | District_Beylikdüzü                    |     31 |
|  361 | District_Arnavutköy                    |     30 |
|  380 | District_Kadıköy                       |     30

In [None]:
# Features with zero importance in the selector model.
has_zero_importance = importances_df['coef'] == 0
useless_features = importances_df.loc[has_zero_importance, 'name']

In [None]:
# Remove the zero-importance features from both splits so they keep identical columns.
train_features = train_features.drop(columns=useless_features)
test_features = test_features.drop(columns=useless_features)

# Data modeling

## Initial model

Model training

In [None]:
# Baseline: LightGBM with default hyperparameters on the selected features.
model = lgb.LGBMRegressor(random_state=17, verbosity=-1)
# Flatten the (n, 1) scaled labels to the 1-D target the estimator expects.
model.fit(train_features, np.ravel(train_labels))



Model evaluation

In [None]:
# MSE of the baseline model, measured on the standardised log-price labels.
initial_predictions = model.predict(test_features)
mean_squared_error(test_labels, initial_predictions)

0.03572483119121389

## Hyperparameter tuning

In [None]:
def objective(trial):
    """Optuna objective: validation L2 loss of a LightGBM regressor for one trial.

    A quarter of the training data is held out for validation. The split is
    seeded so that every trial is scored on the same rows (making trial values
    comparable) and so that a rerun of the study sees the same data.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``l2`` score LightGBM recorded for the validation set.
    """
    (split_train_features, split_val_features,
     split_train_labels, split_val_labels) = train_test_split(
        train_features, train_labels, test_size=0.25, random_state=17
    )

    param = {
        "verbosity": -1,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    trial_model = lgb.LGBMRegressor(
        random_state=17, **param
    )
    # Labels may arrive as (n, 1) columns; flatten them to the 1-D shape LightGBM expects.
    trial_model.fit(
        split_train_features, np.ravel(split_train_labels),
        eval_set=[(split_val_features, np.ravel(split_val_labels))],
    )
    # 'valid_0' is the name LightGBM gives to the first (and only) eval_set entry.
    return trial_model.best_score_['valid_0']['l2']

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is reproducible across reruns.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=17),
)
study.optimize(objective, n_trials=100)

In [None]:
# Summarise the search: number of trials, best objective value and its hyperparameters.
print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

Number of finished trials: 100
Best trial:
  Value: 0.03663771272039226
  Params: 
    lambda_l1: 0.0008473727994710632
    lambda_l2: 1.3528725287500655e-07
    num_leaves: 104
    feature_fraction: 0.9487317946398935
    bagging_fraction: 0.7537798500073645
    bagging_freq: 7
    min_child_samples: 17


Training tuned model

In [None]:
# Refit on the full training split with the best hyperparameters found by the study.
tuned_model = lgb.LGBMRegressor(random_state=17, verbosity=-1, **trial.params)
# Flatten the (n, 1) scaled labels to the 1-D target the estimator expects.
tuned_model.fit(train_features, np.ravel(train_labels))



Tuned model evaluation

In [None]:
# MSE of the tuned model, measured on the standardised log-price labels.
tuned_predictions = tuned_model.predict(test_features)
mean_squared_error(test_labels, tuned_predictions)

0.03327672736013775

Lab 3: MAPE

In [None]:
# Undo the standardisation and then the log transform to get back to raw prices.
scaled_predictions = tuned_model.predict(test_features).reshape(-1, 1)
inversed_labels = np.exp(scaler.inverse_transform(test_labels))
inversed_predictions = np.exp(scaler.inverse_transform(scaled_predictions))

In [None]:
# This section reports MAPE, so use the percentage-error metric on the raw price scale
# rather than MSE. Re-run the cell to refresh the stored output.
mean_absolute_percentage_error(inversed_labels, inversed_predictions)

360702954169.4948