In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf

# from sklearn.impute import SimpleImputer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

print(f"TensorFlow (v{tf.version.VERSION})")

TensorFlow (v2.3.0)


In [2]:
physical_device = tf.config.list_physical_devices('GPU')[0]
try:
  tf.config.experimental.set_memory_growth(physical_device, True)
except:
  tf.print("Invalid device or cannot modify virtual devices once initialized. Device name {}".format(
      physical_device.name
  ))
  pass

In [3]:
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:XLA_CPU:0', device_type='XLA_CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU'),
 LogicalDevice(name='/device:XLA_GPU:0', device_type='XLA_GPU')]

In [116]:
df = pd.read_csv("data/train.csv")

df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,date,realty_type,price_type
0,Пермь,,COL_0,57.998207,56.292797,4,19,35,52,0,...,5.762963,5.530612,1964.118519,1960.959184,Пермский край,32.0,S27289,2020-01-05,10,0
1,Шатура,,COL_1,55.574284,39.543835,3,24,37,59,0,...,2.894366,3.527778,1952.321678,1957.222222,Московская область,280.0,S17052,2020-01-05,10,0
2,Ярославль,,COL_2,57.61914,39.850525,1,30,67,128,0,...,6.141414,7.222222,1968.15,1973.37037,Ярославская область,297.4,S16913,2020-01-05,110,0
3,Новокузнецк,,COL_3,53.897083,87.108604,0,0,5,21,0,...,8.581081,9.0,1992.716216,2014.0,Кемеровская область,190.0,S10148,2020-01-05,110,0
4,Москва,,COL_4,55.80259,37.48711,1,23,64,153,0,...,7.263889,5.684211,1963.229167,1960.5,Москва,60.2,S1338,2020-01-05,10,0


In [117]:
targets = df['per_square_meter_price']

In [118]:
df = df.drop(columns=[
    "id", "date", "per_square_meter_price",
    "osm_finance_points_in_0.001", "osm_finance_points_in_0.005", "osm_finance_points_in_0.0075", "osm_finance_points_in_0.01",
    "osm_hotels_points_in_0.005", "osm_hotels_points_in_0.0075", "osm_hotels_points_in_0.01",
    "osm_train_stop_points_in_0.005", "osm_train_stop_points_in_0.0075", "osm_train_stop_points_in_0.01",
    "osm_transport_stop_points_in_0.005", "osm_transport_stop_points_in_0.0075", "osm_transport_stop_points_in_0.01",
    "osm_crossing_points_in_0.001", "osm_crossing_points_in_0.005", "osm_crossing_points_in_0.0075", "osm_crossing_points_in_0.01",
    "osm_crossing_closest_dist"
])

In [119]:
def parse_floor(floor):
    try:
        return str(int(float(floor)))
    except Exception as e:
        if isinstance(floor, str):
            splits = [x.strip() for x in floor.split(',') if x.strip() != '']
            if len(splits) == 1:
                if re.match("[0-9]\s*-\s*[0-9]", floor):
                    return "multifloors"
                if re.match("[0-9]\s*-\s*\D", floor):
                    return floor.split('-')[0].strip()
                if "+" in floor:
                    return "multifloors"
                return str(floor)
            else:
                return "multifloors"
        if np.isnan(floor):
            return "nan"

In [120]:
df["floor"] = df['floor'].map(lambda x: parse_floor(x))
df['street'] = df['street'].fillna('nan')
df

Unnamed: 0,city,floor,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,...,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,realty_type,price_type
0,Пермь,,57.998207,56.292797,4,19,35,52,0,0,...,765.0,5.762963,5.530612,1964.118519,1960.959184,Пермский край,32.000000,S27289,10,0
1,Шатура,,55.574284,39.543835,3,24,37,59,0,0,...,514.0,2.894366,3.527778,1952.321678,1957.222222,Московская область,280.000000,S17052,10,0
2,Ярославль,,57.619140,39.850525,1,30,67,128,0,0,...,573.0,6.141414,7.222222,1968.150000,1973.370370,Ярославская область,297.400000,S16913,110,0
3,Новокузнецк,,53.897083,87.108604,0,0,5,21,0,0,...,54.0,8.581081,9.000000,1992.716216,2014.000000,Кемеровская область,190.000000,S10148,110,0
4,Москва,,55.802590,37.487110,1,23,64,153,0,1,...,619.0,7.263889,5.684211,1963.229167,1960.500000,Москва,60.200000,S1338,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279787,Томск,1,56.459183,84.979334,2,33,111,222,0,0,...,979.0,5.714286,5.882353,1972.260870,1973.460000,Томская область,358.320073,S11114,10,1
279788,Санкт-Петербург,1,59.936954,30.356383,10,274,718,1340,0,2,...,1659.0,4.719388,4.706667,1876.994898,1873.186667,Санкт-Петербург,119.637556,S28440,110,1
279789,Калининград,3,54.729233,20.514968,0,12,34,84,0,2,...,460.0,3.950413,4.885714,1964.258333,1970.571429,Калининградская область,312.789725,S6671,10,1
279790,Кемерово,1,55.360680,86.081460,5,57,100,134,0,0,...,429.0,4.691489,4.125000,1957.425532,1954.625000,Кемеровская область,89.201305,S17667,110,1


In [121]:
features = df.columns

numeric_features = df._get_numeric_data().columns
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy="constant", fill_value=-1))
])

categorical_features = list(set(features) - set(numeric_features))
categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy="constant", fill_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        # ('imputer', SimpleImputer(strategy="constant", fill_value=-1), features)
    ])

data_tranform = Pipeline(steps=[
    ('preprocessor', preprocessor),
])

model = GradientBoostingRegressor(verbose=3, n_estimators=200)

data = data_tranform.fit_transform(df)

In [122]:
x_train, x_test, y_train, y_test = train_test_split(data, targets.values, train_size=0.7)

In [123]:
BATCH_SIZE = 512

train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.batch(BATCH_SIZE)

test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_dataset.batch(BATCH_SIZE)

print(train_dataset)
print(test_dataset)

<BatchDataset shapes: ((None, 56), (None,)), types: (tf.float64, tf.float64)>
<BatchDataset shapes: ((None, 56), (None,)), types: (tf.float64, tf.float64)>


In [184]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer((56,)),
    tf.keras.layers.Reshape((56, 1)),
    tf.keras.layers.Conv1D(24, 1, kernel_initializer='glorot_uniform'),
    tf.keras.layers.Conv1D(24, 4, padding='same', kernel_initializer='glorot_uniform'),
    tf.keras.layers.PReLU(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, kernel_initializer='glorot_uniform', kernel_regularizer='l2'),
    
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.GaussianDropout(0.3),
    tf.keras.layers.Dense(128,  kernel_initializer='glorot_uniform'),
    tf.keras.layers.PReLU(),
    tf.keras.layers.Dense(1)
])

model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_7 (Reshape)          (None, 56, 1)             0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 56, 24)            48        
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 56, 24)            2328      
_________________________________________________________________
p_re_lu_23 (PReLU)           (None, 56, 24)            1344      
_________________________________________________________________
batch_normalization_8 (Batch (None, 56, 24)            96        
_________________________________________________________________
flatten_8 (Flatten)          (None, 1344)              0         
_________________________________________________________________
dense_28 (Dense)             (None, 512)             

In [185]:
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2,
    decay_steps=10000,
    decay_rate=0.9)

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.Huber()
)

In [186]:
class VerboseCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch == 0 or (epoch+1) % 10 == 0:
            print("Epoch {:4d}: loss - {:.3f}, val_loss - {:.3f}".format(
                epoch+1, logs['loss'], logs['val_loss']
            ))

In [187]:
checkpoint_filepath = 'tf_models/checkpoints/{epoch}-{val_loss}.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=0
)

In [206]:
model.fit(
    train_dataset,
    epochs=100,
    shuffle=True,
    validation_data=test_dataset,
    callbacks=[
        VerboseCallback(),
        model_checkpoint_callback
    ],
    verbose=0
)

Epoch    1: loss - 34879.145, val_loss - 35778.988
Epoch   10: loss - 34834.145, val_loss - 36059.051
Epoch   20: loss - 34868.512, val_loss - 35724.816
Epoch   30: loss - 34684.719, val_loss - 35817.691
Epoch   40: loss - 34689.711, val_loss - 35732.395
Epoch   50: loss - 34587.570, val_loss - 35596.582
Epoch   60: loss - 34358.301, val_loss - 35705.801
Epoch   70: loss - 34385.375, val_loss - 36147.090
Epoch   80: loss - 34285.598, val_loss - 35424.012
Epoch   90: loss - 34090.820, val_loss - 35306.184
Epoch  100: loss - 34229.738, val_loss - 35200.008


<tensorflow.python.keras.callbacks.History at 0x1c5c23836c8>

In [207]:
test_model = tf.keras.models.load_model('tf_models/checkpoints/100-35200.0078125.h5')
test_model.evaluate(test_dataset)



35200.0078125

In [208]:
def get_deviation(y_pred: float, y_true: float) -> float:
    return (y_true - y_pred) / y_true

def get_hit(deviation: float, w: float=1.1) -> float:
    if deviation < -0.6:
        return 9 * w
    if deviation >= -0.6 and deviation < -0.15:
        return w * ((1 + deviation / 0.15) ** 2)
    if deviation >= -0.15 and deviation < 0.15:
        return 0.0
    if deviation >= 0.15 and deviation < 0.6:
        return (deviation / 0.15 - 1) ** 2
    if deviation >= 0.6:
        return 9.0

def score(y_pred: list, y_true: list, w: float=1.1) -> float:
    y_pred = list(y_pred)
    y_true = list(y_true)
    cum_sum = 0
    for i in range(len(y_pred)):
        dev = get_deviation(y_pred[i], y_true[i])
        cum_sum += get_hit(dev, w)
    return cum_sum / len(y_pred)

pred = test_model.predict(x_test)

score(pred, y_test)

array([3.165604])

In [209]:
pred = test_model.predict(x_train)

score(pred, y_train)

array([2.87753186])

In [210]:
pred = test_model.predict(data)

score(pred, targets)

array([2.96233765])