In [142]:
import pandas as pd
import json
import numpy as np
import data_imputation as di
import scoring_functions as sf

# Load data

In [143]:
# Raw listing and building tables for the train and test splits.
apartments_train, buildings_train = (
    pd.read_csv('data/apartments_train.csv'),
    pd.read_csv('data/buildings_train.csv'),
)
apartments_test, buildings_test = (
    pd.read_csv('data/apartments_test.csv'),
    pd.read_csv('data/buildings_test.csv'),
)


def _read_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path) as handle:
        return json.load(handle)


# Metadata documents stored next to the tables.
apartments_meta = _read_json('data/apartments_meta.json')
buildings_meta = _read_json('data/buildings_meta.json')

# Both documents joined with `+` (presumably lists of column descriptions -- verify).
meta_info = apartments_meta + buildings_meta

In [144]:
def _attach_building_info(apartments, buildings):
    """Left-join every apartment row with the row of its building.

    The buildings table is keyed by its ``id`` column and matched against the
    apartments' ``building_id``; apartments without a match keep NaN values.
    """
    buildings_by_id = buildings.set_index('id')
    return apartments.merge(
        buildings_by_id,
        how='left',
        left_on='building_id',
        right_index=True,
    )


train = _attach_building_info(apartments_train, buildings_train)
test = _attach_building_info(apartments_test, buildings_test)

In [145]:
# Test rows without a longitude get a fixed, hard-coded fallback position.
# `.loc` is used because `.at` is the scalar accessor: handing it a whole Index
# only works through a version-dependent fallback path in pandas.
ind = test[test["longitude"].isna()].index
test.loc[ind, "latitude"] = 55.56776345324702
test.loc[ind, "longitude"] = 37.48171529826662

In [146]:
# Rows without a district are assigned the extra category 12
# (labelled "Other_district" in one_hot_encoding).
# `.loc` replaces `.at`, which is documented for single-value access only.
ind = train[train.district.isna()].index
train.loc[ind, "district"] = 12
ind = test[test.district.isna()].index
test.loc[ind, "district"] = 12

In [147]:
# Alternative regression targets derived from the raw price:
# log(1 + price) and the price per square metre of total area.
train["log_price"] = np.log1p(train["price"])
train["price_per_m2"] = train["price"].div(train["area_total"])

# Feature Engineering

In [148]:
import feature_engineering as fe

def construct_features(data, feature_config):
    """Apply the engineered-feature builders enabled in ``feature_config``.

    Each builder of the ``feature_engineering`` module is called with ``data``
    and its return value is ignored (presumably the builders add columns in
    place -- verify against ``feature_engineering``).

    Parameters:
        data: DataFrame handed to every enabled builder.
        feature_config: mapping of feature flag -> bool. ``add_metro_lines`` is
            only read when ``distance_to_metro`` is enabled. The whole mapping
            is also forwarded to ``fe.wealthy_districts_features``.

    Returns:
        The same ``data`` object that was passed in.
    """
    if feature_config["distance_to_center"]:
        fe.distance_to_center_feature(data)
    if feature_config["distance_to_metro"]:
        with_lines = feature_config["add_metro_lines"]
        fe.distance_to_metro_feature(data, with_lines)

    # (config flag, builder name) pairs in application order; builders are
    # looked up lazily so a disabled feature never touches the module.
    single_argument_builders = (
        ("distance_to_hospital", "distance_to_hospital_feature"),
        ("bathrooms", "bathrooms_feature"),
        ("bearing", "bearing_feature"),
        ("elevator", "elevator_feature"),
        ("has_seller", "has_seller_feature"),
        ("room_size_avg", "room_size_avg_feature"),
        ("area_total_bins", "area_total_bins_feature"),
        ("distance_to_airport", "distance_to_airport_feature"),
        ("distance_to_state_uni", "distance_to_state_university_feature"),
        ("distance_to_tech_uni", "distance_to_tech_uni_feature"),
        ("area_total_log", "area_total_log_feature"),
        ("remaining_area", "remaining_area_feature"),
    )
    for flag, builder_name in single_argument_builders:
        if feature_config[flag]:
            getattr(fe, builder_name)(data)

    # Always runs; it receives the full config and decides on its own what to add.
    fe.wealthy_districts_features(data, feature_config)
    return data

# Imputation

In [149]:
def impute_area_living_bins(train_data, test_data):
    """Fill missing ``area_living`` values of both frames in place.

    First ``di.impute_with_bin_mean`` is run for each frame with ``area_total``
    as the bin feature (its return value is ignored, so it is assumed to write
    into ``data`` -- verify). Whatever is still missing afterwards is set to
    85 % of the row's ``area_total``.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "area_living"
    frames = (("train_data", train_data), ("test_data", test_data))

    for _, frame in frames:
        di.impute_with_bin_mean(data=frame, train_data=train_data, test_data=test_data, feature=feature, bin_feature="area_total")

    # Vectorized fallback (replaces a row-by-row iterrows loop).
    for _, frame in frames:
        frame[feature] = frame[feature].fillna(0.85 * frame["area_total"])

    for label, frame in frames:
        print("Remaining nan values in {} for {}: {}".format(label, feature, int(frame[feature].isna().sum())))
    return train_data, test_data

In [150]:
def impute_area_kitchen_bins(train_data, test_data):
    """Fill missing ``area_kitchen`` values of both frames in place.

    First ``di.impute_with_bin_mean`` is run for each frame with ``area_total``
    as the bin feature (its return value is ignored, so it is assumed to write
    into ``data`` -- verify). Whatever is still missing afterwards is set to
    5 % of the row's ``area_total``.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "area_kitchen"
    frames = (("train_data", train_data), ("test_data", test_data))

    for _, frame in frames:
        di.impute_with_bin_mean(data=frame, train_data=train_data, test_data=test_data, feature=feature, bin_feature="area_total")

    # Vectorized fallback (replaces a row-by-row iterrows loop).
    for _, frame in frames:
        frame[feature] = frame[feature].fillna(0.05 * frame["area_total"])

    for label, frame in frames:
        print("Remaining nan values in {} for {}: {}".format(label, feature, int(frame[feature].isna().sum())))
    return train_data, test_data

In [151]:
def impute_bathrooms_bins(train_data, test_data):
    """Fill missing ``bathrooms`` values of both frames in place.

    ``di.impute_with_bin_mean`` is run first for each frame with ``area_total``
    as the bin feature. Rows that are still missing afterwards get 5 when
    ``area_total <= 800`` and 6 when ``area_total > 800``.

    The fallback is restricted to rows whose ``bathrooms`` value is still NaN.
    Previously it overwrote the column for every row, discarding both the
    observed and the bin-imputed values.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "bathrooms"
    frames = (("train_data", train_data), ("test_data", test_data))

    for _, frame in frames:
        di.impute_with_bin_mean(data=frame, train_data=train_data, test_data=test_data, feature=feature, bin_feature="area_total")

    for _, frame in frames:
        # Mask is taken once, before either assignment, so the two rules cannot interact.
        still_missing = frame[feature].isna()
        frame.loc[still_missing & (frame["area_total"] <= 800), feature] = 5
        frame.loc[still_missing & (frame["area_total"] > 800), feature] = 6

    for label, frame in frames:
        print("Remaining nan values in {} for {}: {}".format(label, feature, int(frame[feature].isna().sum())))
    return train_data, test_data

In [152]:
def impute_constructed_bins(train_data, test_data):
    """Fill missing ``constructed`` values of both frames in place.

    Rows flagged ``new == 1`` without a construction year are set to 2021.
    The rest is handed to ``di.impute_with_bin_mean`` with ``district`` as the
    bin feature and 12 bins.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "constructed"
    frames = (("train_data", train_data), ("test_data", test_data))

    for _, frame in frames:
        # Boolean mask with `.loc`; `.at` is a scalar accessor and must not receive an Index.
        new_without_year = frame[feature].isna() & (frame["new"] == 1)
        frame.loc[new_without_year, feature] = 2021

    for _, frame in frames:
        di.impute_with_bin_mean(data=frame, train_data=train_data, test_data=test_data, feature=feature, bins=12, bin_feature="district")

    for label, frame in frames:
        print("Remaining nan values in {} for {}: {}".format(label, feature, int(frame[feature].isna().sum())))
    return train_data, test_data

In [153]:
def impute_material_bins(train_data, test_data):
    """Impute ``material`` via ``di.impute_with_bin_mean`` (20 bins over ``constructed``).

    The helper is called once per frame, then the number of values still
    missing is printed for each frame.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "material"
    labelled_frames = (("train_data", train_data), ("test_data", test_data))
    for _, frame in labelled_frames:
        di.impute_with_bin_mean(
            data=frame,
            train_data=train_data,
            test_data=test_data,
            feature=feature,
            bins=20,
            bin_feature="constructed",
        )
    for label, frame in labelled_frames:
        missing_count = int(frame[feature].isna().sum())
        print("Remaining nan values in {} for {}: {}".format(label, feature, missing_count))
    return train_data, test_data

In [154]:
def impute_material_category(train_data, test_data):
    """Replace missing ``material`` values with category 7, in place.

    Category 7 is the one labelled "Other_material" in ``one_hot_encoding``.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "material"
    frames = (("train_data", train_data), ("test_data", test_data))
    for _, frame in frames:
        # Mask-based `.loc` assignment; `.at` is meant for single cells only.
        frame.loc[frame[feature].isna(), feature] = 7
    for label, frame in frames:
        print("Remaining nan values in {} for {}: {}".format(label, feature, int(frame[feature].isna().sum())))
    return train_data, test_data

In [155]:
def impute_heating_bins(train_data, test_data):
    """Impute ``heating`` via ``di.impute_with_bin_mean`` (20 bins over ``constructed``).

    The helper is called once per frame, then the number of values still
    missing is printed for each frame.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "heating"
    labelled_frames = (("train_data", train_data), ("test_data", test_data))
    for _, frame in labelled_frames:
        di.impute_with_bin_mean(
            data=frame,
            train_data=train_data,
            test_data=test_data,
            feature=feature,
            bins=20,
            bin_feature="constructed",
        )
    for label, frame in labelled_frames:
        missing_count = int(frame[feature].isna().sum())
        print("Remaining nan values in {} for {}: {}".format(label, feature, missing_count))
    return train_data, test_data

In [156]:
def impute_heating_category(train_data, test_data):
    """Replace missing ``heating`` values with category 4, in place.

    Category 4 is the one labelled "Other_heating" in ``one_hot_encoding``.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "heating"
    frames = (("train_data", train_data), ("test_data", test_data))
    for _, frame in frames:
        # Mask-based `.loc` assignment; `.at` is meant for single cells only.
        frame.loc[frame[feature].isna(), feature] = 4
    for label, frame in frames:
        print("Remaining nan values in {} for {}: {}".format(label, feature, int(frame[feature].isna().sum())))
    return train_data, test_data

In [157]:
def impute_parking_bins(train_data, test_data):
    """Fill missing ``parking`` values of both frames in place.

    ``di.impute_with_bin_mean`` is run first for each frame with ``area_total``
    as the bin feature; values that are still missing afterwards are set to 0.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "parking"
    frames = (("train_data", train_data), ("test_data", test_data))

    for _, frame in frames:
        di.impute_with_bin_mean(data=frame, train_data=train_data, test_data=test_data, feature=feature, bin_feature="area_total")

    for _, frame in frames:
        # Mask-based `.loc` assignment; `.at` is meant for single cells only.
        frame.loc[frame[feature].isna(), feature] = 0

    for label, frame in frames:
        print("Remaining nan values in {} for {}: {}".format(label, feature, int(frame[feature].isna().sum())))
    return train_data, test_data

In [158]:
def impute_parking_category(train_data, test_data):
    """Replace missing ``parking`` values with category 3, in place.

    Category 3 is the one labelled "No_parking" in ``one_hot_encoding``.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "parking"
    frames = (("train_data", train_data), ("test_data", test_data))
    for _, frame in frames:
        # Mask-based `.loc` assignment; `.at` is meant for single cells only.
        frame.loc[frame[feature].isna(), feature] = 3
    for label, frame in frames:
        print("Remaining nan values in {} for {}: {}".format(label, feature, int(frame[feature].isna().sum())))
    return train_data, test_data

In [159]:
def impute_elevator_without_bins(train_data, test_data):
    """Impute ``elevator_without`` via ``di.impute_with_bin_mean`` (20 bins over ``stories``).

    The helper is called once per frame, then the number of values still
    missing is printed for each frame.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    feature = "elevator_without"
    labelled_frames = (("train_data", train_data), ("test_data", test_data))
    for _, frame in labelled_frames:
        di.impute_with_bin_mean(
            data=frame,
            train_data=train_data,
            test_data=test_data,
            feature=feature,
            bins=20,
            bin_feature="stories",
        )
    for label, frame in labelled_frames:
        missing_count = int(frame[feature].isna().sum())
        print("Remaining nan values in {} for {}: {}".format(label, feature, missing_count))
    return train_data, test_data

In [204]:
def impute_remaining_nans(train_data, test_data):
    """Fill every remaining gap in the listed columns of both frames, in place.

    * ``seller``, ``layout`` and ``condition`` receive the extra categories
      4, 3 and 4 (the "Other_*" labels of ``one_hot_encoding``).
    * Each remaining numeric column is passed to ``di.impute_with_bin_mean``
      (binned on ``area_total``); leftovers are filled with the rounded mean of
      the column's observed values, computed separately for each frame.

    The number of values still missing is printed after each step.

    Returns:
        ``(train_data, test_data)`` -- the same objects that were passed in.
    """
    labelled_frames = (("train_data", train_data), ("test_data", test_data))
    report = "Remaining nan values in {} for {}: {}"

    category_defaults = (("seller", 4), ("layout", 3), ("condition", 4))
    for column, default in category_defaults:
        for _, frame in labelled_frames:
            frame.loc[:, column] = frame.loc[:, column].fillna(default)
    for column, _ in category_defaults:
        for label, frame in labelled_frames:
            print(report.format(label, column, len(frame[frame[column].isna()])))

    numeric_columns = (
        "ceiling", "bathrooms_shared", "bathrooms_private", "windows_court",
        "windows_street", "garbage_chute", "balconies", "loggias", "phones",
        "elevator_passenger", "elevator_service",
    )
    for feat in numeric_columns:
        for _, frame in labelled_frames:
            di.impute_with_bin_mean(data=frame, train_data=train_data, test_data=test_data, feature=feat, bin_feature="area_total")
        for _, frame in labelled_frames:
            # Rounded mean of the observed values of this frame's own column.
            fill_value = frame[feat].dropna().mean(axis=0).round()
            frame.loc[:, feat] = frame.loc[:, feat].fillna(fill_value)
        for label, frame in labelled_frames:
            print(report.format(label, feat, len(frame[frame[feat].isna()])))

    return train_data, test_data

# Outliers

In [161]:
def remove_outliers(train_data):
    """Placeholder for a custom outlier filter; returns ``train_data`` unchanged.

    ``prepare_data`` assigns the return value back to its training frame, so
    the previous bare ``pass`` (an implicit ``None``) would have replaced the
    training set with ``None`` as soon as ``config["remove_outliers"]`` was
    switched on.
    """
    return train_data

In [162]:
def remove_min_max_outliers(train_data, test_data, features):
    """Drop training rows that fall outside the value range seen in the test set.

    For every column in ``features`` a training row is removed when its value
    is greater than the test maximum or smaller than the test minimum. NaN
    values never compare as out of range, so such rows are kept.

    Parameters:
        train_data: training frame; it is not modified.
        test_data: frame whose per-column min/max define the accepted range.
        features: iterable of column names present in both frames.

    Returns:
        A filtered copy of ``train_data`` (original index labels are kept).
    """
    # One positional mask instead of repeated in-place label drops; min/max are
    # read per requested column instead of running describe() twice on the
    # whole frame (which also skipped non-numeric columns such as booleans).
    keep = np.ones(len(train_data), dtype=bool)
    for feature in features:
        lower = test_data[feature].min()
        upper = test_data[feature].max()
        column = train_data[feature]
        out_of_range = (column > upper) | (column < lower)
        keep &= ~out_of_range.to_numpy()

    t = train_data[keep].copy()
    print("Samples before: {} | Samples after: {}".format(len(train_data), len(t)))
    return t

# Categorical Data

In [163]:
def one_hot_encoding(train_data, test_data, config):
    """One-hot encode the categorical columns listed in ``config``.

    For every name in ``config["one_hot_encode_features"]`` the matching
    code -> label mapping is passed to ``fe.one_hot_encode`` for the training
    frame and then for the test frame, with ``remove_org=False``. The highest
    code of each mapping is the "Other_*" / "No_parking" category that the
    category imputers in this notebook assign to missing values.

    Returns:
        ``(train_data, test_data)`` as passed in (``fe.one_hot_encode``'s
        return value is ignored, so it presumably adds columns in place --
        verify).
    """
    # Insertion order of the inner mappings is kept exactly as authored, since
    # the encoder may derive its column order from it.
    code_labels = {
        "parking": {
            0: "Ground",
            1: "Underground",
            2: "Multilevel",
            3: "No_parking",
        },
        "heating": {
            0: "Central",
            1: "Individual",
            2: "Boiler",
            3: "Autonomous_boiler",
            4: "Other_heating",
        },
        "material": {
            0: "Bricks",
            1: "Wood",
            2: "Monolith",
            3: "Panel",
            4: "Block",
            5: "Monolithic_brick",
            6: "Stalin_project",
            7: "Other_material",
        },
        "district": {
            3: "East",
            6: "South-West",
            5: "South",
            4: "South-East",
            0: "Central",
            2: "North-East",
            1: "North",
            8: "North-West",
            7: "West",
            11: "Novomoskovsk",
            10: "Troitsk",
            9: "Zelenograd",
            12: "Other_district",
        },
        "seller": {
            0: "Owner",
            1: "Company",
            2: "Agents",
            3: "Developer",
            4: "Other_seller",
        },
        "layout": {
            0: "Adjacent",
            1: "Isolated",
            2: "Adjacent_isolated",
            3: "Other_layout",
        },
        "condition": {
            0: "Undecorated",
            1: "Decorated",
            2: "Euro_repair",
            3: "Special_design",
            4: "Other_condition",
        },
    }
    for column in config["one_hot_encode_features"]:
        labels = code_labels[column]
        for frame in (train_data, test_data):
            fe.one_hot_encode(frame, labels, column, remove_org=False)
    return train_data, test_data

# Feature Selection

In [164]:
def get_features(train_data, nan_threshold, corr_threshold):
    """Select columns by missing-value share and rank correlation with ``log_price``.

    Parameters:
        train_data: frame containing a ``log_price`` column.
        nan_threshold: maximum accepted fraction of missing values per column.
        corr_threshold: minimum accepted absolute Spearman correlation with
            ``log_price``.

    Returns:
        ``pandas.Index`` with the names of the numeric/boolean columns that
        pass both thresholds (``log_price`` itself included), ordered by
        descending missing-value count.
    """
    # Restrict to numeric/bool columns explicitly: recent pandas versions no
    # longer drop string columns silently inside corr(). This also removes the
    # extra Pearson corr() that was computed only to obtain the column names.
    numeric = train_data.select_dtypes(include=["number", "bool"])
    spearman = numeric.corr(method="spearman")["log_price"]
    nan_count = numeric.isna().sum().reindex(spearman.index)
    corr_df = pd.DataFrame(
        {
            "corr": spearman,
            "nan_count": nan_count,
            "nan_percentage": nan_count / train_data.shape[0],
        }
    ).sort_values(by="nan_count", ascending=False)
    rel_feats = corr_df[(corr_df.nan_percentage <= nan_threshold) & (corr_df["corr"].abs() >= corr_threshold)]
    return rel_feats.index

# Training

In [165]:
from sklearn.neighbors import BallTree
from tqdm import tqdm
def val_split_using_test(train_data, test_data, target_features, ratio=0.2, random_state=None):
    """Build a validation set from the training rows that lie closest to test rows.

    ``round(ratio * len(train_data))`` rows are sampled from ``test_data``. For
    each sampled row the nearest remaining training row (BallTree over
    ``area_total``, ``latitude``, ``longitude``, ``floor``, ``rooms`` and
    ``district``) is moved into the validation set, so no training row is
    picked twice.

    Parameters:
        train_data: training frame; it is not modified.
        test_data: test frame to sample the reference rows from.
        target_features: column name(s) returned as targets.
        ratio: validation share of the training rows.
        random_state: optional seed forwarded to ``DataFrame.sample``.

    Returns:
        ``(d_train, d_val, train_y, val_y)``. ``d_val`` carries an extra
        ``index`` column (kept for compatibility) holding the row position in
        the re-indexed training frame.
    """
    # BallTree answers with row *positions*. Work on a positional index so the
    # lookup stays correct even when the incoming index has gaps (for example
    # after rows were dropped by an outlier filter).
    base = train_data.reset_index(drop=True)

    val_sample_count = round(ratio * train_data.shape[0])
    features = ["area_total", "latitude", "longitude", "floor", "rooms", "district"]
    test_samples = test_data.sample(n=val_sample_count, random_state=random_state)

    feature_matrix = base[features].to_numpy(dtype=float)
    remaining = np.arange(len(base))  # positions still available for training
    chosen = []                       # positions moved to validation
    for _, test_sample in tqdm(test_samples.iterrows(), total=val_sample_count):
        # The tree is rebuilt on the remaining rows so a row cannot be chosen twice.
        tree = BallTree(feature_matrix[remaining])
        query = test_sample[features].to_numpy(dtype=float).reshape(1, -1)
        nearest = tree.query(query, return_distance=False)[0][0]
        chosen.append(remaining[nearest])
        remaining = np.delete(remaining, nearest)

    # Slicing the frame (instead of rebuilding it from row Series) keeps dtypes intact.
    d_train = base.iloc[remaining].reset_index(drop=True)
    d_val = base.iloc[chosen].reset_index()
    train_y = d_train[target_features]
    val_y = d_val[target_features]
    return d_train, d_val, train_y, val_y

In [166]:
from sklearn.model_selection import KFold
from tqdm import tqdm

def cross_validation(models, train_data, features, model_targets, k=5):
    """Run k-fold cross-validation for every model and report its mean scores.

    Parameters:
        models: mapping of model name -> estimator.
        train_data: frame holding the feature columns plus ``log_price``,
            ``price_per_m2`` and ``price``.
        features: feature column names used as model input.
        model_targets: mapping of model name -> target column the model fits.
        k: number of folds.

    Returns:
        dict of model name -> mean validation score across the folds.
    """
    target_columns = ["log_price", "price_per_m2", "price"]
    mean_val_scores = {}

    for name, estimator in models.items():
        val_scores, fit_scores = single_model_cross_validation(
            model=estimator,
            k=k,
            X_train=train_data[features],
            y_train=train_data[target_columns],
            target_feature=model_targets[name],
        )
        print("Model: {} | Mean train rsmle: {} | Standard deviation: {}".format(name, fit_scores.mean(), fit_scores.std()))
        print("Model: {} | Mean val rsmle: {} | Standard deviation: {}".format(name, val_scores.mean(), val_scores.std()))
        mean_val_scores[name] = val_scores.mean()

    return mean_val_scores
    
def single_model_cross_validation(model, k, X_train, y_train, target_feature):
    """Cross-validate one model and score it on the price scale.

    The model is fitted on ``target_feature`` in each of ``k`` shuffled folds
    (fixed seed 42). Predictions and ground truth are converted back to prices
    before scoring with ``sf.root_mean_squared_log_error``:

    * ``"log_price"``    -> ``expm1`` (inverse of the ``log1p`` used to build it)
    * ``"price_per_m2"`` -> multiplied by the row's ``area_total``

    Parameters:
        model: estimator exposing ``fit`` and ``predict``.
        k: number of folds.
        X_train: feature frame; must contain ``area_total``.
        y_train: frame containing the ``target_feature`` column.
        target_feature: ``"log_price"`` or ``"price_per_m2"``.

    Returns:
        ``(scores, train_scores)`` -- arrays with one validation score and one
        training score per fold.

    Raises:
        ValueError: for any other ``target_feature``; raised before any model
            is fitted.
    """
    # Fail fast: previously an unsupported target was only detected after the
    # first (possibly expensive) fit.
    if target_feature not in ("log_price", "price_per_m2"):
        raise ValueError("Unknown target feature!")

    kf = KFold(
        n_splits=k,
        shuffle=True,
        random_state=42
    )
    feature_values = X_train.values
    area_total_values = X_train.area_total.values
    ground_truth = y_train[target_feature].values

    def to_price(values, rows):
        # Map target-space values of the given rows back to absolute prices.
        if target_feature == "log_price":
            return np.expm1(values)
        return values * area_total_values[rows]

    scores = []
    train_scores = []
    for train_index, test_index in tqdm(kf.split(feature_values), total=k):
        model.fit(feature_values[train_index], ground_truth[train_index])
        train_pred = to_price(model.predict(feature_values[train_index]), train_index)
        test_pred = to_price(model.predict(feature_values[test_index]), test_index)
        y_true_train = to_price(ground_truth[train_index], train_index)
        y_true = to_price(ground_truth[test_index], test_index)

        train_scores.append(sf.root_mean_squared_log_error(y_true=y_true_train, y_pred=train_pred))
        scores.append(sf.root_mean_squared_log_error(y_true=y_true, y_pred=test_pred))

    return np.array(scores), np.array(train_scores)

In [167]:
def train_models(X_train, X_val, y_train, y_val, models, model_config):
    """Fit every model on a fixed train/validation split and score it on prices.

    Predictions are mapped back to the price scale according to the model's
    target (``"log_price"`` -> ``expm1``, ``"price_per_m2"`` -> times
    ``area_total``; any other target is used as predicted). Besides the
    per-model scores, the score of a weighted average of all validation
    predictions (weights ``1 / score ** 4``) is printed.

    Parameters:
        X_train, X_val: feature frames (``area_total`` is needed for
            ``price_per_m2`` models).
        y_train, y_val: frames holding every target column plus ``price``.
        models: mapping of model name -> estimator.
        model_config: mapping of model name -> target column.

    Returns:
        dict of model name -> validation score.
    """
    rsmle_scores = {}
    val_preds = []
    for model_name, model in models.items():
        target_value = model_config[model_name]
        model.fit(X_train, y_train[target_value])
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)
        if target_value == "log_price":
            # expm1 is the exact inverse of the log1p used to build log_price.
            train_pred = np.expm1(train_pred)
            val_pred = np.expm1(val_pred)
        elif target_value == "price_per_m2":
            # Explicit positional multiplication; avoids the in-place
            # ndarray *= Series round trip through pandas.
            train_pred = train_pred * X_train["area_total"].to_numpy()
            val_pred = val_pred * X_val["area_total"].to_numpy()
        rsmle_train = sf.root_mean_squared_log_error(y_true=y_train["price"], y_pred=train_pred)
        rsmle_val = sf.root_mean_squared_log_error(y_true=y_val["price"], y_pred=val_pred)
        rsmle_scores[model_name] = rsmle_val
        print("Model: {} | RSMLE Train: {} | RSMLE Val: {}".format(model_name, rsmle_train, rsmle_val))
        val_preds.append(val_pred)

    # Ensemble: better (lower) scores get a much larger weight.
    weights = 1 / np.array([rsmle_scores[name] for name in models], dtype=float) ** 4
    avg_prediction = np.average(val_preds, weights=weights, axis=0)
    avg_rsmle = sf.root_mean_squared_log_error(y_true=y_val["price"], y_pred=avg_prediction)
    print("Avg. RSMLE Val: {}".format(avg_rsmle))

    return rsmle_scores

In [224]:
# Switches for the optional engineered features; the trailing comments record the
# effect each feature had on the validation score when it was tried.
# NOTE(review): "top_floor" and "distance_to_ulitsa_ostozhenka" are not read by
# construct_features itself; the whole dict is forwarded to
# fe.wealthy_districts_features, which may or may not use them -- confirm.
feature_config = {
    "distance_to_center": True,
    "distance_to_metro": True, # slight improvement
    "distance_to_hospital": True,
    "elevator": False, # not improving
    "has_seller": False, # deteriorating
    "bathrooms": False, # not improving
    "bearing": True, # slight improvement
    "top_floor": False,
    "distance_to_airport": False, # no improvement
    "add_metro_lines": True, # slight improvement; only read when distance_to_metro is enabled
    "room_size_avg": False, # deteriorating
    "area_total_bins": False, # not improving
    "area_total_log": False, # not improving
    "remaining_area": False, # deteriorates significantly
    "distance_to_state_uni": True, # improved slightly
    "distance_to_tech_uni": True, # improved slightly
    "is_in_khamovniki": True, # improving
    "is_in_yakimanka": True, # improving
    "is_in_arbat": True,
    "is_in_presnensky": True,
    "is_in_tverskoy": True,
    "distance_to_ulitsa_ostozhenka": False
}

print("Constructing features...")
# Work on copies so the merged `train` / `test` frames stay untouched.
train_with_feats = train.copy()
test_with_feats = test.copy()
# construct_features returns the frame it was given, so `train_data` / `test_data`
# are the same objects as `train_with_feats` / `test_with_feats`.
train_data = construct_features(train_with_feats, feature_config)
test_data = construct_features(test_with_feats, feature_config)

Constructing features...


In [225]:
def prepare_data(train_data, test_data, config):
    """Impute, encode, select features and filter outliers according to ``config``.

    Steps, in order:
      1. optional per-column imputers chosen by the ``config`` modes
         (``"impute_bins"`` / ``"impute_category"``), followed by
         ``impute_remaining_nans``;
      2. one-hot encoding when ``config["one_hot_encoding"]`` is set;
      3. feature selection with ``get_features``, minus the id/target/raw
         categorical columns and ``config["drop_features"]``;
      4. optional outlier removal on the training frame.

    Returns:
        ``(train_data, test_data, features)``.
    """
    ###### data imputation ######
    print("Imputing missing data...")
    # (config key, {mode: imputer}) in execution order. The order matters:
    # `material` and `heating` are binned on `constructed`, which is imputed first.
    imputation_plan = (
        ("area_living", {"impute_bins": impute_area_living_bins}),
        ("area_kitchen", {"impute_bins": impute_area_kitchen_bins}),
        ("bathrooms", {"impute_bins": impute_bathrooms_bins}),
        ("constructed", {"impute_bins": impute_constructed_bins}),
        ("material", {"impute_bins": impute_material_bins, "impute_category": impute_material_category}),
        ("heating", {"impute_bins": impute_heating_bins, "impute_category": impute_heating_category}),
        ("parking", {"impute_bins": impute_parking_bins, "impute_category": impute_parking_category}),
        ("elevator_without", {"impute_bins": impute_elevator_without_bins}),
    )
    for config_key, imputers_by_mode in imputation_plan:
        imputer = imputers_by_mode.get(config[config_key])
        if imputer is not None:
            train_data, test_data = imputer(train_data, test_data)
    train_data, test_data = impute_remaining_nans(train_data, test_data)

    ###### categorical features ######
    print("One-hot-encoding categorical features...")
    if config["one_hot_encoding"]:
        train_data, test_data = one_hot_encoding(train_data, test_data, config)

    ###### feature selection ######
    print("Feature selection...")
    # Identifiers, targets and raw categorical codes are never model inputs.
    never_model_inputs = ["id", "building_id", "heating", "material", "log_price", "price", "price_per_m2", "parking", "district"]
    selected = get_features(train_data=train_data, nan_threshold=config["nan_threshold"], corr_threshold=config["corr_threshold"])
    features = selected.drop(never_model_inputs, errors="ignore").drop(config["drop_features"], errors="ignore")
    print("Features: {}".format(features.to_numpy()))

    ###### outliers ######
    print("Removing outliers...")
    if config["remove_outliers"]:
        train_data = remove_outliers(train_data)
    if config["remove_min_max_outliers"]:
        train_data = remove_min_max_outliers(train_data, test_data, features)
    return train_data, test_data, features

In [226]:
from sklearn.model_selection import train_test_split

def training(config):
    """Prepare the data, evaluate the configured models and optionally predict.

    Parameters:
        config: dict with (at least) the keys read below: ``training_method``,
            ``models``, ``model_targets``, ``split_ratio`` (for the split-based
            methods), ``predict``, ``retrain_on_full_dataset``,
            ``predict_method`` and everything ``prepare_data`` needs.

    Returns:
        The validation scores returned by the selected training method (a dict
        of model name -> score for the methods defined in this notebook), or
        ``None`` when ``training_method`` matches none of the known methods.

    Side effects:
        With ``config["predict"]`` set, writes ``submission.csv`` with the
        columns ``id`` and ``price_prediction``.

    Raises:
        Exception: when ``predict_method`` is ``"weighted_average"`` but no
            scores are available.
    """
    train_data = train_with_feats.copy()
    test_data = test_with_feats.copy()
    target_columns = ["log_price", "price_per_m2", "price"]

    ###### prepare data ######
    train_data, test_data, features = prepare_data(train_data, test_data, config)

    ###### training ######
    # Stays None for an unrecognised method (training is skipped) instead of
    # leaving the name unbound and failing later with a NameError.
    scores = None
    method = config["training_method"]
    if method == "based_on_test":
        X_train, X_val, y_train, y_val = val_split_using_test(train_data, test_data, target_columns, config["split_ratio"])
        scores = train_models(X_train[features], X_val[features], y_train, y_val, config["models"], config["model_targets"])
    elif method == "cross_validation":
        scores = cross_validation(models=config["models"], train_data=train_data, features=features, model_targets=config["model_targets"])
    elif method == "random":
        X_train, X_val, y_train, y_val = train_test_split(train_data[features], train_data[target_columns], test_size=config["split_ratio"])
        scores = train_models(X_train, X_val, y_train, y_val, config["models"], config["model_targets"])
    elif method == "kfold_stacking":
        # kfold_stacking is not defined in the cells shown here; it must exist elsewhere in the notebook.
        scores = kfold_stacking(train_data[features], train_data[target_columns], config["models"], config["model_targets"])

    ###### prediction ######
    if config["predict"]:
        predictions = []
        for model_name, model in config["models"].items():
            target_feature = config["model_targets"][model_name]
            if config["retrain_on_full_dataset"]:
                model.fit(train_data[features], train_data[target_feature])
            model_pred = np.asarray(model.predict(test_data[features]), dtype=float)
            if target_feature == "log_price":
                model_pred = np.expm1(model_pred)
            elif target_feature == "price_per_m2":
                model_pred = model_pred * test_data["area_total"].to_numpy()
            # Any other target (e.g. "price") is already on the price scale.
            predictions.append(model_pred)

        if config["predict_method"] == "weighted_average":
            if not scores:
                raise Exception("Scores missing. Averaging not possible.")
            # Weights are looked up by model name so they line up with `predictions`.
            weights = [1 / scores[model_name] ** 4 for model_name in config["models"]]
            pred = np.average(predictions, weights=weights, axis=0)
        else:
            # Plain mean. Previously this line ran unconditionally and
            # silently replaced the weighted average computed above.
            pred = np.mean(predictions, axis=0)

        submission = pd.DataFrame({
            "id": test_data["id"].to_numpy(),
            "price_prediction": pred,
        })
        submission.to_csv('submission.csv', index=False)

    return scores

# Models

In [241]:
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Random forest, used with the "price_per_m2" target (see model_targets).
# Commented-out lines are earlier hyper-parameter values kept for reference.
# NOTE(review): recent scikit-learn releases name the squared-error criterion
# 'squared_error' instead of 'mse' -- confirm against the installed version.
# NOTE(review): max_features=55 requires at least 55 selected feature columns -- confirm.
model1 = RandomForestRegressor(
    n_estimators=200,
    criterion='mse',
    n_jobs=2,
    random_state=42,
    #max_features=24,
    #max_features='log2',
    #max_features=41,
    max_features=55,
    bootstrap=False,
    max_depth=23,
    #max_depth=24,
    #max_depth=28,
    min_samples_split=2,
    #min_samples_split=5,
    #min_samples_split=3,
    #min_samples_leaf=2
    min_samples_leaf=1,
    min_weight_fraction_leaf=1.6746306849395426e-07
)
# Tuned values recorded for the forest:
# params={'max_depth': 23, 'min_samples_split': 2, 'max_features': 55, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 1.6746306849395426e-07}

# LightGBM regressor, used with the "log_price" target.
# NOTE(review): LightGBM documents min_child_samples as an alias of
# min_data_in_leaf, yet both are passed here with different values (99 vs 12);
# verify which one takes effect and drop the other.
model2 = lgbm.LGBMRegressor(
    random_state=42,
    learning_rate=0.1,
    n_estimators=3000,
    boosting_type='gbdt',
    n_jobs=2,
    num_leaves=20, # 20 previous
    #num_leaves=214,
    #min_data_in_leaf=25, # 25 previous
    #min_data_in_leaf=19,
    #min_data_in_leaf=7
    min_data_in_leaf=12,
    #max_depth=25,
    max_depth=17,
    #max_depth=33,
    #max_bin=150, # 150 previous
    #max_bin=354,
    #max_bin=156,
    max_bin=160,
    #min_child_samples=53,
    #min_child_samples=61,
    min_child_samples=99,
    #reg_alpha=0.0010639765274088186,
    #reg_alpha=0.049327251551963845,
    reg_alpha=2.1439788525591673e-07,
    #reg_lambda=9.735845024294461
    #reg_lambda=0.004337386035445012
    reg_lambda=1.0826317913353088e-07,
)
# Tuning result from the test-based validation split. Note that num_leaves (119)
# and max_depth (15) recorded here differ from the values used above (20 and 17).
# params={'num_leaves': 119, 'min_data_in_leaf': 12, 'max_depth': 15, 'max_bin': 160, 'lambda_l1': 2.1439788525591673e-07, 'lambda_l2': 1.0826317913353088e-07, 'min_child_samples': 99}

# CatBoost regressor, used with the "log_price" target.
model3 = CatBoostRegressor(
    n_estimators=2000,
    #learning_rate=0.2,
    learning_rate=0.22343174523534018,
    #l2_leaf_reg=3,
    #l2_leaf_reg=0.00814449542977775,
    l2_leaf_reg=0.0001351899206097861,
    #bagging_temperature=1,
    #bagging_temperature=20,
    bagging_temperature=8,
    #min_data_in_leaf=21,
    min_data_in_leaf=14,
    thread_count=2,
    depth=7,
    #depth=10,
    silent=True,
    random_seed=42,
)
# Recorded tuning result (its depth of 10 differs from the depth=7 used above):
# 'depth': 10, 'l2_leaf_reg': 0.0001351899206097861, 'bagging_temperature': 8, 'min_data_in_leaf': 14

# scikit-learn gradient boosting, used with the "log_price" target.
# NOTE(review): no random_state is set although subsample < 1, so results vary
# between runs, unlike the three seeded models above.
model4 = GradientBoostingRegressor(
    #learning_rate=0.2
    learning_rate=0.3830043942496965,
    n_estimators=100,
    criterion='friedman_mse',
    #subsample=0.9,
    max_depth=10,
    #max_depth=28,
    #min_samples_split=18,
    subsample=0.9115382776453557,
    #min_samples_leaf=11
)
# Tuning result from the test-based validation split. The recorded dict lists
# 'subsample' twice and its max_depth (28) differs from the max_depth=10 used above.
# params={'subsample': 240, 'max_depth': 28, 'min_samples_split': 18, 'subsample': 0.9115382776453557, 'learning_rate': 0.3830043942496965, 'min_samples_leaf': 11}

# Registry of the models to evaluate; the keys must match those of model_targets.
models = {
    "random_forest": model1,
    "lgbm": model2,
    "cat_boost": model3,
    "gbm": model4,
}

In [242]:
# Target column each model is fitted on; predictions are converted back to
# prices by the training / evaluation functions.
model_targets = {
    "random_forest": "price_per_m2",
    "lgbm": "log_price",
    "cat_boost": "log_price",
    "gbm": "log_price",
    #"xgb": "price_per_m2",
    #"dec_tree": "price_per_m2"
}

# Experiment configuration read by prepare_data() and training().
# NOTE(review): "feature_config" is stored here but not read by prepare_data()
# or training() in the cells shown; features are built earlier by
# construct_features().
# NOTE(review): "heating" is imputed as a category but is not listed in
# "one_hot_encode_features", and prepare_data() removes the raw "heating"
# column from the feature list, so heating does not reach the models unless
# another cell adds it -- confirm this is intended.
config = {
    "feature_config": feature_config,
    "models": models,
    "model_targets": model_targets,
    "drop_features": ["area_living", "area_kitchen", "distance_to_ulitsa_ostozhenka", "parking", "material", "seller", "layout", "condition"],
    "area_living": "no", # deteriorating
    "area_kitchen": "no", # deteriorating
    "bathrooms": "no", # not improving
    "constructed": "impute_bins", # impute_bins (significant improvement)
    "material": "impute_bins", # impute_bins, impute_category (significant improvement)
    "heating": "impute_category", # impute_bins, impute_category (deteriorating)
    "parking": "impute_category", # impute_bins, impute_category
    "elevator_without": "no", # deteriorating
    "remove_outliers": False,
    "remove_min_max_outliers": True,
    "one_hot_encoding": True,
    "one_hot_encode_features": ["parking", "district", "material", "seller", "condition", "layout"],
    "nan_threshold": 0, # maximum accepted share of missing values per feature
    "corr_threshold": 0, # minimum accepted |Spearman correlation| with log_price
    "training_method": "cross_validation", # based_on_test, random, cross_validation, kfold_stacking
    "predict": True,
    "retrain_on_full_dataset": True,
    "predict_method": "average", # average, weighted_average
    "split_ratio": 0.2,
}

In [243]:
# Run the full pipeline: data preparation, 5-fold cross-validation of every model
# and, since config["predict"] is set, writing submission.csv.
rsmle_scores = training(config)

Imputing missing data...
Remaining nan values in train_data for constructed: 0
Remaining nan values in test_data for constructed: 0
Remaining nan values in train_data for material: 0
Remaining nan values in test_data for material: 0
Remaining nan values in train_data for heating: 0
Remaining nan values in test_data for heating: 0
Remaining nan values in train_data for parking: 0
Remaining nan values in test_data for parking: 0
Remaining nan values in train_data for seller: 0
Remaining nan values in test_data for seller: 0
Remaining nan values in train_data for layout: 0
Remaining nan values in test_data for layout: 0
Remaining nan values in train_data for condition: 0
Remaining nan values in test_data for condition: 0
Remaining nan values in train_data for ceiling: 0
Remaining nan values in test_data for ceiling: 0
Remaining nan values in train_data for bathrooms_shared: 0
Remaining nan values in test_data for bathrooms_shared: 0
Remaining nan values in train_data for bathrooms_private

  0%|          | 0/5 [00:00<?, ?it/s]

Samples before: 23285 | Samples after: 23142


100%|██████████| 5/5 [02:33<00:00, 30.74s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

Model: random_forest | Mean train rsmle: 0.010509247016844222 | Standard deviation: 0.0005371485020753602
Model: random_forest | Mean val rsmle: 0.12636792124471027 | Standard deviation: 0.00209824005264606


 20%|██        | 1/5 [00:10<00:40, 10.04s/it]



 40%|████      | 2/5 [00:19<00:28,  9.61s/it]



 60%|██████    | 3/5 [00:29<00:19,  9.83s/it]



 80%|████████  | 4/5 [00:37<00:09,  9.16s/it]



100%|██████████| 5/5 [00:46<00:00,  9.24s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

Model: lgbm | Mean train rsmle: 0.03262165274702054 | Standard deviation: 0.00026687208275004084
Model: lgbm | Mean val rsmle: 0.12398800165205534 | Standard deviation: 0.003090660111203771


100%|██████████| 5/5 [01:20<00:00, 16.05s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

Model: cat_boost | Mean train rsmle: 0.020711619862736983 | Standard deviation: 0.0003742398346707771
Model: cat_boost | Mean val rsmle: 0.12419229394010962 | Standard deviation: 0.0038912660819006295


100%|██████████| 5/5 [02:20<00:00, 28.12s/it]


Model: gbm | Mean train rsmle: 0.016245908869742573 | Standard deviation: 0.0006489817987125246
Model: gbm | Mean val rsmle: 0.15273776298562852 | Standard deviation: 0.0036999983618519377


In [232]:
# Re-create the prepared frames and the feature list at module level so the
# hyper-parameter search cells can use them without going through training().
train_data, test_data, features = prepare_data(
    train_with_feats.copy(),
    test_with_feats.copy(),
    config,
)

Imputing missing data...
Remaining nan values in train_data for constructed: 0
Remaining nan values in test_data for constructed: 0
Remaining nan values in train_data for material: 0
Remaining nan values in test_data for material: 0
Remaining nan values in train_data for heating: 0
Remaining nan values in test_data for heating: 0
Remaining nan values in train_data for parking: 0
Remaining nan values in test_data for parking: 0
Remaining nan values in train_data for seller: 0
Remaining nan values in test_data for seller: 0
Remaining nan values in train_data for layout: 0
Remaining nan values in test_data for layout: 0
Remaining nan values in train_data for condition: 0
Remaining nan values in test_data for condition: 0
Remaining nan values in train_data for ceiling: 0
Remaining nan values in test_data for ceiling: 0
Remaining nan values in train_data for bathrooms_shared: 0
Remaining nan values in test_data for bathrooms_shared: 0
Remaining nan values in train_data for bathrooms_private

In [None]:
import optuna
# Feature columns of the prepared training frame.
X_train = train_data[features]
# NOTE(review): X_val is selected from the same frame and columns as X_train, so it is
# not a held-out set — confirm whether a separate validation split was intended.
X_val = train_data[features]

def rf_objective(trial):
    """Optuna objective: cross-validate a random forest built from trial-sampled hyperparameters.

    Reads the notebook-level ``train_data`` and ``features`` and fits on the
    ``price_per_m2`` target through ``cross_validation`` with ``k=5``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``"rf"`` entry of the dict returned by ``cross_validation``
        (presumably the mean validation RMSLE — confirm against that helper).
    """
    rf_max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
    rf_min_samples_split = trial.suggest_int("min_samples_split", 2, 100)
    rf_max_features = trial.suggest_int("max_features", 2, len(features))
    rf_min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 100)
    rf_min_weight_fraction_leaf = trial.suggest_float("min_weight_fraction_leaf", 1e-8, 0.5, log=True)
    reg = RandomForestRegressor(
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        max_features=rf_max_features,
        min_samples_leaf=rf_min_samples_leaf,
        min_weight_fraction_leaf=rf_min_weight_fraction_leaf,
        n_estimators=10,
        # Seeded like the LightGBM/CatBoost objectives so that two trials with the same
        # hyperparameters build the same forest; otherwise bootstrap noise blurs the search.
        random_state=42,
    )
    scores = cross_validation({"rf": reg}, train_data, features, {"rf": "price_per_m2"}, k=5)
    return scores["rf"]

# Run 100 Optuna trials that minimise the value returned by rf_objective,
# then print the best trial found.
study = optuna.create_study(direction="minimize")
study.optimize(rf_objective, n_trials=100)
print(study.best_trial)

In [None]:
def lgbm_objective(trial):
    """Optuna objective: cross-validate a LightGBM regressor built from trial-sampled hyperparameters.

    Reads the notebook-level ``train_data`` and ``features`` and fits on the
    ``log_price`` target through ``cross_validation`` with ``k=5``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``"lgbm"`` entry of the dict returned by ``cross_validation``
        (presumably the mean validation RMSLE — confirm against that helper).
    """
    # No trailing commas on these assignments: `x = trial.suggest_int(...),` binds a
    # 1-tuple, which would then be handed to LightGBM instead of a number.
    lgbm_num_leaves = trial.suggest_int("num_leaves", 2, 128)
    lgbm_min_data_in_leaf = trial.suggest_int("min_data_in_leaf", 5, 100)
    lgbm_max_depth = trial.suggest_int("max_depth", 2, 30)
    lgbm_max_bin = trial.suggest_int("max_bin", 100, 300)
    lgbm_reg_alpha = trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True)
    lgbm_reg_lambda = trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True)
    # `min_child_samples` is LightGBM's alias for `min_data_in_leaf`. Sampling and passing
    # both (as before) leaves one search dimension without effect, so only
    # `min_data_in_leaf` is tuned here.
    gbm = lgbm.LGBMRegressor(
        random_state=42,
        learning_rate=0.1,
        n_estimators=100,
        boosting_type='gbdt',
        n_jobs=2,
        num_leaves=lgbm_num_leaves,
        min_data_in_leaf=lgbm_min_data_in_leaf,
        max_depth=lgbm_max_depth,
        max_bin=lgbm_max_bin,
        reg_alpha=lgbm_reg_alpha,
        reg_lambda=lgbm_reg_lambda,
    )
    scores = cross_validation({"lgbm": gbm}, train_data, features, {"lgbm": "log_price"}, k=5)
    return scores["lgbm"]

# Run 100 Optuna trials that minimise the value returned by lgbm_objective,
# then print the best trial found.
study = optuna.create_study(direction="minimize")
study.optimize(lgbm_objective, n_trials=100)
print(study.best_trial)

In [None]:
def cb_objective(trial):
    """Optuna objective: cross-validate a CatBoost regressor with trial-sampled hyperparameters.

    Uses the notebook-level ``train_data`` and ``features`` and the ``log_price``
    target, evaluated by ``cross_validation`` with ``k=5``.

    Args:
        trial: Optuna trial that supplies the sampled hyperparameters.

    Returns:
        The ``"cb"`` entry of the dict returned by ``cross_validation``.
    """
    # Sampled in the same order as before: depth, l2_leaf_reg, bagging_temperature,
    # min_data_in_leaf (dict literals evaluate their values top to bottom).
    sampled = {
        "depth": trial.suggest_int("depth", 2, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "bagging_temperature": trial.suggest_int("bagging_temperature", 0, 100),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 50),
    }
    # Settings held constant across trials.
    fixed = {
        "n_estimators": 100,
        "learning_rate": 0.2,
        "thread_count": 2,
        "silent": True,
        "random_seed": 42,
    }
    model = CatBoostRegressor(**fixed, **sampled)
    cv_result = cross_validation({"cb": model}, train_data, features, {"cb": "log_price"}, k=5)
    return cv_result["cb"]

# Run 100 Optuna trials that minimise the value returned by cb_objective,
# then print the best trial found.
study = optuna.create_study(direction="minimize")
study.optimize(cb_objective, n_trials=100)
print(study.best_trial)

In [99]:
import xgboost as xgb

def xgb_objective(trial):
    """Optuna objective: train an XGBoost regressor and return its validation RMSE.

    Reads the notebook-level ``train_x``/``train_y`` and ``valid_x``/``valid_y``.
    The previous version used the ``binary:logistic`` objective and scored with
    classification accuracy, while the other objectives in this notebook regress
    on ``log_price`` / ``price_per_m2``. This version trains with squared error
    and reports an error, so it fits a ``direction="minimize"`` study like the others.

    Args:
        trial: Optuna trial used to sample the booster and its hyperparameters.

    Returns:
        float: root mean squared error of the predictions on the validation set
        (lower is better). If the target is ``log1p(price)`` this equals RMSLE on
        the price scale — presumably what the notebook reports as "rsmle"; confirm.
    """
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)
    param = {
        "verbosity": 0,
        # Squared-error regression for a continuous target.
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        # NOTE(review): grow_policy is documented for the hist/approx tree methods;
        # confirm it has any effect together with tree_method="exact".
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    # RMSE via numpy: no dependency on a bare `sklearn` name, which this cell never imports.
    # Both sides are flattened so a column-shaped target cannot broadcast into an (n, n) matrix.
    residuals = np.asarray(preds, dtype=float).ravel() - np.asarray(valid_y, dtype=float).ravel()
    return float(np.sqrt(np.mean(np.square(residuals))))

[32m[I 2021-11-08 10:32:39,324][0m A new study created in memory with name: no-name-e4882bb9-8443-4862-aadd-440d1f1fccff[0m
[32m[I 2021-11-08 10:32:40,299][0m Trial 0 finished with value: 0.33573024618507613 and parameters: {'num_leaves': 254, 'max_depth': 18, 'min_samples_split': 28, 'subsample': 0.48453559760543863, 'learning_rate': 0.10511868163061969, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.33573024618507613.[0m
[32m[I 2021-11-08 10:32:41,239][0m Trial 1 finished with value: 0.24940335098802194 and parameters: {'num_leaves': 41, 'max_depth': 16, 'min_samples_split': 2, 'subsample': 0.531409018570974, 'learning_rate': 0.15674110776823233, 'min_samples_leaf': 26}. Best is trial 1 with value: 0.24940335098802194.[0m
[32m[I 2021-11-08 10:32:42,964][0m Trial 2 finished with value: 0.4101943959185244 and parameters: {'num_leaves': 169, 'max_depth': 33, 'min_samples_split': 25, 'subsample': 0.9859915442683048, 'learning_rate': 0.08081028899572111, 'min_samples_lea

FrozenTrial(number=87, values=[0.16369759836673356], datetime_start=datetime.datetime(2021, 11, 8, 10, 35, 12, 942859), datetime_complete=datetime.datetime(2021, 11, 8, 10, 35, 15, 850800), params={'num_leaves': 240, 'max_depth': 28, 'min_samples_split': 18, 'subsample': 0.9115382776453557, 'learning_rate': 0.3830043942496965, 'min_samples_leaf': 11}, distributions={'num_leaves': IntUniformDistribution(high=256, low=2, step=1), 'max_depth': IntUniformDistribution(high=35, low=2, step=1), 'min_samples_split': IntUniformDistribution(high=50, low=2, step=1), 'subsample': UniformDistribution(high=1.0, low=0.4), 'learning_rate': UniformDistribution(high=0.5, low=0.05), 'min_samples_leaf': IntUniformDistribution(high=30, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=87, state=TrialState.COMPLETE, value=None)


# Log settings

In [64]:
import os.path
# Run configuration passed to log_performance below.
settings = {
    # Boolean flags; several match the feature_config keys read by construct_features.
    "distance_to_center": True,
    "distance_to_metro": True,
    "distance_to_hospital": True,
    "elevator": True,
    "has_seller": True,
    "bathrooms": True,
    "bearing": True,
    "top_floor": True,
    # Per-model on/off flags (presumably which models were trained — confirm).
    "random_forest": True,
    "lgbm": True,
    "cat_boost": True,
    "gbm": False,
    "xgb": False,
    "drop_features": ["area_living", "area_kitchen"],
    # Target name per model (presumably the column each model is fitted on — confirm).
    "random_forest_target": "price_per_m2",
    "lgbm_target": "log_price",
    "cat_boost_target": "log_price",
    "gbm_target": "log_price",
    "xgb_target": "price_per_m2",
    # Per-column strategy names (presumably the imputation method used — confirm).
    "area_living": "impute_bins",
    "area_kitchen": "impute_bins",
    # NOTE(review): "bathrooms" already appears above with the value True. In a dict
    # literal the later entry wins, so the stored value is "impute_bins" and the boolean
    # flag is lost. Confirm which meaning is intended and rename one of the keys.
    "bathrooms": "impute_bins",
    "constructed": "impute_bins",
    "material": "impute_bins",
    "heating": "impute_bins",
    "parking": "impute_category",
    "elevator_without": "impute_bins",
    "remove_outliers": False,
    "remove_min_max_outliers": False,
    "one_hot_encoding": True,
    "nan_threshold": 0,
    "corr_threshold": 0,
    "split_method": "random",
    "split_ratio": 0.2,
    "retrain_with_full_data": True,
    "prediction_file": "submissions_02.csv"
}
def log_performance(settings, rsmle_train, rsmle_val, model_scores=None, log_path="log.csv"):
    """Append one run (settings plus per-model scores) as a new row of the CSV run log.

    The row is a copy of ``settings`` with one extra entry per item of
    ``model_scores``. A score whose key equals an existing settings key replaces
    that entry in the logged row, as before. If ``log_path`` exists the row is
    appended to its contents, otherwise a new log is started. The file is written
    with the DataFrame index as its first column, which is skipped on the next read.

    Args:
        settings: dict describing the run. It is no longer modified in place.
        rsmle_train: accepted for call compatibility; not written to the log.
            NOTE(review): the original body never used it either — decide whether
            it should become a logged column.
        rsmle_val: accepted for call compatibility; not written to the log (see above).
        model_scores: optional dict of score name -> value to add to the row.
            Defaults to the notebook-level ``scores`` dict, which is what the
            original body always read.
        log_path: path of the CSV log file. Defaults to ``"log.csv"``.

    Returns:
        None. The updated log is written to ``log_path``.
    """
    if model_scores is None:
        # Backward compatible with the original body, which read the global `scores`.
        model_scores = scores

    # Work on a copy so that re-running the logging cell does not keep adding score
    # entries to the caller's settings dict.
    record = dict(settings)
    for key, value in model_scores.items():
        record["{}".format(key)] = value
    new_row = pd.DataFrame.from_records([record])

    if os.path.isfile(log_path):
        # The first column of an existing log is the index written by to_csv below.
        history = pd.read_csv(log_path, index_col=0)
        # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
        log = pd.concat([history, new_row], ignore_index=True)
    else:
        log = new_row
    log.to_csv(log_path)

# Record this run in the log; note that `rsmle_test` is passed in the `rsmle_val` position.
log_performance(settings, rsmle_train, rsmle_test)