# Moscow Housing, Group 86
Henrik Ytrehus Ågotnes, studentnr: 491752

Alexander Michael Gerlach, studentnr: 557833

Fredrik Veidahl Aagaard, studentnr: 759482

**NOTE:** We manually selected two scores on kaggle. We attached the files to our submission marked as `submission_model1.csv` with a (public) score of 0.15885 corresponding to model 1 and `submission_model2.csv` with a (public) score of 0.15959 corresponding to model2. Both models are in this notebook. 

# Data Preparation

In [1]:
import pandas as pd
import json
import numpy as np
import geopy.distance
import geojson
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from sklearn.neighbors import NearestNeighbors
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm
from catboost import CatBoostRegressor
from sklearn.ensemble import BaggingRegressor
from xgboost import XGBRegressor

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

## Load data

In [2]:
apartments_train = pd.read_csv('data/apartments_train.csv')
buildings_train = pd.read_csv('data/buildings_train.csv')
apartments_test = pd.read_csv('data/apartments_test.csv')
buildings_test = pd.read_csv('data/buildings_test.csv')

train = pd.merge(
    apartments_train, 
    buildings_train.set_index('id'), 
    how='left', 
    left_on='building_id', 
    right_index=True
)
test = pd.merge(
    apartments_test, 
    buildings_test.set_index('id'), 
    how='left', 
    left_on='building_id', 
    right_index=True
)

## Replace missing coordinates and districts

In [3]:
ind = test[test["longitude"].isna()].index
test.at[ind, "latitude"] = 55.56776345324702
test.at[ind, "longitude"] = 37.48171529826662

In [4]:
ind = train[train.district.isna()].index
train.at[ind, "district"] = 12
ind = test[test.district.isna()].index
test.at[ind, "district"] = 12

## Fixing incorrect coordinates in testset

In [5]:
apartment_coordinates = {
    2511: [55.5438629741104, 37.48233890768276],
    2529: [55.66866253043727, 37.217152651528785],
    4719: [55.63132797843279, 37.426744466015705],
    5090: [55.54390546580924, 37.48229599441532],
    6959: [55.54390546580924, 37.48229599441532],
    8596: [55.54390546580924, 37.48229599441532],
    9547: [55.63399775464791, 37.41982879208156]
}
for k, v in apartment_coordinates.items():
    test.at[k, "latitude"] = v[0]
    test.at[k, "longitude"] = v[1]

# Feature Engineering

## Target features in training set

In [6]:
train["log_price"] = np.log1p(train.price)
train["price_per_m2"] = train.price / train.area_total

## Feature functions

In [7]:
def distance_to_center_feature(data):
    moscow_center = [55.751244, 37.618423]
    coordinates = data[['latitude', 'longitude']].to_numpy()
    dist = [geopy.distance.distance(moscow_center, coordinate).km for coordinate in coordinates]
    data['distance_to_center'] = dist

In [8]:
def distance_to_metro_feature(data, add_metro_lines=True):
    d = data.copy()

    metro_data = pd.read_csv("data/moscow_metro_data.csv", delimiter=";")

    # drop duplicates
    ind = metro_data[metro_data["English transcription"].duplicated()].index
    metro_data = metro_data.drop(ind)
    metro_data = metro_data.reset_index()
    metro_lines = ["metro_line_{}".format(i) for i in range(1, 16)]

    nbrs = NearestNeighbors(n_neighbors=1, algorithm="ball_tree").fit(metro_data[["latitude", "longitude"]])
    distances, indices = nbrs.kneighbors(d[["latitude", "longitude"]])
    d["metro_index"] = indices

    for ind, row in d.iterrows():
        metro_coordinate = metro_data.loc[[row.metro_index]][["latitude", "longitude"]].to_numpy()
        distance = geopy.distance.distance([[row["latitude"], row["longitude"]]], metro_coordinate).km
        data.at[ind, "distance_to_metro"] = distance

        if add_metro_lines:
            number_of_metro_lines = 0
            metro_lines_data = metro_data.loc[[row.metro_index]][metro_lines]
            for metro_line in metro_lines:
                access_to_line = metro_lines_data[metro_line].values[0]
                data.at[ind, metro_line] = access_to_line
                if access_to_line == 1:
                    number_of_metro_lines += 1
            data.at[ind, "number_of_metro_lines"] = number_of_metro_lines

In [9]:
def bearing_feature(data):
    for ind, row in data.iterrows():
        moscow_center = [55.751244, 37.618423]
        c1 = moscow_center
        c2 = [row["latitude"], row["longitude"]]
        y = np.sin(c2[1] - c1[1]) * np.cos(c2[0])
        x = np.cos(c1[0]) * np.sin(c2[0]) - np.sin(c1[0]) * np.cos(c2[0]) * np.cos(c2[1] - c1[1])
        rad = np.arctan2(y, x)
        bearing = ((rad * 180 / np.pi) + 360) % 360
        data.at[ind, "bearing"] = bearing

In [10]:
def distance_to_universities_feature(data):
    state_uni_coordinates = [55.70444300116007, 37.528611852796914]
    tech_uni_coordinates = [55.76666597872545, 37.68511242319504]
    distances_to_state_uni = []
    distances_to_tech_uni = []
    for ind, row in data.iterrows():
        distances_to_state_uni.append(geopy.distance.distance([row["latitude"], row["longitude"]], state_uni_coordinates).km)
        distances_to_tech_uni.append(geopy.distance.distance([row["latitude"], row["longitude"]], tech_uni_coordinates).km)
    data["distance_to_state_uni"] = distances_to_state_uni
    data["distance_to_tech_uni"] = distances_to_tech_uni

In [11]:
def wealthy_districts_features(data):
    with open("data/geojson/khamovniki_polygon_data.geojson") as f:
        gj_khamovniki = geojson.load(f)
    with open("data/geojson/yakimanka_polygon_data.geojson") as f:
        gj_yakimanka = geojson.load(f)
    with open("data/geojson/arbat_polygon_data.geojson") as f:
        gj_arbat = geojson.load(f)
    with open("data/geojson/presnensky_polygon_data.geojson") as f:
        gj_presnensky = geojson.load(f)
    with open("data/geojson/Tverskoy_polygon_data.geojson") as f:
        gj_tverskoy = geojson.load(f)
    khamovniki_polygon = Polygon(gj_khamovniki["geometries"][0]["coordinates"][0][0])
    yakimanka_polygon = Polygon(gj_yakimanka["geometries"][0]["coordinates"][0][0])
    arbat_polygon = Polygon(gj_arbat["geometries"][0]["coordinates"][0][0])
    presnensky_polygon = Polygon(gj_presnensky["geometries"][0]["coordinates"][0][0])
    tverskoy_polygon = Polygon(gj_tverskoy["geometries"][0]["coordinates"][0][0])
    for ind, row in data.iterrows():
        point = Point([row["longitude"], row["latitude"]])
        data.at[ind, "is_in_khamovniki"] = 1 if khamovniki_polygon.contains(point) else 0
        data.at[ind, "is_in_yakimanka"] = 1 if yakimanka_polygon.contains(point) else 0
        data.at[ind, "is_in_arbat"] = 1 if arbat_polygon.contains(point) else 0
        data.at[ind, "is_in_presnensky"] = 1 if presnensky_polygon.contains(point) else 0
        data.at[ind, "is_in_tverskoy"] = 1 if tverskoy_polygon.contains(point) else 0

In [12]:
def add_box_areas_more_boxes(data):
    """only add values in list to prevent unnecessary slowdown"""
    locs_with_high_corr = [
            'loc_37.575_55.75', 'loc_37.6_55.725', 'loc_37.525_55.75',
           'loc_37.55_55.775', 'loc_37.525_55.725', 'loc_37.575_55.725',
           'loc_37.525_55.7', 'loc_37.5_55.7', 'loc_37.55_55.725',
           'loc_37.475_55.7', 'loc_37.625_55.725', 'loc_37.5_55.75',
           'loc_37.525_55.775', 'loc_37.525_55.675', 'loc_37.55_55.675',
           'loc_37.55_55.75', 'loc_37.4_55.6', 'loc_37.725_55.575',
           'loc_37.925_55.675', 'loc_37.475_55.55', 'loc_37.65_55.575',
           'loc_37.525_55.875', 'loc_37.5_55.55', 'loc_37.475_55.525',
           'loc_37.925_55.7'
    ]
    for lon in np.arange(37.4, 37.95, 0.025):
        for lat in np.arange(55.525, 55.9, 0.025):
            column_name = "loc_" + str(round(lon, 3)) + "_" + str(round(lat, 3))
            if column_name in locs_with_high_corr: #column_name in locs_with_high_corr:
                data[column_name] = 0
                indexes = data[
                    (lon < data['longitude']) & (data['longitude'] < lon + 0.025) & (lat < data['latitude']) & (
                                data['latitude'] < lat + 0.025)
                    ].index

                data.loc[indexes, column_name] = 1

In [13]:
from sklearn.cluster import KMeans

def add_clusters(train_data, test_data):
    kmeans = KMeans(n_clusters=100, random_state=0)
    s1 = train_data.longitude
    s2 = test_data.longitude
    f1 = train_data.latitude
    f2 = test_data.latitude
    kmeans.fit(pd.concat([pd.concat([s1, s2]), pd.concat([f1, f2]) ], axis=1))
    train_data['clusters'] = kmeans.predict(train_data[['longitude', 'latitude']])
    pd.get_dummies(train_data, columns=['clusters'])
    test_data['clusters'] = kmeans.predict(test_data[['longitude', 'latitude']])
    test_data = pd.get_dummies(test_data,columns=['clusters'])

## Feature construction - (NOTE: TAKES SOME TIME)

In [14]:
def construct_features(data):
    distance_to_center_feature(data)
    distance_to_metro_feature(data, False)
    bearing_feature(data)
    distance_to_universities_feature(data)
    #wealthy_districts_features(data)
    add_box_areas_more_boxes(data)

train_with_feats = train.copy()
test_with_feats = test.copy()
construct_features(train_with_feats)
construct_features(test_with_feats)
add_clusters(train_with_feats, test_with_feats)

# Data Cleaning

## Imputing missing data

In [15]:
def impute_with_bin_mean(data, train_data, test_data, feature, bin_feature, bins=40, decimals=0, verbose=False):
    tr = train_data.copy()
    te = test_data.copy()
    database = pd.concat([tr, te])
    database["bin"], bins = pd.cut(database[bin_feature], bins=40, retbins=True)
    pb = 0
    for b in bins:
        bin_data = database[(database[bin_feature] >= pb) & (database[bin_feature] <= b)]
        bin_mean = bin_data[feature].mean()
        percentage = (bin_mean / bin_data[bin_feature]).mean()
        d = data[(data[bin_feature] >= pb) & (data[bin_feature] <= b)]
        ind = d[d[feature].isna()].index
        if pd.notna(bin_mean) and len(ind) > 0:
            bin_mean = round(bin_mean, decimals)
            data.at[ind, feature] = bin_mean
            if verbose:
                print(
                    "Set {} for {} rows with {} <= {} <= {} to {}".format(feature, len(ind), pb, bin_feature, b, bin_mean))
        pb = b

In [16]:
def impute_categorical_features(data):
    data.loc[:, "seller"] = data.loc[:, "seller"].fillna(4)
    data.loc[:, "layout"] = data.loc[:, "layout"].fillna(3)
    data.loc[:, "condition"] = data.loc[:, "condition"].fillna(4)
    data.loc[:, "material"] = data.loc[:, "material"].fillna(7)
    data.loc[:, "parking"] = data.loc[:, "parking"].fillna(3)
    data.loc[:, "heating"] = data.loc[:, "heating"].fillna(4)

In [17]:
def impute_boolean_features(data):
    data.loc[:, "elevator_without"] = data.loc[:, "elevator_without"].fillna(0)
    data.loc[:, "elevator_service"] = data.loc[:, "elevator_service"].fillna(0)
    data.loc[:, "elevator_passenger"] = data.loc[:, "elevator_passenger"].fillna(0)
    data.loc[:, "garbage_chute"] = data.loc[:, "garbage_chute"].fillna(0)
    data.loc[:, "balconies"] = data.loc[:, "balconies"].fillna(0)
    data.loc[:, "loggias"] = data.loc[:, "loggias"].fillna(0)

In [18]:
def impute_numerical_features(data):
    # constructed
    data.loc[data.new == 1, "constructed"] = data.loc[data.new == 1, "constructed"].fillna(2021)
    impute_with_bin_mean(data=data, train_data=train, test_data=test, feature="constructed", bins=12, bin_feature="district")
    data.loc[data.constructed >= 2021, "new"] = data.loc[data.constructed >= 2021, "new"].fillna(1)
    data.loc[data.constructed < 2021, "new"] = data.loc[data.constructed < 2021, "new"].fillna(0)
    
    numerical_features = ["phones", "ceiling", "bathrooms_shared", "bathrooms_private", "windows_street", "windows_court", "area_kitchen", "area_living"]
    for num_feat in numerical_features:
        impute_with_bin_mean(data=data, train_data=train, test_data=test, feature=num_feat, bin_feature="area_total", bins=20)
        fill_value = data[data[num_feat].notna()][num_feat].mean(axis=0).round()
        data.loc[:, num_feat] = data.loc[:, num_feat].fillna(fill_value)
    

In [19]:
def impute_nans(data):
    impute_categorical_features(data)
    impute_boolean_features(data)
    impute_numerical_features(data)

train_data = train_with_feats.copy()
test_data = test_with_feats.copy()
impute_nans(train_data)
impute_nans(test_data)

In [20]:
train_data.loc[train_data.area_living==0, 'area_living'] = train_data[train_data.area_living==0].area_total * 0.56

## One-hot encoding categorical features

In [21]:
district_categories = {
    3: "East",
    6: "South-West",
    5: "South",
    4: "South-East",
    0: "Central",
    2: "North-East",
    1: "North",
    8: "North-West",
    7: "West",
    11: "Novomoskovsk",
    10: "Troitsk",
    9: "Zelenograd",
    12: "Other_district"
}
material_categories = {
    0: "Bricks",
    1: "Wood",
    2: "Monolith",
    3: "Panel",
    4: "Block",
    5: "Monolithic_brick",
    6: "Stalin_project",
    7: "Other_material"
}
heating_categories = {
    0: "Central",
    1: "Individual",
    2: "Boiler",
    3: "Autonomous_boiler",
    4: "Other_heating"
}
parking_categories = {
    0: "Ground",
    1: "Underground",
    2: "Multilevel",
    3: "No_parking"
}
seller_categories = {
    0: "Owner",
    1: "Company",
    2: "Agents",
    3: "Developer",
    4: "Other_seller"
}
layout_categoires = {
    0: "Adjacent",
    1: "Isolated",
    2: "Adjacent_isolated",
    3: "Other_layout"
}
condition_categories = {
    0: "Undecorated",
    1: "Decorated",
    2: "Euro_repair",
    3: "Special_design",
    4: "Other_condition"
}
categorical_features = {
    "parking": parking_categories,
    "heating": heating_categories,
    "material": material_categories,
    "district": district_categories,
    "seller": seller_categories,
    "layout": layout_categoires,
    "condition": condition_categories
}

In [22]:
def one_hot_encode_categorical_features(data, remove_org=True):
    for orig_feat, categories in categorical_features.items():
        for k, category in categories.items():
            data[category] = (data[orig_feat] == k).astype(int)
        if remove_org:
            data.drop(orig_feat, axis=1, inplace=True)
one_hot_encode_categorical_features(train_data)
one_hot_encode_categorical_features(test_data)

train_data = pd.get_dummies(train_data, columns=['clusters'])
test_data = pd.get_dummies(test_data, columns=['clusters'])

## Feature selection

In [23]:
# features for model 1
features = train_data.columns
features = features.drop(["id", "building_id", "street", "address", "price", "log_price", "price_per_m2", "street", "address"], errors="ignore")
features = features.drop([col for col in train_data.columns if col[0:2] == 'cl']) 

# features for model 2
features_2 = train_data.columns
features_2 = features.drop(["id", "building_id", "street", "address", "price", "log_price", "price_per_m2", "street", "address"], errors="ignore")
features_2 = features.drop([col for col in train_data.columns if col[0:3] == 'loc']) 

## Removing outliers

In [24]:
def remove_min_max_outliers(train_data, test_data, features):
    t = train_data.copy()
    test_feat_min_max = pd.DataFrame({"max": test_data.describe().loc["max"], "min": test_data.describe().loc["min"]}).T
    for feature in features:
        max_val = test_feat_min_max[feature]["max"]
        min_val = test_feat_min_max[feature]["min"]
        max_ind = t[t[feature] > max_val].index
        min_ind = t[t[feature] < min_val].index
        t.drop(max_ind, inplace=True)
        t.drop(min_ind, inplace=True)

    print("Samples before: {} | Samples after: {}".format(len(train_data), len(t)))
    return t

train_data = remove_min_max_outliers(train_data, test_data, features)

Samples before: 23285 | Samples after: 23062


In [25]:
train_data = train_data.reset_index()

# Model definition, training, and prediction

In [26]:
def root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

## Model 1 - using stacking with averaging and box_areas

In [27]:
SEED = 42
model1 = lgbm.LGBMRegressor(
    random_state=SEED,
    learning_rate=0.1,
    n_estimators=2000,
    boosting_type='gbdt',
    n_jobs=2,
    num_leaves=20,
    min_data_in_leaf=12,
    max_depth=20,
    max_bin=160,
)

model2 = CatBoostRegressor(
    n_estimators=1000,
    learning_rate=0.2,
    thread_count=2,
    depth=7,    
    min_data_in_leaf=10,
    silent=True,
    random_seed=SEED,
)

model3 = BaggingRegressor(
    n_estimators=200,
    n_jobs=2,
    random_state=SEED,
    max_features=0.7,
    max_samples=0.8,
)

In [28]:
def cross_validation(model, k, data, features, target_feature):
    kf = KFold(
        n_splits=k,
        shuffle=True,
        random_state=SEED
    )
    train_scores = []
    val_scores = []
    
    for i, (train_index, test_index) in tqdm(enumerate(kf.split(buildings_train)), total=k):
        building_val_ids = buildings_train.id.values[test_index]
        train = data[~data.building_id.isin(building_val_ids)]
        val = data[data.building_id.isin(building_val_ids)]
        X_train = train[features].values
        X_val = val[features].values
        y_train = train[[target_feature]].values.ravel()
        y_val = val[[target_feature]].values.ravel()
        y_train_true = train[["price"]].values.ravel()
        y_val_true = val[["price"]].values.ravel()

        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)
        
        if target_feature == "log_price":
            train_pred = (np.e ** train_pred) - 1
            val_pred = (np.e ** val_pred) - 1
        elif target_feature == "price_per_m2":
            train_pred = train_pred * train.area_total
            val_pred = val_pred * val.area_total
        else:
            raise Exception("Unknown target feature")
            
        train_rsmle = root_mean_squared_log_error(y_true=y_train_true, y_pred=train_pred)
        val_rsmle = root_mean_squared_log_error(y_true=y_val_true, y_pred=val_pred)
        train_scores.append(train_rsmle)
        val_scores.append(val_rsmle)
    train_scores = np.array(train_scores)
    val_scores = np.array(val_scores)
    print("Train rsmle: {} | Std: {}".format(train_scores.mean(), train_scores.std()))
    print("Val rsmle: {} | Std: {}".format(val_scores.mean(), val_scores.std()))
    return val_scores.mean()

In [29]:
model1_scores = cross_validation(model1, 5, train_data, features, "log_price")
model2_scores = cross_validation(model2, 5, train_data, features, "log_price")
model3_scores = cross_validation(model3, 5, train_data, features, "price_per_m2")
scores = {
    "lgbm": model1_scores,
    "cb": model2_scores,
    "bagging": model3_scores,
}

  0%|                                                                                                                             | 0/5 [00:00<?, ?it/s]



 20%|███████████████████████▍                                                                                             | 1/5 [00:05<00:22,  5.71s/it]



 40%|██████████████████████████████████████████████▊                                                                      | 2/5 [00:11<00:16,  5.50s/it]



 60%|██████████████████████████████████████████████████████████████████████▏                                              | 3/5 [00:16<00:10,  5.37s/it]



 80%|█████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4/5 [00:21<00:05,  5.16s/it]



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:29<00:00,  5.96s/it]


Train rsmle: 0.04201944015599936 | Std: 0.0007927884079163581
Val rsmle: 0.20152252789785347 | Std: 0.024818056998283763


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:34<00:00,  6.92s/it]


Train rsmle: 0.05191085408711893 | Std: 0.0007495411745402716
Val rsmle: 0.20930695670556912 | Std: 0.033408242387527536


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:07<00:00, 25.54s/it]

Train rsmle: 0.0627379174016011 | Std: 0.0011539479574389216
Val rsmle: 0.2093521017162976 | Std: 0.03191689266337697





In [30]:
model_scores = pd.DataFrame(scores, index=[0])
model_scores = model_scores.T
model_scores.columns = ["rsmle"]
model_scores

Unnamed: 0,rsmle
lgbm,0.201523
cb,0.209307
bagging,0.209352


## Prediction using model 1

In [31]:
model1.fit(train_data[features], train_data["log_price"])
model2.fit(train_data[features], train_data["log_price"])
model3.fit(train_data[features], train_data["price_per_m2"])

model1_pred = model1.predict(test_data[features])
model2_pred = model2.predict(test_data[features])
model3_pred = model3.predict(test_data[features])

model1_pred = (np.e ** model1_pred) - 1
model2_pred = (np.e ** model2_pred) - 1
model3_pred *= test_data.area_total

predictions = [model1_pred, model2_pred, model3_pred]
pred = np.average(predictions, 
    weights = 1 / model_scores['rsmle'] ** 4,
    axis=0
)
submission = pd.DataFrame()
submission['id'] = test_data.id
submission['price_prediction'] = pred
submission.to_csv('submission_model1.csv', index=False)



## Model 2 - stacking with averaging and clusters instead of box_areas

In [32]:
model1_scores = cross_validation(model1, 5, train_data, features_2, "log_price")
model2_scores = cross_validation(model2, 5, train_data, features_2, "log_price")
model3_scores = cross_validation(model3, 5, train_data, features_2, "price_per_m2")
scores = {
    "lgbm": model1_scores,
    "cb": model2_scores,
    "bagging": model3_scores,
}

  0%|                                                                                                                             | 0/5 [00:00<?, ?it/s]



 20%|███████████████████████▍                                                                                             | 1/5 [00:05<00:20,  5.08s/it]



 40%|██████████████████████████████████████████████▊                                                                      | 2/5 [00:10<00:15,  5.24s/it]



 60%|██████████████████████████████████████████████████████████████████████▏                                              | 3/5 [00:15<00:09,  4.96s/it]



 80%|█████████████████████████████████████████████████████████████████████████████████████████████▌                       | 4/5 [00:19<00:04,  4.56s/it]



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:22<00:00,  4.59s/it]


Train rsmle: 0.04154846719540952 | Std: 0.0006239318570326423
Val rsmle: 0.19928084042973432 | Std: 0.020495137796523448


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:36<00:00,  7.34s/it]


Train rsmle: 0.051509534830374834 | Std: 0.0007875993107409987
Val rsmle: 0.20463721754870146 | Std: 0.02078200122532646


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:01<00:00, 24.29s/it]

Train rsmle: 0.06279062923730641 | Std: 0.0011365991375978698
Val rsmle: 0.2092565787340182 | Std: 0.032681001329564544





In [34]:
model_scores = pd.DataFrame(scores, index=[0])
model_scores = model_scores.T
model_scores.columns = ["rsmle"]
model_scores

Unnamed: 0,rsmle
lgbm,0.199281
cb,0.204637
bagging,0.209257


## Prediction using model 2

In [35]:
model1.fit(train_data[features_2], train_data["log_price"])
model2.fit(train_data[features_2], train_data["log_price"])
model3.fit(train_data[features_2], train_data["price_per_m2"])

model1_pred = model1.predict(test_data[features_2])
model2_pred = model2.predict(test_data[features_2])
model3_pred = model3.predict(test_data[features_2])

model1_pred = (np.e ** model1_pred) - 1
model2_pred = (np.e ** model2_pred) - 1
model3_pred *= test_data.area_total

predictions = [model1_pred, model2_pred, model3_pred]
pred = np.average(predictions, 
    weights = 1 / model_scores['rsmle'] ** 4,
    axis=0
)
submission = pd.DataFrame()
submission['id'] = test_data.id
submission['price_prediction'] = pred
submission.to_csv('submission_model2.csv', index=False)

