In [16]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.metrics.pairwise import nan_euclidean_distances

In [17]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])

In [18]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].map(
    {'#!': np.NaN,
     '45,713': 45713,
     '7,311': 7311,
     '3,013': 3013,
     '16,765': 16765},
    na_action='ignore'
)

macro_df['modern_education_share'] = macro_df['modern_education_share'].map(
    {'90,92': 90.92,
     '93,08': 93.08,
     '95,4918': 95.4918},
    na_action='ignore'
)

macro_df['old_education_build_share'] = macro_df['old_education_build_share'].map(
    {'23,14': 23.14,
     '25,47': 25.47,
     '8,2517': 8.2517},
    na_action='ignore'
)

In [19]:
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])

train_macro_df = train_df.merge(macro_df, on='timestamp', how='left')
test_macro_df = test_df.merge(macro_df, on='timestamp', how='left' )

In [20]:
# imputer = make_pipeline(
#     StandardScaler(),
#     KNNImputer(n_neighbors=10),
#     StandardScaler()
# )
# 
# macro_df[:] = imputer.fit_transform(macro_df)

n_neighbors

In [21]:
def encode(df, fillna=True):
    # Timestamp encoding
    df['timestamp_year'] = df['timestamp'].dt.year
    df['timestamp_month'] = df['timestamp'].dt.month
    df['timestamp_day'] = df['timestamp'].dt.day
    df.drop(labels='timestamp', axis=1, inplace=True)

    # Categorical columns encoding
    df['product_type'] = df['product_type'].map({'Investment': 0, 'OwnerOccupier':1})

    eco_map = {'no data': np.NaN, # 0?
               'poor': 1,
               'satisfactory': 2,
               'good': 3,
               'excellent': 4,}
    df['ecology'] = df['ecology'].map(eco_map)

    one_hot = pd.get_dummies(df['sub_area'])
    df.drop('sub_area', axis=1, inplace=True)
    df = df.join(one_hot)

    cat_columns = df.select_dtypes(include='object').columns
    df[cat_columns] = df[cat_columns].applymap(lambda x: 0 if x=='no' else 1)
    
    # Filling NaNs
    # if fillna:
    #     df[:] = SimpleImputer(strategy='mean').fit_transform(df)

    return df

In [22]:
train_macro_df = encode(train_macro_df, fillna=False)
train_macro_df.pop('Poselenie Klenovskoe')

test_macro_df = encode(test_macro_df, fillna=False)

target = train_macro_df.pop('price_doc')

whole dataset without price_doc or train with price_doc

In [23]:
# df_train_test = pd.concat([df_full, test_df])

# df_scaled = df_train_test.copy()
# scaler = StandardScaler()
# df_scaled[:] = scaler.fit_transform(df_scaled)

In [24]:
# distances_1_5000 = nan_euclidean_distances(X=df_full_scaled.loc[:5000], Y=df_full_scaled)
# np.save('data/distances_1', distances_1_5000)

# distances_1_5000.shape

In [25]:
# imputer = make_pipeline(
#     StandardScaler(),
#     KNNImputer(),
#     StandardScaler()
# )
# imputer.fit(df_full)

In [26]:
# imputer = SimpleImputer(strategy='mean')
# df_full[:] = imputer.fit_transform(df_full)

# train_df = df_full.loc[:train_df.index[-1], :]
# test_df = df_full.loc[test_df.index[0]:, :]

In [27]:
train_macro_df.isna().any().any(), test_macro_df.isna().any().any()

(True, True)

In [28]:
X, y = train_macro_df.values, target
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape

((24376, 535), (24376,))

max_depth < 7

In [63]:
params = {'n_estimators': 500,
          'booster': 'gbtree',
          'max_depth': 5,
          'eval_metric': 'mae',
          'learning_rate': 0.05,
          'min_child_weight': 0,
          'subsample': 0.8,
          'colsample_bytree': 0.79,
          'seed': 42,
          'nthread': -1
          }

model = XGBRegressor(objective='reg:squarederror', **params)

# cv(params, xg_train, num_boost_round=300, metrics=['rmsle'], nfold=5, seed=42, verbose_eval=True)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])


[0]	validation_0-mae:6791570.00000	validation_1-mae:6680488.00000
[1]	validation_0-mae:6454578.00000	validation_1-mae:6347582.50000
[2]	validation_0-mae:6139276.00000	validation_1-mae:6036851.00000
[3]	validation_0-mae:5847299.00000	validation_1-mae:5748914.00000
[4]	validation_0-mae:5578894.00000	validation_1-mae:5484525.00000
[5]	validation_0-mae:5327978.00000	validation_1-mae:5238898.50000
[6]	validation_0-mae:5093818.00000	validation_1-mae:5008089.50000
[7]	validation_0-mae:4874336.00000	validation_1-mae:4791682.50000
[8]	validation_0-mae:4669714.00000	validation_1-mae:4590656.00000
[9]	validation_0-mae:4477863.00000	validation_1-mae:4402622.50000
[10]	validation_0-mae:4297635.00000	validation_1-mae:4225125.00000
[11]	validation_0-mae:4127222.00000	validation_1-mae:4057015.50000
[12]	validation_0-mae:3967090.50000	validation_1-mae:3900303.25000
[13]	validation_0-mae:3817338.25000	validation_1-mae:3753313.25000
[14]	validation_0-mae:3675040.25000	validation_1-mae:3615243.25000
[15]	

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.79, eval_metric='mae',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=5, min_child_weight=0, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=8, nthread=-1,
             num_parallel_tree=1, random_state=45, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, seed=45, subsample=0.8, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [64]:
y_pred_val = np.abs(model.predict(X_val))
y_pred_train = np.abs(model.predict(X_train))
np.sqrt(mean_squared_log_error(y_train, y_pred_train)), np.sqrt(mean_squared_log_error(y_val, y_pred_val))

(0.4119518176014153, 0.4627803840266515)

In [65]:
global X_test
X_test = test_macro_df.values

In [66]:
def create_submission(model):
    submission = pd.read_csv('data/sample_submission.csv')
    pred = model.predict(X_test)
    if len(pred[pred < 0]):
        print('WARNING: NEGATIVE PREDICTIONS')
        pred = np.abs(pred)
    submission['price_doc'] = pred
    submission.to_csv('submission.csv', index=False)

In [67]:
create_submission(model)

In [68]:
!kaggle competitions submit -c sberbank-russian-housing-market -f submission.csv -m "naive XGBRegressor with raw macro.csv"

100%|████████████████████████████████████████| 121k/121k [00:26<00:00, 4.71kB/s]
Successfully submitted to Sberbank Russian Housing Market

In [69]:
!kaggle competitions submissions -c sberbank-russian-housing-market

fileName        date                 description                            status    publicScore  privateScore  
--------------  -------------------  -------------------------------------  --------  -----------  ------------  
submission.csv  2021-11-12 16:14:31  naive XGBRegressor with raw macro.csv  complete  0.32494      0.32895       
submission.csv  2021-11-12 16:00:03  naive XGBRegressor with raw macro.csv  complete  0.35097      0.35469       
submission.csv  2021-11-12 15:48:48  naive XGBRegressor with raw macro.csv  complete  0.34873      0.35337       
submission.csv  2021-11-09 00:47:57  naive XGBRegressor                     complete  0.33041      0.33885       
submission.csv  2021-11-09 00:31:33  naive XGBRegressor                     complete  0.35039      0.34927       
submission.csv  2021-11-09 00:30:50  naive XGBRegressor                     error     None         None          
submission.csv  2021-11-09 00:25:37  naive XGBRegressor                     error     No

In [70]:
def get_place(my_score):
    df = pd.read_csv('publicleaderboard.csv')
    scores = df['Score'].values
    scores = np.append(scores, my_score)
    scores = np.sort(scores)
    print(f'{np.where(scores == my_score)[0][0]} / {len(scores)}')

In [71]:
my_score = 0.32494
get_place(my_score)

1936 / 3266


In [75]:
df = pd.read_csv('publicleaderboard.csv')
list(df['Score'].values)

[0.29755,
 0.30069,
 0.30629,
 0.3064,
 0.30664,
 0.3067,
 0.30714,
 0.30738,
 0.30758,
 0.30758,
 0.30782,
 0.30793,
 0.30842,
 0.30842,
 0.30842,
 0.30843,
 0.30845,
 0.30853,
 0.30859,
 0.30893,
 0.30897,
 0.30897,
 0.30903,
 0.30906,
 0.30906,
 0.3091,
 0.30916,
 0.30916,
 0.30918,
 0.30922,
 0.30922,
 0.30927,
 0.30933,
 0.30935,
 0.30936,
 0.30936,
 0.3094,
 0.30943,
 0.30948,
 0.30948,
 0.30949,
 0.30956,
 0.30958,
 0.30958,
 0.30958,
 0.30965,
 0.30966,
 0.30967,
 0.30968,
 0.30969,
 0.30972,
 0.30972,
 0.30973,
 0.30974,
 0.30974,
 0.30974,
 0.30975,
 0.30975,
 0.30976,
 0.30976,
 0.30978,
 0.30978,
 0.30979,
 0.3098,
 0.30981,
 0.30983,
 0.30983,
 0.30983,
 0.30985,
 0.30985,
 0.30987,
 0.30987,
 0.30988,
 0.30988,
 0.30991,
 0.30991,
 0.30992,
 0.30992,
 0.30992,
 0.30993,
 0.30994,
 0.30995,
 0.30996,
 0.30997,
 0.30998,
 0.30999,
 0.30999,
 0.30999,
 0.31,
 0.31,
 0.31,
 0.31,
 0.31002,
 0.31003,
 0.31003,
 0.31003,
 0.31004,
 0.31004,
 0.31005,
 0.31007,
 0.31007,
 0.3100