In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=[0])

In [3]:
macro_df['timestamp_year'] = macro_df['timestamp'].dt.year
macro_df['timestamp_month'] = macro_df['timestamp'].dt.month
macro_df['timestamp_day'] = macro_df['timestamp'].dt.day
macro_df.drop(labels='timestamp', axis=1, inplace=True)

In [4]:
temp = macro_df['child_on_acc_pre_school'].map(
    {'#!': np.NaN,
     '45,713': 45713,
     '7,311': 7311,
     '3,013': 3013,
     '16,765': 16765},
    na_action='ignore'
)
macro_df['child_on_acc_pre_school'] = temp

temp = macro_df['modern_education_share'].map(
    {'90,92': 90.92,
     '93,08': 93.08,
     '95,4918': 95.4918},
    na_action='ignore'
)
macro_df['modern_education_share'] = temp

temp = macro_df['old_education_build_share'].map(
    {'23,14': 23.14,
     '25,47': 25.47,
     '8,2517': 8.2517},
    na_action='ignore'
)
macro_df['old_education_build_share'] = temp

n_neighbors

In [5]:
model = make_pipeline(
    StandardScaler(),
    KNNImputer(n_neighbors=10),
    StandardScaler()
)

macro_df[:] = model.fit_transform(macro_df)

In [6]:
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=[1])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=[1])

In [7]:
def encode(df, fillna=True):
    # Timestamp encoding
    df['timestamp_year'] = df['timestamp'].dt.year
    df['timestamp_month'] = df['timestamp'].dt.month
    df['timestamp_day'] = df['timestamp'].dt.day
    df.drop(labels='timestamp', axis=1, inplace=True)

    # Categorical columns encoding
    df['product_type'] = df['product_type'].map({'Investment': 0, 'OwnerOccupier':1})

    eco_map = {'no data': np.NaN, # 0?
               'poor': 1,
               'satisfactory': 2,
               'good': 3,
               'excellent': 4,}
    df['ecology'] = df['ecology'].map(eco_map)

    one_hot = pd.get_dummies(df['sub_area'])
    df.drop('sub_area', axis=1, inplace=True)
    df = df.join(one_hot)

    cat_columns = df.select_dtypes(include='object').columns
    df[cat_columns] = df[cat_columns].applymap(lambda x: 0 if x=='no' else 1)
    
    # Filling NaNs
    # if fillna:
    #     df[:] = SimpleImputer(strategy='mean').fit_transform(df)

    return df

In [8]:
train_df = encode(train_df, fillna=False)
train_df.pop('Poselenie Klenovskoe')
test_df = encode(test_df, fillna=False)

In [9]:
train_df.isna().any().any(), test_df.isna().any().any()

(True, True)

whole dataset without price_doc or train with price_doc

In [11]:
target = train_df.pop('price_doc')

In [12]:
df_full = pd.concat([train_df, test_df])
imputer = SimpleImputer(strategy='mean')
df_full[:] = imputer.fit_transform(df_full)

In [13]:
train_df = df_full.loc[:train_df.index[-1], :]
test_df = df_full.loc[test_df.index[0]:, :]

In [14]:
X, y = train_df.values, target
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train.shape, y_train.shape

((24376, 436), (24376,))

max_depth < 7

In [59]:
model = XGBRegressor(n_estimators=200, max_depth=4, learning_rate=0.2, eval_metric='rmsle', n_jobs=6)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])

[0]	validation_0-rmsle:1.22035	validation_1-rmsle:1.21621
[1]	validation_0-rmsle:0.76207	validation_1-rmsle:0.75716
[2]	validation_0-rmsle:0.58738	validation_1-rmsle:0.58371
[3]	validation_0-rmsle:0.51706	validation_1-rmsle:0.51428
[4]	validation_0-rmsle:0.49029	validation_1-rmsle:0.48909
[5]	validation_0-rmsle:0.48108	validation_1-rmsle:0.48048
[6]	validation_0-rmsle:0.47790	validation_1-rmsle:0.47823
[7]	validation_0-rmsle:0.47766	validation_1-rmsle:0.47850
[8]	validation_0-rmsle:0.47778	validation_1-rmsle:0.47902
[9]	validation_0-rmsle:0.47809	validation_1-rmsle:0.47974
[10]	validation_0-rmsle:0.47719	validation_1-rmsle:0.47972
[11]	validation_0-rmsle:0.47695	validation_1-rmsle:0.47967
[12]	validation_0-rmsle:0.47682	validation_1-rmsle:0.47974
[13]	validation_0-rmsle:0.47617	validation_1-rmsle:0.47956
[14]	validation_0-rmsle:0.47508	validation_1-rmsle:0.47860
[15]	validation_0-rmsle:0.47481	validation_1-rmsle:0.47853
[16]	validation_0-rmsle:0.47393	validation_1-rmsle:0.47787
[17]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmsle',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=6,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [52]:
y_pred_val = np.abs(model.predict(X_val))
y_pred_train = np.abs(model.predict(X_train))
np.sqrt(mean_squared_log_error(y_train, y_pred_train)), np.sqrt(mean_squared_log_error(y_val, y_pred_val))

(0.3935693078710831, 0.46602859736307783)

In [26]:
global X_test
X_test = test_df.values

In [43]:
def create_submission(model):
    submission = pd.read_csv('data/sample_submission.csv')
    pred = model.predict(X_test)
    if len(pred[pred < 0]):
        print('WARNING: NEGATIVE PREDICTIONS')
        pred = np.abs(pred)
    submission['price_doc'] = pred
    submission.to_csv('submission.csv', index=False)

In [44]:
create_submission(model)



In [45]:
!kaggle competitions submit -c sberbank-russian-housing-market -f submission.csv -m "naive XGBRegressor"

100%|████████████████████████████████████████| 121k/121k [00:03<00:00, 36.3kB/s]
Successfully submitted to Sberbank Russian Housing Market

In [46]:
!kaggle competitions submissions -c sberbank-russian-housing-market

fileName        date                 description         status    publicScore  privateScore  
--------------  -------------------  ------------------  --------  -----------  ------------  
submission.csv  2021-11-09 00:31:33  naive XGBRegressor  complete  0.35039      0.34927       
submission.csv  2021-11-09 00:30:50  naive XGBRegressor  error     None         None          
submission.csv  2021-11-09 00:25:37  naive XGBRegressor  error     None         None          
submission.csv  2021-11-08 16:09:11  dummy               complete  0.34874      0.34785       
submission.csv  2021-11-08 00:12:53  test_submit         complete  0.41135      0.40794       


In [None]:
df = pd.read_csv('publicleaderboard.csv')
scores = df['Score'].values

In [None]:
my_score = 0.34874

In [None]:
scores = np.append(scores, my_score)
scores = np.sort(scores)
scores

array([ 0.29755,  0.30069,  0.30629, ..., 11.77794, 11.77962, 35.09525])

In [None]:
f'{np.where(scores == my_score)[0][0]} / {len(scores)}'

'2632 / 3267'