In [64]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from xgboost import XGBRegressor

In [4]:
from scipy import stats
import matplotlib.pyplot as plt

In [5]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])
fix_df = pd.read_excel('data/BAD_ADDRESS_FIX.xlsx').drop_duplicates('id').set_index('id')
name2sub_area = pd.read_csv('data/name2sub_area.csv', index_col='sub_area')

In [6]:
# train_df.update(fix_df, overwrite=True)
# test_df.update(fix_df, overwrite=True)
del fix_df

In [7]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

In [8]:
train_macro_df = train_df.merge(macro_df, on='timestamp', how='left')
test_macro_df = test_df.merge(macro_df, on='timestamp', how='left' )

In [9]:
train_macro_df['sub_area'] = train_macro_df['sub_area'].map(name2sub_area['OKRUG'].to_dict())
test_macro_df['sub_area'] = test_macro_df['sub_area'].map(name2sub_area['OKRUG'].to_dict())
del name2sub_area

In [10]:
def encode(df):
    # Timestamp encoding
    df['timestamp_year'] = df['timestamp'].dt.year
    df['timestamp_month'] = df['timestamp'].dt.month
    df['timestamp_day'] = df['timestamp'].dt.day
    df.drop(labels='timestamp', axis=1, inplace=True)

    # Categorical columns encoding
    df['product_type'] = df['product_type'].map({'Investment': 0, 'OwnerOccupier':1})

    # Ecology
    eco_map = {'no data': np.NaN, # 0?
               'poor': 1,
               'satisfactory': 2,
               'good': 3,
               'excellent': 4,}
    df['ecology'] = df['ecology'].map(eco_map)

    # Sub_area 
    one_hot = pd.get_dummies(df['sub_area'])
    df.drop('sub_area', axis=1, inplace=True)
    df = df.join(one_hot)

    # yes/no
    cat_columns = df.select_dtypes(include='object').columns
    df[cat_columns] = df[cat_columns].applymap(lambda x: 0 if x=='no' else 1)
    
    return df

In [11]:
train_macro_df = encode(train_macro_df)
test_macro_df = encode(test_macro_df)
target = train_macro_df.pop('price_doc')

In [12]:
X, y = train_macro_df.values, target
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape

((24376, 402), (24376,))

In [61]:
params = {'n_estimators': 10,
          'booster': 'gbtree',
          'max_depth': 5,
          'eval_metric': 'mae',
          'learning_rate': 0.45,
          'reg_alpha': 100,
          'min_child_weight': 0,
          # 'subsample': 0.8,
          # 'colsample_bytree': 0.79,
          'seed': 42,
          'nthread': -1
          }

model = XGBRegressor(objective='reg:squarederror', **params)

model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])


[0]	validation_0-mae:4189275.75000	validation_1-mae:4121061.75000
[1]	validation_0-mae:2792969.00000	validation_1-mae:2758492.75000
[2]	validation_0-mae:2102783.00000	validation_1-mae:2093478.87500
[3]	validation_0-mae:1771786.87500	validation_1-mae:1772549.37500
[4]	validation_0-mae:1611383.75000	validation_1-mae:1635524.87500
[5]	validation_0-mae:1519072.12500	validation_1-mae:1555397.00000
[6]	validation_0-mae:1482268.00000	validation_1-mae:1525650.62500
[7]	validation_0-mae:1441603.00000	validation_1-mae:1497423.37500
[8]	validation_0-mae:1409881.12500	validation_1-mae:1477316.00000
[9]	validation_0-mae:1396910.62500	validation_1-mae:1472758.50000


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='mae', gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.45, max_delta_step=0, max_depth=5,
             min_child_weight=0, missing=nan, monotone_constraints='()',
             n_estimators=10, n_jobs=8, nthread=-1, num_parallel_tree=1,
             random_state=42, reg_alpha=100, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [62]:
y_pred_val = np.abs(model.predict(X_val))
y_pred_train = np.abs(model.predict(X_train))
np.sqrt(mean_squared_log_error(y_train, y_pred_train)), np.sqrt(mean_squared_log_error(y_val, y_pred_val))

(0.4644388724215377, 0.4771816998451314)

In [65]:
model = XGBRegressor(objective='reg:squarederror', **params)
cross_val_score(model, X, y, cv=KFold(shuffle=True, random_state=42), scoring='neg_mean_squared_log_error', n_jobs=6)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:   36.3s finished


array([-0.22770237, -0.22812722, -0.22101153, -0.23741211, -0.2331106 ])

In [66]:
cv_results = [-0.22770237, -0.22812722, -0.22101153, -0.23741211, -0.2331106]
np.mean(np.abs(cv_results))

0.229472766

### train_test_split 
```
local = {'train':0.46444, 
         'test': 0.47718}
```
real: (0.33844, 0.33819)
### naive cv
local: 0.23
real: (0.32886, 0.32857)
### smart cv
local: (0.44323, 0.46569)
real: (0.32886, 0.32857)

In [56]:
global X_test
X_test = test_macro_df.values

In [57]:
def create_submission(model):
    submission = pd.read_csv('data/sample_submission.csv')
    pred = model.predict(X_test)
    if len(pred[pred < 0]):
        print('WARNING: NEGATIVE PREDICTIONS')
        pred = np.abs(pred)
    submission['price_doc'] = pred
    submission.to_csv('submission.csv', index=False)

In [58]:
create_submission(model)

In [59]:
!kaggle competitions submit -c sberbank-russian-housing-market -f submission.csv -m "test train_test_split"

100%|████████████████████████████████████████| 121k/121k [00:02<00:00, 60.6kB/s]
Successfully submitted to Sberbank Russian Housing Market

In [60]:
!kaggle competitions submissions -c sberbank-russian-housing-market

fileName        date                 description                                   status    publicScore  privateScore  
--------------  -------------------  --------------------------------------------  --------  -----------  ------------  
submission.csv  2021-11-19 19:43:05  test train_test_split                         complete  0.33844      0.33819       
submission.csv  2021-11-19 19:11:49  test train_test_split                         complete  0.32886      0.32857       
submission.csv  2021-11-19 19:10:42  test train_test_split                         complete  0.32886      0.32857       
submission.csv  2021-11-12 18:19:49  naive XGBRegressor with OKRUG                 complete  0.33627      0.34593       
submission.csv  2021-11-12 18:07:03  naive XGBRegressor with OKRUG                 complete  0.33615      0.35028       
submission.csv  2021-11-12 18:02:19  naive XGBRegressor with OKRUG                 complete  0.32857      0.33310       
submission.csv  2021-11-12 17:31

In [None]:
def get_place(my_score):
    df = pd.read_csv('publicleaderboard.csv')
    scores = df['Score'].values
    scores = np.append(scores, my_score)
    scores = np.sort(scores)
    print(f'{np.where(scores == my_score)[0][0]} / {len(scores)}')

In [None]:
my_score = 0.32494
get_place(my_score)

1936 / 3266
