In [1]:
from utils import *

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from xgboost import XGBRegressor, DMatrix, cv, XGBClassifier
from xgboost import train as train_xgb

## Data description

In [2]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])

tverskoe_issue_fix(train_df)
tverskoe_issue_fix(test_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [3]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [4]:
train_df = encode(train_df)

### Test dataset

In [5]:
test_df = encode(test_df)

## II part (Filling missing values)

XGBRegressor model handles `np.NaN` values itself

## 2. Encoding `sub_area` feature

In [6]:
coords_train_df = pd.read_csv('data/geo/train_lat_lon.csv')
coords_train_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_train_df.index = coords_train_df.id
coords_train_df.drop(['id'], axis=1, inplace=True)
coords_train_df = coords_train_df.sort_index()

coords_test_df = pd.read_csv('data/geo/test_lat_lon.csv')
coords_test_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_test_df.index = coords_test_df.id
coords_test_df.drop(['id'], axis=1, inplace=True)
coords_test_df = coords_test_df.sort_index()

coords_all_df = pd.concat([coords_train_df, coords_test_df])

In [7]:
train_df['is_train'] = 1
test_df['is_train'] = 0

# coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

all_df['latitude'] = coords_all_df['lat']
all_df['longitude'] = coords_all_df['lon']

## 3. Removing outliers

In [8]:
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [9]:
all_df = create_new_features(all_df)

## 5. Analyzing fake prices

In [10]:
train_df = all_df[all_df['is_train'] == 1].drop(['is_train'], axis=1)
test_df = all_df[all_df['is_train'] == 0].drop(['is_train', 'price_doc'], axis=1)

In [11]:
idx_outliers = np.loadtxt('outliers/idx_outliers_full.txt').astype(int)
idx_df = train_df.loc[idx_outliers].copy()
class_df = train_df.copy()

In [12]:
class_df['label'] = 0
class_df.loc[idx_df[idx_df['price_doc'] == 1_000_000].index, 'label'] = 1
class_df.loc[idx_df[idx_df['price_doc'] == 2_000_000].index, 'label'] = 1
class_df.loc[idx_df[idx_df['price_doc'] == 3_000_000].index, 'label'] = 1

In [13]:
X = class_df.drop(['sub_area', 'price_doc', 'label'], axis=1).copy()
y = class_df['label']

In [14]:
X_test = test_df.drop(['sub_area'], axis=1).copy()

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_val.shape

((24376, 306), (6095, 306))

In [23]:
28765/1706

16.86107854630715

In [16]:
1706 / 28765

0.05930818703285243

In [17]:
np.unique(y_train)

array([0, 1])

In [36]:
params = {'max_depth': 5,
          'objective': 'binary:logistic',
          'n_estimators': 1000,
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'learning_rate': 0.3,
          'eval_metric': 'error',
          'seed': 42,
          'nthread': -1,
          'scale_pos_weight': 20
          }

model = XGBRegressor(**params)

model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], early_stopping_rounds=50)

[0]	validation_0-error:0.55046	validation_1-error:0.55045
[1]	validation_0-error:0.54033	validation_1-error:0.54126
[2]	validation_0-error:0.52310	validation_1-error:0.52453
[3]	validation_0-error:0.51694	validation_1-error:0.51961
[4]	validation_0-error:0.50394	validation_1-error:0.50960
[5]	validation_0-error:0.50127	validation_1-error:0.50714
[6]	validation_0-error:0.48732	validation_1-error:0.49352
[7]	validation_0-error:0.48232	validation_1-error:0.48892
[8]	validation_0-error:0.47387	validation_1-error:0.48564
[9]	validation_0-error:0.47268	validation_1-error:0.48335
[10]	validation_0-error:0.46829	validation_1-error:0.47728
[11]	validation_0-error:0.46250	validation_1-error:0.47235
[12]	validation_0-error:0.45615	validation_1-error:0.46809
[13]	validation_0-error:0.45176	validation_1-error:0.46399
[14]	validation_0-error:0.44806	validation_1-error:0.46366
[15]	validation_0-error:0.43826	validation_1-error:0.45480
[16]	validation_0-error:0.43350	validation_1-error:0.45185
[17]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='error',
             gamma=0, gpu_id=0, importance_type='gain',
             interaction_constraints='', learning_rate=0.3, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=8, nthread=-1,
             num_parallel_tree=1, objective='binary:logistic', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=20, seed=42,
             subsample=1, tree_method='gpu_hist', validate_parameters=1,
             verbosity=None)

In [41]:
y_pred_train = model.predict(X_train)

In [42]:
y_pred_train[y_pred_train > 0.5] = 1

In [47]:
y_pred_train[y_pred_train == 1].shape

(1366,)

In [50]:
y_train[y_train == 1].shape

(1365,)

In [67]:
y_pred = model.predict(X_test)

In [68]:
y_pred[y_pred > 0.8] = 1

In [69]:
y_pred[y_pred < 0.5] = 0

In [70]:
y_pred[y_pred == 1].shape

(79,)

In [71]:
np.argwhere(y_pred == 1)

array([[  70],
       [ 256],
       [ 271],
       [ 301],
       [ 396],
       [ 400],
       [ 448],
       [ 490],
       [ 574],
       [ 575],
       [ 661],
       [ 850],
       [ 868],
       [ 918],
       [ 930],
       [1009],
       [1197],
       [1280],
       [1311],
       [1412],
       [1675],
       [1776],
       [1818],
       [1826],
       [1877],
       [1980],
       [2126],
       [2142],
       [2514],
       [2561],
       [2642],
       [3029],
       [3102],
       [3157],
       [3587],
       [3591],
       [3741],
       [3838],
       [3868],
       [4022],
       [4186],
       [4345],
       [4356],
       [4477],
       [4487],
       [4749],
       [5190],
       [5345],
       [5471],
       [5487],
       [5507],
       [5571],
       [5776],
       [5793],
       [5850],
       [5907],
       [5927],
       [5929],
       [5960],
       [6010],
       [6121],
       [6262],
       [6329],
       [6355],
       [6444],
       [6489],
       [65