In [1]:
from utils import *

import folium
import geopandas as gpd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

## Data description

In [2]:
# Macro table: only the timestamp column is parsed; no index column is set.
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])

# Train and test are read with identical options: indexed by `id`,
# with `timestamp` parsed as datetimes.
read_opts = dict(index_col='id', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', **read_opts)
test_df = pd.read_csv('data/test.csv', **read_opts)

# `tverskoe_issue_fix` comes from the `utils` star import and is called for its
# side effects only (the return value is unused), train first, then test.
# NOTE(review): presumably it patches rows in place and prints the `Fix:` counts
# shown in the cell output — confirm in utils.
for split_df in (train_df, test_df):
    tverskoe_issue_fix(split_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## Part I (encoding and correcting mistakes)

### Macro dataset

In [3]:
# Replace the '#!' marker with the text 'nan' so that the float cast below
# turns those cells into NaN.
macro_df['child_on_acc_pre_school'] = (
    macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
)

# Every remaining object-dtype column: swap ',' for '.' and cast to float.
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.').astype(float)

# Sanity check: prints 'OK' only when no object-dtype columns are left.
if len(macro_df.select_dtypes('object').columns) == 0:
    print('OK')

OK


### Train dataset

In [4]:
# `encode` is provided by the `utils` star import; its body is not visible here.
# NOTE(review): presumably encodes categorical columns and corrects data errors
# (per the section heading) — confirm in utils.
train_df = encode(train_df)

### Test dataset

In [5]:
# Run the test set through the same `encode` helper (from the `utils` star
# import) that is applied to the train set.
test_df = encode(test_df)

## Part II (filling missing values)

The `XGBRegressor` model handles missing values (`np.nan`) natively, so no explicit imputation is applied at this stage.

## 2. Encoding `sub_area` feature

In [6]:
# Mark each split so the rows can be told apart after concatenation.
train_df['is_train'] = 1
test_df['is_train'] = 0

# Coordinates table, indexed by `id`.
coords_df = pd.read_csv('data/coords.csv', index_col='id')

# Stack train on top of test into a single frame.
all_df = pd.concat([train_df, test_df])

# Series assignment aligns on the index, so each row of `all_df` receives the
# coordinates stored under the same `id` label in `coords_df`.
for coord in ('latitude', 'longitude'):
    all_df[coord] = coords_df[coord]

## 3. Removing outliers

In [7]:
# `remove_outliers` comes from the `utils` star import and is applied to the
# combined train+test frame; `all_df` is rebound to whatever it returns.
# NOTE(review): confirm in utils whether it drops rows or only blanks values —
# dropping rows here would also remove test rows.
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [None]:
def create_new_features(all_df):
    """Return a copy of ``all_df`` extended with engineered features.

    The input frame is not modified: all work happens on a copy, and the
    augmented copy is returned so callers can write
    ``all_df = create_new_features(all_df)``.

    Required input columns: ``floor``, ``max_floor``, ``full_sq``, ``life_sq``,
    ``kitch_sq``, ``num_room``, ``year``, ``build_year``, ``raion_popul``,
    ``area_m``, ``male_f``, ``female_f``, ``work_all``, ``full_all``,
    ``children_preschool``, ``preschool_quota``, ``children_school``,
    ``school_quota``.

    Added columns: ``floor_by_max_floor``, ``extra_sq``, ``avg_room_size``,
    ``life_sq_prop``, ``kitch_sq_prop``, ``build_age``, ``population_den``,
    ``gender_rate``, ``working_rate``, ``preschool_ratio``, ``school_ratio``,
    ``nan_count``. The ``build_year`` column is removed once ``build_age`` has
    been derived from it.

    Ratios are plain pandas divisions: a zero denominator produces ``inf``
    (or ``NaN`` for 0/0) and missing operands propagate as ``NaN``.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Frame containing the required columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame with the added columns and without ``build_year``.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    # Work on a copy so the caller's frame is never left half-updated.
    all_df = all_df.copy()

    # Flat position inside the building and non-living area
    all_df['floor_by_max_floor'] = all_df['floor'] / all_df['max_floor']
    all_df["extra_sq"] = all_df["full_sq"] - all_df["life_sq"]

    # Room
    all_df['avg_room_size'] = (all_df['life_sq'] - all_df['kitch_sq']) / all_df['num_room']
    all_df['life_sq_prop'] = all_df['life_sq'] / all_df['full_sq']
    all_df['kitch_sq_prop'] = all_df['kitch_sq'] / all_df['full_sq']

    # Calculate age of building; the raw year is dropped once it is encoded as an age
    all_df['build_age'] = all_df['year'] - all_df['build_year']
    all_df = all_df.drop('build_year', axis=1)

    # Population
    all_df['population_den'] = all_df['raion_popul'] / all_df['area_m']
    all_df['gender_rate'] = all_df['male_f'] / all_df['female_f']
    all_df['working_rate'] = all_df['work_all'] / all_df['full_all']

    # Education
    all_df['preschool_ratio'] = all_df['children_preschool'] / all_df['preschool_quota']
    all_df['school_ratio'] = all_df['children_school'] / all_df['school_quota']

    # NaNs count: number of missing values per row among the listed columns
    nan_source_cols = ['full_sq', 'build_age', 'life_sq', 'floor', 'max_floor', 'num_room']
    all_df['nan_count'] = all_df[nan_source_cols].isnull().sum(axis=1)

    # Without this return the caller's `all_df = create_new_features(all_df)`
    # would rebind `all_df` to None.
    return all_df


In [None]:
# Rebind `all_df` to the function's return value. The cells below index into
# `all_df`, so `create_new_features` must return the augmented frame (not None).
all_df = create_new_features(all_df)

## 5. Removing fake prices

In [8]:
# Split the combined frame back apart using the `is_train` marker.
train_rows = all_df['is_train'] == 1
test_rows = all_df['is_train'] == 0

# The marker column is discarded from both splits.
train_df = all_df.loc[train_rows].drop(columns=['is_train'])
# `price_doc` is additionally removed from the test split.
test_df = all_df.loc[test_rows].drop(columns=['is_train', 'price_doc'])

In [9]:
# train_df = remove_fake_prices(train_df)
# The helper call above is disabled; instead, the row labels to discard are
# read from a text file, cast to int, and dropped from the train index.
idx_outliers = np.loadtxt('data/idx_outliers.txt').astype(int)
train_df = train_df.drop(index=idx_outliers)

In [None]:
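# Disabled experiment (kept for reference, not executed): assign sample weights.
# NOTE(review): this uses `allDf`, while the cells above name the frame `all_df` — rename before re-enabling.
# allDf['w'] = 1
# allDf.loc[allDf.price_doc==1000000,'w'] *= 0.5
# allDf.loc[allDf.year==2015,'w'] *= 1.5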