In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=[1])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=[1])

In [3]:
cat_columns = train_df.select_dtypes(include='object').columns
train_df[cat_columns].apply(pd.unique)

product_type                                       [Investment, OwnerOccupier]
sub_area                     [Bibirevo, Nagatinskij Zaton, Tekstil'shhiki, ...
culture_objects_top_25                                               [no, yes]
thermal_power_plant_raion                                            [no, yes]
incineration_raion                                                   [no, yes]
oil_chemistry_raion                                                  [no, yes]
radiation_raion                                                      [no, yes]
railroad_terminal_raion                                              [no, yes]
big_market_raion                                                     [no, yes]
nuclear_reactor_raion                                                [no, yes]
detention_facility_raion                                             [no, yes]
water_1line                                                          [no, yes]
big_road1_1line                                     

In [4]:
def preprocess(df):
    # Timestamp encoding
    df['timestamp_year'] = df['timestamp'].dt.year
    df['timestamp_month'] = df['timestamp'].dt.month
    df['timestamp_day'] = df['timestamp'].dt.day
    df.drop(labels='timestamp', axis=1, inplace=True)

    # Categorical columns encoding
    df['product_type'] = df['product_type'].map({'Investment': 0, 'OwnerOccupier':1})

    eco_map = {'no data': 0,
               'poor': 1,
               'satisfactory': 2,
               'good': 3,
               'excellent': 4,}
    df['ecology'] = df['ecology'].map(eco_map)

    one_hot = pd.get_dummies(df['sub_area'])
    df.drop('sub_area', axis=1, inplace=True)
    df = df.join(one_hot)

    cat_columns = df.select_dtypes(include='object').columns
    df[cat_columns] = df[cat_columns].applymap(lambda x: 0 if x=='no' else 1)
    
    # Filling NaNs
    df[:] = SimpleImputer(strategy='mean').fit_transform(df)

    return df

## Preproccesing

In [5]:
train_df = preprocess(train_df)
test_df = preprocess(test_df)

In [6]:
set(train_df.columns.tolist()).difference(set(test_df.columns.tolist()))

{'Poselenie Klenovskoe', 'price_doc'}

In [7]:
test_df['Poselenie Klenovskoe'] = 0

In [8]:
set(train_df.columns.tolist()).difference(set(test_df.columns.tolist()))

{'price_doc'}

In [9]:
train_df.shape, test_df.shape

((30471, 438), (7662, 437))

In [10]:
train_df.isna().any().any(), test_df.isna().any().any()

(False, False)

## Modelling

In [11]:
train_df.head()

Unnamed: 0_level_0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,Vnukovo,Vojkovskoe,Vostochnoe,Vostochnoe Degunino,Vostochnoe Izmajlovo,Vyhino-Zhulebino,Zamoskvorech'e,Zapadnoe Degunino,Zjablikovo,Zjuzino
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,43.0,27.0,4.0,12.558974,1.827121,3068.057097,1.909804,6.399301,2.107025,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34.0,19.0,3.0,12.558974,1.827121,3068.057097,1.909804,6.399301,2.107025,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,43.0,29.0,2.0,12.558974,1.827121,3068.057097,1.909804,6.399301,2.107025,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,89.0,50.0,9.0,12.558974,1.827121,3068.057097,1.909804,6.399301,2.107025,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,77.0,77.0,4.0,12.558974,1.827121,3068.057097,1.909804,6.399301,2.107025,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
target = train_df.pop('price_doc')
train_df['price_doc'] = target

In [13]:
X, y = train_df.values[:, 0:-1], train_df['price_doc'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train.shape, y_train.shape

((24376, 437), (24376,))

In [15]:
model = DecisionTreeRegressor(max_depth=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
np.sqrt(mean_squared_log_error(y_val, y_pred))

0.508522521238224

In [16]:
global X_test
X_test = test_df.values

In [21]:
def create_submission(model):
    submission = pd.read_csv('data/sample_submission.csv')
    submission['price_doc'] = model.predict(X_test)
    submission.to_csv('submission.csv', index=False)

In [22]:
create_submission(model)

Missing data

In [25]:
!kaggle competitions submissions -c sberbank-russian-housing-market

fileName        date                 description  status    publicScore  privateScore  
--------------  -------------------  -----------  --------  -----------  ------------  
submission.csv  2021-11-08 00:12:53  test_submit  complete  0.41135      0.40794       


In [52]:
df = pd.read_csv('publicleaderboard.csv')
scores = df['Score'].values

In [53]:
scores = np.append(scores, 0.41135)
scores = np.sort(scores)
scores

array([ 0.29755,  0.30069,  0.30629, ..., 11.77794, 11.77962, 35.09525])

In [58]:
f'{np.where(scores == 0.41135)[0][0]} / {len(scores)}'

'2921 / 3266'

In [None]:
# idx = train_df.index.tolist()
# set(np.arange(idx[0], idx[-1])).difference(set(idx))

In [None]:
# train_df.loc[1251:1253]

In [None]:
# idx = test_df.index.tolist()
# set(np.arange(idx[0], idx[-1])).difference(set(idx))