In [69]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=[1])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=[1])

In [3]:
cat_columns = train_df.select_dtypes(include='object').columns
train_df[cat_columns].apply(pd.unique)

product_type                                       [Investment, OwnerOccupier]
sub_area                     [Bibirevo, Nagatinskij Zaton, Tekstil'shhiki, ...
culture_objects_top_25                                               [no, yes]
thermal_power_plant_raion                                            [no, yes]
incineration_raion                                                   [no, yes]
oil_chemistry_raion                                                  [no, yes]
radiation_raion                                                      [no, yes]
railroad_terminal_raion                                              [no, yes]
big_market_raion                                                     [no, yes]
nuclear_reactor_raion                                                [no, yes]
detention_facility_raion                                             [no, yes]
water_1line                                                          [no, yes]
big_road1_1line                                     

In [5]:
def encode(df, fillna=True):
    # Timestamp encoding
    df['timestamp_year'] = df['timestamp'].dt.year
    df['timestamp_month'] = df['timestamp'].dt.month
    df['timestamp_day'] = df['timestamp'].dt.day
    df.drop(labels='timestamp', axis=1, inplace=True)

    # Categorical columns encoding
    df['product_type'] = df['product_type'].map({'Investment': 0, 'OwnerOccupier':1})

    eco_map = {'no data': np.NaN, # 0?
               'poor': 1,
               'satisfactory': 2,
               'good': 3,
               'excellent': 4,}
    df['ecology'] = df['ecology'].map(eco_map)

    one_hot = pd.get_dummies(df['sub_area'])
    df.drop('sub_area', axis=1, inplace=True)
    df = df.join(one_hot)

    cat_columns = df.select_dtypes(include='object').columns
    df[cat_columns] = df[cat_columns].applymap(lambda x: 0 if x=='no' else 1)
    
    # Filling NaNs
    # if fillna:
    #     df[:] = SimpleImputer(strategy='mean').fit_transform(df)

    return df

## Encoding

In [6]:
train_df = encode(train_df, fillna=False)
target = train_df.pop('price_doc')
train_df['price_doc'] = target
test_df = encode(test_df, fillna=False)

In [7]:
set(train_df.columns.tolist()).difference(set(test_df.columns.tolist()))

{'Poselenie Klenovskoe', 'price_doc'}

In [8]:
test_df['Poselenie Klenovskoe'] = 0

In [9]:
set(train_df.columns.tolist()).difference(set(test_df.columns.tolist()))

{'price_doc'}

In [10]:
train_df.shape, test_df.shape

((30471, 438), (7662, 437))

## Filling NaNs

In [11]:
train_df.isna().any().any(), test_df.isna().any().any()

(True, True)

In [27]:
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_df.drop('price_doc', axis=1))

SimpleImputer()

In [28]:
# imputer = KNNImputer(n_neighbors=5)
# imputer.fit(train_df) # merge dfs

In [40]:
test_df[:] = imputer.transform(test_df)
train_df.loc[:, train_df.columns != 'price_doc'] = imputer.transform(train_df.values[:, 0:-1])

In [41]:
train_df.isna().any().any(), test_df.isna().any().any()

(False, False)

## Modelling

In [43]:
X, y = train_df.values[:, 0:-1], train_df['price_doc'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
X_train.shape, y_train.shape

((24376, 437), (24376,))

In [56]:
model = DecisionTreeRegressor(max_depth=8)
model.fit(X_train, y_train)
y_pred_val = model.predict(X_val)
y_pred_train = model.predict(X_train)
np.sqrt(mean_squared_log_error(y_train, y_pred_train)), np.sqrt(mean_squared_log_error(y_val, y_pred_val))

(0.4712085241233668, 0.4902869050993713)

In [57]:
global X_test
X_test = test_df.values

In [58]:
def create_submission(model):
    submission = pd.read_csv('data/sample_submission.csv')
    submission['price_doc'] = model.predict(X_test)
    submission.to_csv('submission.csv', index=False)

In [59]:
create_submission(model)

In [61]:
!kaggle competitions submit -c sberbank-russian-housing-market -f submission.csv -m dummy

100%|████████████████████████████████████████| 177k/177k [00:03<00:00, 55.2kB/s]
Successfully submitted to Sberbank Russian Housing Market

Missing data

In [62]:
!kaggle competitions submissions -c sberbank-russian-housing-market

fileName        date                 description  status    publicScore  privateScore  
--------------  -------------------  -----------  --------  -----------  ------------  
submission.csv  2021-11-08 16:09:11  dummy        complete  0.34874      0.34785       
submission.csv  2021-11-08 00:12:53  test_submit  complete  0.41135      0.40794       


In [63]:
df = pd.read_csv('publicleaderboard.csv')
scores = df['Score'].values

In [66]:
my_score = 0.34874

In [67]:
scores = np.append(scores, my_score)
scores = np.sort(scores)
scores

array([ 0.29755,  0.30069,  0.30629, ..., 11.77794, 11.77962, 35.09525])

In [68]:
f'{np.where(scores == my_score)[0][0]} / {len(scores)}'

'2632 / 3267'

In [None]:
# idx = train_df.index.tolist()
# set(np.arange(idx[0], idx[-1])).difference(set(idx))

In [None]:
# train_df.loc[1251:1253]

In [None]:
# idx = test_df.index.tolist()
# set(np.arange(idx[0], idx[-1])).difference(set(idx))